/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2024 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#define INCLUDE_MEMORY
#define INCLUDE_VECTOR
#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "langhooks.h"
#include "dwarf2out.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "aarch64-feature-deps.h"
#include "config/arm/aarch-common.h"
#include "config/arm/aarch-common-protos.h"
#include "common/config/aarch64/cpuinfo.h"
#include "tree-pass.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "hash-map.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
   and 1 MOVI/DUP (same size as a call).  */
#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
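
/* Illustrative note: with this limit, a speed-optimized memset expansion may
   cover up to 256 bytes inline, while -Os caps it at 96 bytes, which matches
   the 3 STP Q-register stores plus one MOVI/DUP mentioned above.  */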
/* Flags that describe how a function shares certain architectural state
   with its callers.

   - AARCH64_STATE_SHARED indicates that the function does share the state
     with callers.

   - AARCH64_STATE_IN indicates that the function reads (or might read) the
     incoming state.  The converse is that the function ignores the incoming
     state.

   - AARCH64_STATE_OUT indicates that the function returns new state.
     The converse is that the state on return is the same as it was on entry.

   A function that partially modifies the state treats it as both IN
   and OUT (because the value on return depends to some extent on the
   value on entry).  */
constexpr auto AARCH64_STATE_SHARED = 1U << 0;
constexpr auto AARCH64_STATE_IN = 1U << 1;
constexpr auto AARCH64_STATE_OUT = 1U << 2;
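
/* For example (following aarch64_attribute_shared_state_flags below), an
   arm::in attribute maps to SHARED | IN, arm::out to SHARED | OUT,
   arm::inout to SHARED | IN | OUT and arm::preserves to SHARED alone.  */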
/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};
/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}
/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}
/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}
/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64 offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
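
/* Illustrative example (per the AAPCS64 rules referred to above): a C
   structure containing one svfloat32_t member and one svbool_t member is a
   Pure Scalable Type; its analysis yields two pieces, one occupying a single
   vector register and one occupying a single predicate register.  */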
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

enum aarch64_tp_reg aarch64_tpidr_register;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char *name;
  unsigned int flag;
};
#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};
/* Tuning parameters.  */
#include "tuning_models/generic.h"
#include "tuning_models/generic_armv8_a.h"
#include "tuning_models/generic_armv9_a.h"
#include "tuning_models/cortexa35.h"
#include "tuning_models/cortexa53.h"
#include "tuning_models/cortexa57.h"
#include "tuning_models/cortexa72.h"
#include "tuning_models/cortexa73.h"
#include "tuning_models/cortexx925.h"
#include "tuning_models/exynosm1.h"
#include "tuning_models/thunderxt88.h"
#include "tuning_models/thunderx.h"
#include "tuning_models/tsv110.h"
#include "tuning_models/xgene1.h"
#include "tuning_models/emag.h"
#include "tuning_models/qdf24xx.h"
#include "tuning_models/saphira.h"
#include "tuning_models/thunderx2t99.h"
#include "tuning_models/thunderx3t110.h"
#include "tuning_models/neoversen1.h"
#include "tuning_models/ampere1.h"
#include "tuning_models/ampere1a.h"
#include "tuning_models/ampere1b.h"
#include "tuning_models/neoversev1.h"
#include "tuning_models/neoverse512tvb.h"
#include "tuning_models/neoversen2.h"
#include "tuning_models/neoversen3.h"
#include "tuning_models/neoversev2.h"
#include "tuning_models/neoversev3.h"
#include "tuning_models/neoversev3ae.h"
#include "tuning_models/a64fx.h"
/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char *name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};
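
/* These parsers implement the fine-grained -moverride option; for example,
   -moverride=sve_width=256 hands "256" to aarch64_parse_sve_width_string and
   -moverride=fuse=... hands its value to aarch64_parse_fuse_string.  (This is
   an illustrative note; see the parsers for the exact accepted syntax.)  */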
/* A processor implementing AArch64.  */
struct processor
{
  const char *name;
  aarch64_processor ident;
  aarch64_processor sched_core;
  aarch64_arch arch;
  aarch64_feature_flags flags;
  const tune_params *tune;
};

/* Architectures implementing AArch64.  */
static CONSTEXPR const processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
   feature_deps::ARCH_IDENT ().enable, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
   feature_deps::cpu_##IDENT, &COSTS##_tunings},
#include "aarch64-cores.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
};
/* Internal representation of system registers.  */
typedef struct
{
  const char *name;
  /* Stringified sysreg encoding values, represented as
     s<sn>_<op1>_c<cn>_c<cm>_<op2>.  */
  const char *encoding;
  /* Flags affecting sysreg usage, such as read/write-only.  */
  unsigned properties;
  /* Architectural features implied by sysreg.  */
  aarch64_feature_flags arch_reqs;
} sysreg_t;
/* An aarch64_feature_set initializer for a single feature,
   AARCH64_FEATURE_<FEAT>.  */
#define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT

/* Used by AARCH64_FEATURES.  */
#define AARCH64_OR_FEATURES_1(X, F1) \
  AARCH64_FEATURE (F1)
#define AARCH64_OR_FEATURES_2(X, F1, F2) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
#define AARCH64_OR_FEATURES_3(X, F1, ...) \
  (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))

/* An aarch64_feature_set initializer for the N features listed in "...".  */
#define AARCH64_FEATURES(N, ...) \
  AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)

#define AARCH64_NO_FEATURES 0
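
/* As an illustration of the macros above, AARCH64_FEATURES (2, SVE, SME)
   expands via AARCH64_OR_FEATURES_2 to the bitwise OR of AARCH64_FL_SVE and
   AARCH64_FL_SME (assuming both feature flags are defined).  */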
/* Flags associated with the properties of system registers.  It mainly serves
   to mark particular registers as read or write only.  */
#define F_DEPRECATED (1 << 1)
#define F_REG_READ (1 << 2)
#define F_REG_WRITE (1 << 3)
#define F_ARCHEXT (1 << 4)
/* Flag indicating register name is alias for another system register.  */
#define F_REG_ALIAS (1 << 5)
/* Flag indicating registers which may be implemented with 128 bits.  */
#define F_REG_128 (1 << 6)
/* Database of system registers, their encodings and architectural
   feature requirements.  */
const sysreg_t aarch64_sysregs[] =
{
#define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
#define SYSREG(NAME, ENC, FLAGS, ARCH) \
  { NAME, ENC, FLAGS, ARCH },
#include "aarch64-sys-regs.def"
};

#undef AARCH64_NO_FEATURES

using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;

static sysreg_map_t *sysreg_map = nullptr;
/* Map system register names to their hardware metadata: encoding,
   feature flags and architectural feature requirements, all of which
   are encoded in a sysreg_t struct.  */
static void
aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
{
  bool dup = sysreg_map->put (name, metadata);
  gcc_checking_assert (!dup);
}
/* Lazily initialize hash table for system register validation,
   checking the validity of supplied register name and returning
   register's associated metadata.  */
static void
aarch64_init_sysregs (void)
{
  gcc_assert (!sysreg_map);
  sysreg_map = new sysreg_map_t;

  for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
    {
      const sysreg_t *reg = aarch64_sysregs + i;
      aarch64_register_sysreg (reg->name, reg);
    }
}
/* No direct access to the sysreg hash-map should be made.  Doing so
   risks trying to access an uninitialized hash-map and dereferencing the
   returned double pointer without due care risks dereferencing a
   null pointer.  */
const sysreg_t *
aarch64_lookup_sysreg_map (const char *regname)
{
  if (!sysreg_map)
    aarch64_init_sysregs ();

  const sysreg_t **sysreg_entry = sysreg_map->get (regname);
  if (sysreg_entry != NULL)
    return *sysreg_entry;
  return NULL;
}
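
/* For example, aarch64_lookup_sysreg_map ("tpidr_el0") returns the sysreg_t
   entry for TPIDR_EL0 provided that register is listed in
   aarch64-sys-regs.def, and NULL for an unrecognized name.  */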
/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;
/* If NAME is the name of an arm:: attribute that describes shared state,
   return its associated AARCH64_STATE_* flags, otherwise return 0.  */
static unsigned int
aarch64_attribute_shared_state_flags (const char *name)
{
  if (strcmp (name, "in") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
  if (strcmp (name, "inout") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
  if (strcmp (name, "out") == 0)
    return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
  if (strcmp (name, "preserves") == 0)
    return AARCH64_STATE_SHARED;
  return 0;
}
/* See whether attribute list ATTRS has any sharing information
   for state STATE_NAME.  Return the associated state flags if so,
   otherwise return 0.  */
static unsigned int
aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
{
  for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
    {
      if (!is_attribute_namespace_p ("arm", attr))
	continue;

      auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr));
      auto flags = aarch64_attribute_shared_state_flags (attr_name);
      if (!flags)
	continue;

      for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
	{
	  tree value = TREE_VALUE (arg);
	  if (TREE_CODE (value) == STRING_CST
	      && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	    return flags;
	}
    }
  return 0;
}
/* Return true if DECL creates a new scope for state STATE_STRING.  */
static bool
aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
{
  if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
    for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
      {
	tree value = TREE_VALUE (arg);
	if (TREE_CODE (value) == STRING_CST
	    && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
	  return true;
      }
  return false;
}
/* Return true if attribute argument VALUE is a recognized state string,
   otherwise report an error.  NAME is the name of the attribute to which
   VALUE is being passed.  */
static bool
aarch64_check_state_string (tree name, tree value)
{
  if (TREE_CODE (value) != STRING_CST)
    {
      error ("the arguments to %qE must be constant strings", name);
      return false;
    }

  const char *state_name = TREE_STRING_POINTER (value);
  if (strcmp (state_name, "za") != 0
      && strcmp (state_name, "zt0") != 0)
    {
      error ("unrecognized state string %qs", state_name);
      return false;
    }

  return true;
}
/* qsort callback to compare two STRING_CSTs.  */
static int
cmp_string_csts (const void *a, const void *b)
{
  return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
		 TREE_STRING_POINTER (*(const_tree const *) b));
}
/* Canonicalize a list of state strings.  ARGS contains the arguments to
   a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
   of the same type.  If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
   arguments and drop the new attribute.  Otherwise, the new attribute must
   be kept and ARGS must include the information in OLD_ATTR.

   In both cases, the new arguments must be a sorted list of state strings
   with duplicates removed.

   Return true if the new attribute should be kept, false if it should be
   dropped.  */
static bool
aarch64_merge_string_arguments (tree args, tree old_attr,
				bool can_merge_in_place)
{
  /* Get a sorted list of all state strings (including duplicates).  */
  auto add_args = [](vec<tree> &strings, const_tree args)
    {
      for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
	if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
	  strings.safe_push (TREE_VALUE (arg));
    };
  auto_vec<tree, 16> strings;
  add_args (strings, args);
  if (old_attr)
    add_args (strings, TREE_VALUE (old_attr));
  strings.qsort (cmp_string_csts);

  /* The list can be empty if there was no previous attribute and if all
     the new arguments are erroneous.  Drop the attribute in that case.  */
  if (strings.is_empty ())
    return false;

  /* Destructively modify one of the argument lists, removing duplicates
     on the fly.  */
  bool use_old_attr = old_attr && can_merge_in_place;
  tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
  tree prev = NULL_TREE;
  for (tree arg : strings)
    {
      if (prev && simple_cst_equal (arg, prev))
	continue;
      prev = arg;
      if (!*end)
	*end = tree_cons (NULL_TREE, arg, NULL_TREE);
      else
	TREE_VALUE (*end) = arg;
      end = &TREE_CHAIN (*end);
    }

  return !use_old_attr;
}
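
/* Illustrative example: merging a new ("za") argument list against an
   existing ("zt0", "za") attribute produces the sorted, duplicate-free
   list ("za", "zt0").  */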
/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  return NULL_TREE;
}
/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
   otherwise report an error.  */
static bool
aarch64_check_arm_new_against_type (tree args, tree decl)
{
  tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (TREE_CODE (value) == STRING_CST)
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
	    {
	      error_at (DECL_SOURCE_LOCATION (decl),
			"cannot create a new %qs scope since %qs is shared"
			" with callers", state_name, state_name);
	      return false;
	    }
	}
    }
  return true;
}
/* Callback for arm::new attributes.  */
static tree
handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
{
  tree decl = *node;
  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute applies only to function definitions", name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TREE_TYPE (decl) == error_mark_node)
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    aarch64_check_state_string (name, TREE_VALUE (arg));

  if (!aarch64_check_arm_new_against_type (args, decl))
    {
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* If there is an old attribute, we should try to update it in-place,
     so that there is only one (definitive) arm::new attribute on the decl.  */
  tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
  if (!aarch64_merge_string_arguments (args, old_attr, true))
    *no_add_attrs = true;

  return NULL_TREE;
}
/* Callback for arm::{in,out,inout,preserves} attributes.  */
static tree
handle_arm_shared (tree *node, tree name, tree args,
		   int, bool *no_add_attrs)
{
  tree type = *node;
  tree old_attrs = TYPE_ATTRIBUTES (type);
  auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
  for (tree arg = args; arg; arg = TREE_CHAIN (arg))
    {
      tree value = TREE_VALUE (arg);
      if (aarch64_check_state_string (name, value))
	{
	  const char *state_name = TREE_STRING_POINTER (value);
	  auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
							      state_name);
	  if (old_flags && old_flags != flags)
	    {
	      error ("inconsistent attributes for state %qs", state_name);
	      *no_add_attrs = true;
	      return NULL_TREE;
	    }
	}
    }

  /* We can't update an old attribute in-place, since types are shared.
     Instead make sure that this new attribute contains all the
     information, so that the old attribute becomes redundant.  */
  tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
				    old_attrs);
  if (!aarch64_merge_string_arguments (args, old_attr, false))
    *no_add_attrs = true;

  return NULL_TREE;
}
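
/* For reference, these handlers back ACLE spellings such as
   __arm_inout("za") on a function type and __arm_new("za") on a function
   definition, which the front ends lower to the arm::inout and arm::new
   attributes handled above.  (Illustrative note only.)  */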
/* Mutually-exclusive function type attributes for controlling PSTATE.SM.  */
static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
{
  /* Attribute name	exclusion applies to:
			function, type, variable */
  { "streaming", false, true, false },
  { "streaming_compatible", false, true, false },
  { NULL, false, false, false }
};
/* Table of machine attributes.  */
static const attribute_spec aarch64_gnu_attributes[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true, true, true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true, false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
  { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
  { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
  { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, NULL },
#endif
#ifdef SUBTARGET_ATTRIBUTE_TABLE
  SUBTARGET_ATTRIBUTE_TABLE
#endif
};

static const scoped_attribute_specs aarch64_gnu_attribute_table =
{
  "gnu", { aarch64_gnu_attributes }
};
static const attribute_spec aarch64_arm_attributes[] =
{
  { "streaming", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "streaming_compatible", 0, 0, false, true, true, true,
    NULL, attr_streaming_exclusions },
  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
  { "new", 1, -1, true, false, false, false,
    handle_arm_new, NULL },
  { "preserves", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "in", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "out", 1, -1, false, true, true, true,
    handle_arm_shared, NULL },
  { "inout", 1, -1, false, true, true, true,
    handle_arm_shared, NULL }
};

static const scoped_attribute_specs aarch64_arm_attribute_table =
{
  "arm", { aarch64_arm_attributes }
};

static const scoped_attribute_specs *const aarch64_attribute_table[] =
{
  &aarch64_gnu_attribute_table,
  &aarch64_arm_attribute_table
};
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
} aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
918 static const char * const aarch64_condition_codes
[] =
920 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
921 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
924 /* The preferred condition codes for SVE conditions. */
925 static const char *const aarch64_sve_condition_codes
[] =
927 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
928 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
/* Return the assembly token for svpattern value VALUE.  */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}
/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr <= 2)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}
/* Return the total number of vector registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */
unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}
/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */
rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}
/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   of the AAPCS64.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}
/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */
bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}
/* Subroutine of analyze for handling ARRAY_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}
/* Subroutine of analyze for handling RECORD_TYPEs.  */
pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
	continue;

      /* All fields with an ABI identity must be PSTs for the record as
	 a whole to be a PST.  If any individual field is too big to be
	 interesting then the record is too.  */
      pure_scalable_type_info field_info;
      analysis_result subresult = field_info.analyze (TREE_TYPE (field));
      if (subresult == NO_ABI_IDENTITY)
	continue;
      if (subresult != IS_PST)
	return subresult;

      /* Since all previous fields are PSTs, we ought to be able to track
	 the field offset using poly_ints.  */
      tree bitpos = bit_position (field);
      gcc_assert (poly_int_tree_p (bitpos));

      /* For the same reason, it shouldn't be possible to create a PST field
	 whose offset isn't byte-aligned.  */
      poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
						BITS_PER_UNIT);

      /* Punt if the record is too big to be interesting.  */
      poly_uint64 bytepos;
      if (!wide_bytepos.to_uhwi (&bytepos)
	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
	return DOESNT_MATTER;

      /* Add the individual vectors and predicates in the field to the
	 record's list.  */
      gcc_assert (!field_info.pieces.is_empty ());
      for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
	{
	  piece p = field_info.pieces[i];
	  p.offset += bytepos;
	  add_piece (p);
	}
    }

  /* Empty structures disappear in the language->ABI mapping.  */
  return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
}
/* Add P to the list of pieces in the type.  */
void
pure_scalable_type_info::add_piece (const piece &p)
{
  /* Try to fold the new piece into the previous one to form a
     single-mode PST.  For example, if we see three consecutive vectors
     of the same mode, we can represent them using the corresponding
     3-tuple mode.

     This is purely an optimization.  */
  if (!pieces.is_empty ())
    {
      piece &prev = pieces.last ();
      gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
      unsigned int nelems1, nelems2;
      if (prev.orig_mode == p.orig_mode
	  && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
	  && targetm.array_mode (p.orig_mode,
				 nelems1 + nelems2).exists (&prev.mode))
	{
	  prev.num_zr += p.num_zr;
	  prev.num_pr += p.num_pr;
	  return;
	}
    }
  pieces.quick_push (p);
}
/* Return true if at least one possible value of type TYPE includes at
   least one object of Pure Scalable Type, in the sense of the AAPCS64.

   This is a relatively expensive test for some types, so it should
   generally be made as late as possible.  */
static bool
aarch64_some_values_include_pst_objects_p (const_tree type)
{
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return false;

  if (aarch64_sve::builtin_type_p (type))
    return true;

  if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
    return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));

  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      if (TREE_CODE (field) == FIELD_DECL
	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
	return true;

  return false;
}
/* Return the descriptor of the SIMD ABI.  */
static const predefined_function_abi &
aarch64_simd_abi (void)
{
  predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
  if (!simd_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
	if (FP_SIMD_SAVED_REGNUM_P (regno))
	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
    }
  return simd_abi;
}
/* Return the descriptor of the SVE PCS.  */
static const predefined_function_abi &
aarch64_sve_abi (void)
{
  predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
  if (!sve_abi.initialized_p ())
    {
      HARD_REG_SET full_reg_clobbers
	= default_function_abi.full_reg_clobbers ();
      for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
      sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
    }
  return sve_abi;
}
/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
   wraps, otherwise return X itself.  */
static rtx
strip_salt (rtx x)
{
  rtx search = x;
  if (GET_CODE (search) == CONST)
    search = XEXP (search, 0);
  if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
    x = XVECEXP (search, 0, 0);
  return x;
}

/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
   resulting address.  */
static rtx
strip_offset_and_salt (rtx addr, poly_int64 *offset)
{
  return strip_salt (strip_offset (addr, offset));
}
/* Generate code to enable conditional branches in functions over 1 MiB.  */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
			const char * branch_format)
{
  rtx_code_label * tmp_label = gen_label_rtx ();
  char label_buf[256];
  char buffer[256];
  ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
			       CODE_LABEL_NUMBER (tmp_label));
  const char *label_ptr = targetm.strip_name_encoding (label_buf);
  rtx dest_label = operands[pos_label];
  operands[pos_label] = tmp_label;

  snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
  output_asm_insn (buffer, operands);

  snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
  operands[pos_label] = dest_label;
  output_asm_insn (buffer, operands);
  return "";
}
void
aarch64_err_no_fpadvsimd (machine_mode mode)
{
  if (TARGET_GENERAL_REGS_ONLY)
    if (FLOAT_MODE_P (mode))
      error ("%qs is incompatible with the use of floating-point types",
	     "-mgeneral-regs-only");
    else
      error ("%qs is incompatible with the use of vector types",
	     "-mgeneral-regs-only");
  else
    if (FLOAT_MODE_P (mode))
      error ("%qs feature modifier is incompatible with the use of"
	     " floating-point types", "+nofp");
    else
      error ("%qs feature modifier is incompatible with the use of"
	     " vector types", "+nofp");
}
/* Report when we try to do something that requires SVE when SVE is disabled.
   This is an error of last resort and isn't very high-quality.  It usually
   involves attempts to measure the vector length in some way.  */
static void
aarch64_report_sve_required (void)
{
  static bool reported_p = false;

  /* Avoid reporting a slew of messages for a single oversight.  */
  if (reported_p)
    return;

  error ("this operation requires the SVE ISA extension");
  inform (input_location, "you can enable SVE using the command-line"
	  " option %<-march%>, or by using the %<target%>"
	  " attribute or pragma");
  reported_p = true;
}
/* Return true if REGNO is P0-P15 or one of the special FFR-related
   registers.  */
static inline bool
pr_or_ffr_regnum_p (unsigned int regno)
{
  return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
   The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
   GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
   higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
   and GENERAL_REGS is lower than the memory cost (in this case the best class
   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
   cost results in bad allocations with many redundant int<->FP moves which
   are expensive on various cores.
   To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
   force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
   if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
   POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
   The result of this is that it is no longer inefficient to have a higher
   memory move cost than the register move cost.  */

static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  machine_mode mode;

  if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
      || !reg_class_subset_p (FP_REGS, allocno_class))
    return allocno_class;

  if (!reg_class_subset_p (GENERAL_REGS, best_class)
      || !reg_class_subset_p (FP_REGS, best_class))
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}
static unsigned int
aarch64_min_divisions_for_recip_mul (machine_mode mode)
{
  if (GET_MODE_UNIT_SIZE (mode) == 4)
    return aarch64_tune_params.min_div_recip_mul_sf;
  return aarch64_tune_params.min_div_recip_mul_df;
}
/* Return the reassociation width of treeop OPC with mode MODE.  */
static int
aarch64_reassociation_width (unsigned opc, machine_mode mode)
{
  if (VECTOR_MODE_P (mode))
    return aarch64_tune_params.vec_reassoc_width;
  if (INTEGRAL_MODE_P (mode))
    return aarch64_tune_params.int_reassoc_width;
  /* Reassociation reduces the number of FMAs which may result in worse
     performance.  Use a per-CPU setting for FMA reassociation which allows
     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
     CPUs with many FP pipes to enable reassociation.
     Since the reassociation pass doesn't understand FMA at all, assume
     that any FP addition might turn into FMA.  */
  if (FLOAT_MODE_P (mode))
    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
			    : aarch64_tune_params.fp_reassoc_width;
  return 1;
}
/* Provide a mapping from gcc register numbers to dwarf register numbers.  */
unsigned
aarch64_debugger_regno (unsigned regno)
{
  if (GP_REGNUM_P (regno))
    return AARCH64_DWARF_R0 + regno - R0_REGNUM;
  else if (regno == SP_REGNUM)
    return AARCH64_DWARF_SP;
  else if (FP_REGNUM_P (regno))
    return AARCH64_DWARF_V0 + regno - V0_REGNUM;
  else if (PR_REGNUM_P (regno))
    return AARCH64_DWARF_P0 + regno - P0_REGNUM;
  else if (regno == VG_REGNUM)
    return AARCH64_DWARF_VG;

  /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
     equivalent DWARF register.  */
  return DWARF_FRAME_REGISTERS;
}
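
/* For example, general register Rn maps to AARCH64_DWARF_R0 + n and vector
   register Vn to AARCH64_DWARF_V0 + n, while registers with no DWARF
   equivalent map to DWARF_FRAME_REGISTERS as described above.  */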
/* Implement TARGET_DWARF_FRAME_REG_MODE.  */
static machine_mode
aarch64_dwarf_frame_reg_mode (int regno)
{
  /* Predicate registers are call-clobbered in the EH ABI (which is
     ARM_PCS_AAPCS64), so they should not be described by CFI.
     Their size changes as VL changes, so any values computed by
     __builtin_init_dwarf_reg_size_table might not be valid for
     all frames.  */
  if (PR_REGNUM_P (regno))
    return VOIDmode;
  return default_dwarf_frame_reg_mode (regno);
}
/* Implement TARGET_OUTPUT_CFI_DIRECTIVE.  */
static bool
aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi)
{
  bool found = false;
  if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      fprintf (f, "\t.cfi_negate_ra_state\n");
      found = true;
    }
  return found;
}
/* Implement TARGET_DW_CFI_OPRND1_DESC.  */
static bool
aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc,
			    dw_cfi_oprnd_type &oprnd_type)
{
  if (cfi_opc == DW_CFA_AARCH64_negate_ra_state)
    {
      oprnd_type = dw_cfi_oprnd_unused;
      return true;
    }
  return false;
}
/* If X is a CONST_DOUBLE, return its bit representation as a constant
   integer, otherwise return X unmodified.  */
static rtx
aarch64_bit_representation (rtx x)
{
  if (CONST_DOUBLE_P (x))
    x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
  return x;
}
/* Return an estimate for the number of quadwords in an SVE vector.  This is
   equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
static unsigned int
aarch64_estimated_sve_vq ()
{
  return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
}
/* Return true if MODE is an SVE predicate mode.  */
static bool
aarch64_sve_pred_mode_p (machine_mode mode)
{
  return (TARGET_SVE
	  && (mode == VNx16BImode
	      || mode == VNx8BImode
	      || mode == VNx4BImode
	      || mode == VNx2BImode));
}
/* Three mutually-exclusive flags describing a vector or predicate type.  */
const unsigned int VEC_ADVSIMD  = 1;
const unsigned int VEC_SVE_DATA = 2;
const unsigned int VEC_SVE_PRED = 4;
/* Indicates a structure of 2, 3 or 4 vectors or predicates.  */
const unsigned int VEC_STRUCT   = 8;
/* Can be used in combination with VEC_SVE_DATA to indicate that the
   vector has fewer significant bytes than a full SVE vector.  */
const unsigned int VEC_PARTIAL  = 16;
/* Useful combinations of the above.  */
const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
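
/* For example (see aarch64_classify_vector_mode below), a single SVE data
   vector is classified as VEC_SVE_DATA, a 64-bit or 128-bit Advanced SIMD
   vector as VEC_ADVSIMD, and an SVE tuple of 2-4 vectors as
   VEC_SVE_DATA | VEC_STRUCT.  */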
/* Return a set of flags describing the vector properties of mode MODE.
   If ANY_TARGET_P is false (the default), ignore modes that are not supported
   by the current target.  Otherwise categorize the modes that can be used
   with the set of all targets supported by the port.  */
static unsigned int
aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
{
  if (aarch64_sve_pred_mode_p (mode))
    return VEC_SVE_PRED;

  /* Make the decision based on the mode's enum value rather than its
     properties, so that we keep the correct classification regardless
     of -msve-vector-bits.  */
  switch (mode)
    {
    /* Partial SVE QI vectors.  */
    /* Partial SVE HI vectors.  */
    /* Partial SVE SI vector.  */
    /* Partial SVE HF vectors.  */
    /* Partial SVE BF vectors.  */
    /* Partial SVE SF vector.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;

      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;

    /* x2 SVE vectors.  */
    /* x3 SVE vectors.  */
    /* x4 SVE vectors.  */
      return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;

      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* Structures of 64-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p)
	     ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;

    /* Structures of 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;

    /* 64-bit Advanced SIMD vectors.  */
    /* 128-bit Advanced SIMD vectors.  */
      return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;

      return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;

    default:
      return 0;
    }
}
/* Return true if MODE is any of the Advanced SIMD structure modes.  */
bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
}
/* Return true if MODE is an Advanced SIMD D-register structure mode.  */
static bool
aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode)
	  == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
}
/* Return true if MODE is an Advanced SIMD Q-register structure mode.  */
static bool
aarch64_advsimd_full_struct_mode_p (machine_mode mode)
{
  return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
}
/* Return true if MODE is any of the data vector modes, including
   structure modes.  */
static bool
aarch64_vector_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
}
/* Return true if MODE is any form of SVE mode, including predicates,
   vectors and structures.  */
bool
aarch64_sve_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
}
/* Return true if MODE is an SVE data vector mode; either a single vector
   or a structure of vectors.  */
static bool
aarch64_sve_data_mode_p (machine_mode mode)
{
  return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
}
/* Return the number of defined bytes in one constituent vector of
   SVE mode MODE, which has vector flags VEC_FLAGS.  */
static poly_uint64
aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
{
  if (vec_flags & VEC_PARTIAL)
    /* A single partial vector.  */
    return GET_MODE_SIZE (mode);

  if (vec_flags & VEC_SVE_DATA)
    /* A single vector or a tuple.  */
    return BYTES_PER_SVE_VECTOR;

  /* A single predicate.  */
  gcc_assert (vec_flags & VEC_SVE_PRED);
  return BYTES_PER_SVE_PRED;
}
/* If MODE holds an array of vectors, return the number of vectors
   in the array, otherwise return 1.  */
static unsigned int
aarch64_ldn_stn_vectors (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
  if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
    return exact_div (GET_MODE_SIZE (mode),
		      BYTES_PER_SVE_VECTOR).to_constant ();
  return 1;
}
/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
   corresponding vector structure mode.  */
static opt_machine_mode
aarch64_advsimd_vector_array_mode (machine_mode mode,
				   unsigned HOST_WIDE_INT nelems)
{
  unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
  if (known_eq (GET_MODE_SIZE (mode), 8))
    flags |= VEC_PARTIAL;

  machine_mode struct_mode;
  FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
    if (aarch64_classify_vector_mode (struct_mode) == flags
	&& GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
	&& known_eq (GET_MODE_NUNITS (struct_mode),
		     GET_MODE_NUNITS (mode) * nelems))
      return struct_mode;
  return opt_machine_mode ();
}
/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
opt_machine_mode
aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
	&& known_eq (nunits, GET_MODE_NUNITS (mode))
	&& aarch64_sve_data_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}
/* Implement target hook TARGET_ARRAY_MODE.  */
static opt_machine_mode
aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
    {
      /* Use VNx32BI for pairs of predicates, but explicitly reject giving
	 a mode to other array sizes.  Using integer modes requires a round
	 trip through memory and generates terrible code.  */
      if (mode == VNx16BImode && nelems == 2)
	return VNx32BImode;
      return opt_machine_mode ();
    }

  auto flags = aarch64_classify_vector_mode (mode);
  if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
    return aarch64_sve_data_mode (GET_MODE_INNER (mode),
				  GET_MODE_NUNITS (mode) * nelems);

  if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
    return aarch64_advsimd_vector_array_mode (mode, nelems);

  return opt_machine_mode ();
}
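
/* Illustrative examples of the hook above: an array of two VNx4SImode
   vectors maps (via aarch64_sve_data_mode) to the SVE tuple mode with twice
   the number of units, and a pair of VNx16BImode predicates maps to
   VNx32BImode.  */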
/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
static bool
aarch64_array_mode_supported_p (machine_mode mode,
				unsigned HOST_WIDE_INT nelems)
{
  if (TARGET_BASE_SIMD
      && (AARCH64_VALID_SIMD_QREG_MODE (mode)
	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
      && (nelems >= 2 && nelems <= 4))
    return true;

  return false;
}
/* MODE is some form of SVE vector mode.  For data modes, return the number
   of vector register bits that each element of MODE occupies, such as 64
   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
   in a 64-bit container).  For predicate modes, return the number of
   data bits controlled by each significant predicate bit.  */
static unsigned int
aarch64_sve_container_bits (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
}
/* Return the SVE predicate mode to use for elements that have
   ELEM_NBYTES bytes, if such a mode exists.  */
opt_machine_mode
aarch64_sve_pred_mode (unsigned int elem_nbytes)
{
  if (TARGET_SVE)
    {
      if (elem_nbytes == 1)
	return VNx16BImode;
      if (elem_nbytes == 2)
	return VNx8BImode;
      if (elem_nbytes == 4)
	return VNx4BImode;
      if (elem_nbytes == 8)
	return VNx2BImode;
    }
  return opt_machine_mode ();
}
/* Return the SVE predicate mode that should be used to control
   operations on SVE mode MODE.  */
machine_mode
aarch64_sve_pred_mode (machine_mode mode)
{
  unsigned int bits = aarch64_sve_container_bits (mode);
  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
}
/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
static opt_machine_mode
aarch64_get_mask_mode (machine_mode mode)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_pred_mode (mode);

  return default_get_mask_mode (mode);
}
/* Return the integer element mode associated with SVE mode MODE.  */
static scalar_int_mode
aarch64_sve_element_int_mode (machine_mode mode)
{
  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
			     ? BITS_PER_SVE_VECTOR
			     : GET_MODE_BITSIZE (mode));
  unsigned int elt_bits = vector_element_size (vector_bits,
					       GET_MODE_NUNITS (mode));
  return int_mode_for_size (elt_bits, 0).require ();
}
/* Return an integer element mode that contains exactly
   aarch64_sve_container_bits (MODE) bits.  This is wider than
   aarch64_sve_element_int_mode if MODE is a partial vector,
   otherwise it's the same.  */
static scalar_int_mode
aarch64_sve_container_int_mode (machine_mode mode)
{
  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
}
/* Return the integer vector mode associated with SVE mode MODE.
   Unlike related_int_vector_mode, this can handle the case in which
   MODE is a predicate (and thus has a different total size).  */
static machine_mode
aarch64_sve_int_mode (machine_mode mode)
{
  scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
  return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
}
/* Look for a vector mode with the same classification as VEC_MODE,
   but with each group of FACTOR elements coalesced into a single element.
   In other words, look for a mode in which the elements are FACTOR times
   larger and in which the number of elements is FACTOR times smaller.

   Return the mode found, if one exists.  */
static opt_machine_mode
aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
{
  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
				       GET_MODE_NUNITS (vec_mode));
  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
  if (vec_flags & VEC_SVE_PRED)
    {
      if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
	return aarch64_sve_pred_mode (elt_bits * factor);
      return opt_machine_mode ();
    }

  scalar_mode new_elt_mode;
  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
    return opt_machine_mode ();

  if (vec_flags == VEC_ADVSIMD)
    {
      auto mode = aarch64_simd_container_mode (new_elt_mode,
					       GET_MODE_BITSIZE (vec_mode));
      if (mode != word_mode)
	return mode;
    }
  else if (vec_flags & VEC_SVE_DATA)
    {
      poly_uint64 new_nunits;
      if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
	return aarch64_sve_data_mode (new_elt_mode, new_nunits);
    }

  return opt_machine_mode ();
}
/* Implement TARGET_VECTORIZE_RELATED_MODE.  */
static opt_machine_mode
aarch64_vectorize_related_mode (machine_mode vector_mode,
				scalar_mode element_mode,
				poly_uint64 nunits)
{
  unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);

  /* If we're operating on SVE vectors, try to return an SVE mode.  */
  poly_uint64 sve_nunits;
  if ((vec_flags & VEC_SVE_DATA)
      && multiple_p (BYTES_PER_SVE_VECTOR,
		     GET_MODE_SIZE (element_mode), &sve_nunits))
    {
      machine_mode sve_mode;
      if (maybe_ne (nunits, 0U))
	{
	  /* Try to find a full or partial SVE mode with exactly
	     NUNITS units.  */
	  if (multiple_p (sve_nunits, nunits)
	      && aarch64_sve_data_mode (element_mode,
					nunits).exists (&sve_mode))
	    return sve_mode;
	}
      else
	{
	  /* Take the preferred number of units from the number of bytes
	     that fit in VECTOR_MODE.  We always start by "autodetecting"
	     a full vector mode with preferred_simd_mode, so vectors
	     chosen here will also be full vector modes.  Then
	     autovectorize_vector_modes tries smaller starting modes
	     and thus smaller preferred numbers of units.  */
	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
	  if (aarch64_sve_data_mode (element_mode,
				     sve_nunits).exists (&sve_mode))
	    return sve_mode;
	}
    }

  /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
  if (TARGET_SIMD
      && (vec_flags & VEC_ADVSIMD)
      && known_eq (nunits, 0U)
      && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
      && maybe_ge (GET_MODE_BITSIZE (element_mode)
		   * GET_MODE_NUNITS (vector_mode), 128U))
    {
      machine_mode res = aarch64_simd_container_mode (element_mode, 128);
      if (VECTOR_MODE_P (res))
	return res;
    }

  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}
/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */

static bool
aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
{
  machine_mode mode = TYPE_MODE (type);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  bool sve_p = (vec_flags & VEC_ANY_SVE);
  bool simd_p = (vec_flags & VEC_ADVSIMD);

  return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
}

/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
   prefer to use the first arithmetic operand as the else value if
   the else value doesn't matter, since that exactly matches the SVE
   destructive merging form.  For ternary operations we could either
   pick the first operand and use FMAD-like instructions or the last
   operand and use FMLA-like instructions; the latter seems more
   natural.  */

static tree
aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
{
  return nops == 3 ? ops[2] : ops[0];
}
/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
{
  /* ??? Logically we should only need to provide a value when
     HARD_REGNO_MODE_OK says that the combination is valid,
     but at the moment we need to handle all modes.  Just ignore
     any runtime parts for registers that can't store them.  */
  HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
  switch (aarch64_regno_regclass (regno))
    {
    case FP_REGS:
    case FP_LO_REGS:
    case FP_LO8_REGS:
      {
	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
	if (vec_flags & VEC_SVE_DATA)
	  return exact_div (GET_MODE_SIZE (mode),
			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
	if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
	  return GET_MODE_SIZE (mode).to_constant () / 8;
	return CEIL (lowest_size, UNITS_PER_VREG);
      }

    case PR_REGS:
    case PR_LO_REGS:
    case PR_HI_REGS:
      return mode == VNx32BImode ? 2 : 1;

    case MOVEABLE_SYSREGS:
    case FFR_REGS:
    case PR_AND_FFR_REGS:
    case FAKE_REGS:
      return 1;

    default:
      return CEIL (lowest_size, UNITS_PER_WORD);
    }
  gcc_unreachable ();
}
/* Implement TARGET_HARD_REGNO_MODE_OK.  */

static bool
aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
{
  if (mode == V8DImode)
    return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
	   && multiple_p (regno - R0_REGNUM, 2);

  if (GET_MODE_CLASS (mode) == MODE_CC)
    return regno == CC_REGNUM;

  if (regno == VG_REGNUM)
    /* This must have the same size as _Unwind_Word.  */
    return mode == DImode;

  if (regno == FPM_REGNUM)
    return mode == QImode || mode == HImode || mode == SImode || mode == DImode;

  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == VEC_SVE_PRED)
    return pr_or_ffr_regnum_p (regno);

  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
    return PR_REGNUM_P (regno);

  if (pr_or_ffr_regnum_p (regno))
    return false;

  /* These registers are abstract; their modes don't matter.  */
  if (FAKE_REGNUM_P (regno))
    return true;

  if (regno == SP_REGNUM)
    /* The purpose of comparing with ptr_mode is to support the
       global register variable associated with the stack pointer
       register via the syntax of asm ("wsp") in ILP32.  */
    return mode == Pmode || mode == ptr_mode;

  if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
    return mode == Pmode;

  if (GP_REGNUM_P (regno))
    {
      if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
	return false;
      if (known_le (GET_MODE_SIZE (mode), 8))
	return true;
      if (known_le (GET_MODE_SIZE (mode), 16))
	return (regno & 1) == 0;
    }
  else if (FP_REGNUM_P (regno))
    {
      if (vec_flags & VEC_STRUCT)
	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
      else
	return !VECTOR_MODE_P (mode) || vec_flags != 0;
    }

  return false;
}
/* Return true if a function with type FNTYPE returns its value in
   SVE vector or predicate registers.  */

static bool
aarch64_returns_value_in_sve_regs_p (const_tree fntype)
{
  tree return_type = TREE_TYPE (fntype);

  pure_scalable_type_info pst_info;
  switch (pst_info.analyze (return_type))
    {
    case pure_scalable_type_info::IS_PST:
      return (pst_info.num_zr () <= NUM_FP_ARG_REGS
	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);

    case pure_scalable_type_info::DOESNT_MATTER:
      gcc_assert (aarch64_return_in_memory_1 (return_type));
      return false;

    case pure_scalable_type_info::NO_ABI_IDENTITY:
    case pure_scalable_type_info::ISNT_PST:
      return false;
    }
  gcc_unreachable ();
}
/* Return true if a function with type FNTYPE takes arguments in
   SVE vector or predicate registers.  */

static bool
aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
{
  CUMULATIVE_ARGS args_so_far_v;
  aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
				NULL_TREE, 0, true);
  cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);

  for (tree chain = TYPE_ARG_TYPES (fntype);
       chain && chain != void_list_node;
       chain = TREE_CHAIN (chain))
    {
      tree arg_type = TREE_VALUE (chain);
      if (arg_type == error_mark_node)
	return false;

      function_arg_info arg (arg_type, /*named=*/true);
      apply_pass_by_reference_rules (&args_so_far_v, arg);
      pure_scalable_type_info pst_info;
      if (pst_info.analyze_registers (arg.type))
	{
	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
	  return true;
	}

      targetm.calls.function_arg_advance (args_so_far, arg);
    }
  return false;
}
/* Implement TARGET_FNTYPE_ABI.  */

static const predefined_function_abi &
aarch64_fntype_abi (const_tree fntype)
{
  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
    return aarch64_simd_abi ();

  if (aarch64_returns_value_in_sve_regs_p (fntype)
      || aarch64_takes_arguments_in_sve_regs_p (fntype))
    return aarch64_sve_abi ();

  return default_function_abi;
}

/* Return the state of PSTATE.SM on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_sm (const_tree fntype)
{
  if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
    return AARCH64_ISA_MODE_SM_ON;

  if (lookup_attribute ("arm", "streaming_compatible",
			TYPE_ATTRIBUTES (fntype)))
    return 0;

  return AARCH64_ISA_MODE_SM_OFF;
}

/* Return state flags that describe whether and how functions of type
   FNTYPE share state STATE_NAME with their callers.  */

static unsigned int
aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
{
  return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
					    state_name);
}

/* Return the state of PSTATE.ZA on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_pstate_za (const_tree fntype)
{
  if (aarch64_fntype_shared_flags (fntype, "za")
      || aarch64_fntype_shared_flags (fntype, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return 0;
}

/* Return the ISA mode on entry to functions of type FNTYPE.  */

static aarch64_isa_mode
aarch64_fntype_isa_mode (const_tree fntype)
{
  return (aarch64_fntype_pstate_sm (fntype)
	  | aarch64_fntype_pstate_za (fntype));
}
/* Return true if FNDECL uses streaming mode internally, as an
   implementation choice.  */

static bool
aarch64_fndecl_is_locally_streaming (const_tree fndecl)
{
  return lookup_attribute ("arm", "locally_streaming",
			   DECL_ATTRIBUTES (fndecl));
}

/* Return the state of PSTATE.SM when compiling the body of
   function FNDECL.  This might be different from the state of
   PSTATE.SM on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_sm (const_tree fndecl)
{
  if (aarch64_fndecl_is_locally_streaming (fndecl))
    return AARCH64_ISA_MODE_SM_ON;

  return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
}

/* Return true if function FNDECL has state STATE_NAME, either by creating
   new state itself or by sharing state with callers.  */

static bool
aarch64_fndecl_has_state (tree fndecl, const char *state_name)
{
  return (aarch64_fndecl_has_new_state (fndecl, state_name)
	  || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
					  state_name) != 0);
}

/* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
   This might be different from the state of PSTATE.ZA on entry.  */

static aarch64_isa_mode
aarch64_fndecl_pstate_za (const_tree fndecl)
{
  if (aarch64_fndecl_has_new_state (fndecl, "za")
      || aarch64_fndecl_has_new_state (fndecl, "zt0"))
    return AARCH64_ISA_MODE_ZA_ON;

  return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
}

/* Return the ISA mode that should be used to compile the body of
   function FNDECL.  */

static aarch64_isa_mode
aarch64_fndecl_isa_mode (const_tree fndecl)
{
  return (aarch64_fndecl_pstate_sm (fndecl)
	  | aarch64_fndecl_pstate_za (fndecl));
}
/* Return the state of PSTATE.SM on entry to the current function.
   This might be different from the state of PSTATE.SM in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_sm ()
{
  return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
}

/* Return the state of PSTATE.ZA on entry to the current function.
   This might be different from the state of PSTATE.ZA in the function
   body.  */

static aarch64_isa_mode
aarch64_cfun_incoming_pstate_za ()
{
  return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
}

/* Return state flags that describe whether and how the current function shares
   state STATE_NAME with callers.  */

static unsigned int
aarch64_cfun_shared_flags (const char *state_name)
{
  return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
}

/* Return true if the current function creates new state of type STATE_NAME
   (as opposed to sharing the state with its callers or ignoring the state
   altogether).  */

static bool
aarch64_cfun_has_new_state (const char *state_name)
{
  return aarch64_fndecl_has_new_state (cfun->decl, state_name);
}

/* Return true if PSTATE.SM is 1 in the body of the current function,
   but is not guaranteed to be 1 on entry.  */

static bool
aarch64_cfun_enables_pstate_sm ()
{
  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
	  && aarch64_cfun_incoming_pstate_sm () != AARCH64_ISA_MODE_SM_ON);
}

/* Return true if the current function has state STATE_NAME, either by
   creating new state itself or by sharing state with callers.  */

static bool
aarch64_cfun_has_state (const char *state_name)
{
  return aarch64_fndecl_has_state (cfun->decl, state_name);
}

/* Return true if a call from the current function to a function with
   ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
   the BL instruction.  */

static bool
aarch64_call_switches_pstate_sm (aarch64_isa_mode callee_mode)
{
  return (bool) (callee_mode & ~AARCH64_ISA_MODE & AARCH64_ISA_MODE_SM_STATE);
}
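
/* For example, a caller whose body runs with PSTATE.SM known to be 0
   that calls an arm::streaming callee sees AARCH64_ISA_MODE_SM_ON in
   CALLEE_MODE but not in AARCH64_ISA_MODE, so the test above is true
   and the call must be wrapped in an SMSTART SM/SMSTOP SM pair.  */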
/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */

static bool
aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
{
  return (aarch64_sve::builtin_type_p (type1)
	  == aarch64_sve::builtin_type_p (type2));
}

/* Return true if we should emit CFI for register REGNO.  */

static bool
aarch64_emit_cfi_for_reg_p (unsigned int regno)
{
  return (GP_REGNUM_P (regno)
	  || !default_function_abi.clobbers_full_reg_p (regno));
}

/* Return the mode we should use to save and restore register REGNO.  */

static machine_mode
aarch64_reg_save_mode (unsigned int regno)
{
  if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
    return DImode;

  if (FP_REGNUM_P (regno))
    switch (crtl->abi->id ())
      {
      case ARM_PCS_AAPCS64:
	/* Only the low 64 bits are saved by the base PCS.  */
	return DFmode;

      case ARM_PCS_SIMD:
	/* The vector PCS saves the low 128 bits (which is the full
	   register on non-SVE targets).  */
	return V16QImode;

      case ARM_PCS_SVE:
	/* Use vectors of DImode for registers that need frame
	   information, so that the first 64 bytes of the save slot
	   are always the equivalent of what storing D<n> would give.  */
	if (aarch64_emit_cfi_for_reg_p (regno))
	  return VNx2DImode;

	/* Use vectors of bytes otherwise, so that the layout is
	   endian-agnostic, and so that we can use LDR and STR for
	   big-endian targets.  */
	return VNx16QImode;

      case ARM_PCS_TLSDESC:
      case ARM_PCS_UNKNOWN:
	break;
      }

  if (PR_REGNUM_P (regno))
    /* Save the full predicate register.  */
    return VNx16BImode;

  gcc_unreachable ();
}
/* Given the ISA mode on entry to a callee and the ABI of the callee,
   return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx.  */

rtx
aarch64_gen_callee_cookie (aarch64_isa_mode isa_mode, arm_pcs pcs_variant)
{
  return gen_int_mode ((unsigned int) isa_mode
		       | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
		       DImode);
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   callee's ABI.  */

static const predefined_function_abi &
aarch64_callee_abi (rtx cookie)
{
  return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
}

/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
   required ISA mode on entry to the callee, which is also the ISA
   mode on return from the callee.  */

static aarch64_isa_mode
aarch64_callee_isa_mode (rtx cookie)
{
  return UINTVAL (cookie) & ((1 << AARCH64_NUM_ISA_MODES) - 1);
}

/* INSN is a call instruction.  Return the CONST_INT stored in its
   UNSPEC_CALLEE_ABI rtx.  */

static rtx
aarch64_insn_callee_cookie (const rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_assert (GET_CODE (pat) == PARALLEL);
  rtx unspec = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (unspec) == UNSPEC
	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
  return XVECEXP (unspec, 0, 0);
}

/* Implement TARGET_INSN_CALLEE_ABI.  */

const predefined_function_abi &
aarch64_insn_callee_abi (const rtx_insn *insn)
{
  return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
}

/* INSN is a call instruction.  Return the required ISA mode on entry to
   the callee, which is also the ISA mode on return from the callee.  */

static aarch64_isa_mode
aarch64_insn_callee_isa_mode (const rtx_insn *insn)
{
  return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
}
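
/* A worked example of the cookie encoding above: for a callee that
   follows ARM_PCS_SVE and must be entered with streaming mode enabled,
   aarch64_gen_callee_cookie returns
     (ARM_PCS_SVE << AARCH64_NUM_ISA_MODES) | AARCH64_ISA_MODE_SM_ON,
   from which aarch64_callee_abi recovers the ABI (upper bits) and
   aarch64_callee_isa_mode recovers the ISA mode (low bits).  */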
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
   the lower 64 bits of a 128-bit register.  Tell the compiler the callee
   clobbers the top 64 bits when restoring the bottom 64 bits.  */

static bool
aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
					unsigned int regno,
					machine_mode mode)
{
  if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
    {
      poly_int64 per_register_size = GET_MODE_SIZE (mode);
      unsigned int nregs = hard_regno_nregs (regno, mode);
      if (nregs > 1)
	per_register_size = exact_div (per_register_size, nregs);
      if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
	return maybe_gt (per_register_size, 16);
      return maybe_gt (per_register_size, 8);
    }
  return false;
}
/* Implement REGMODE_NATURAL_SIZE.  */
poly_uint64
aarch64_regmode_natural_size (machine_mode mode)
{
  /* The natural size for SVE data modes is one SVE data vector,
     and similarly for predicates.  We can't independently modify
     anything smaller than that.  */
  /* ??? For now, only do this for variable-width SVE registers.
     Doing it for constant-sized registers breaks lower-subreg.cc.  */
  /* ??? And once that's fixed, we should probably have similar
     code for Advanced SIMD.  */
  if (!aarch64_sve_vg.is_constant ())
    {
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_SVE_PRED)
	return BYTES_PER_SVE_PRED;
      if (vec_flags & VEC_SVE_DATA)
	return BYTES_PER_SVE_VECTOR;
    }
  return UNITS_PER_WORD;
}
/* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
machine_mode
aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
				     machine_mode mode)
{
  /* The predicate mode determines which bits are significant and
     which are "don't care".  Decreasing the number of lanes would
     lose data while increasing the number of lanes would make bits
     unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_lt (GET_MODE_SIZE (mode), 4)
      && REG_CAN_CHANGE_MODE_P (regno, mode, SImode)
      && REG_CAN_CHANGE_MODE_P (regno, SImode, mode))
    return SImode;
  return mode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}
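
/* For instance, 0xffff000000000000 (== -(HOST_WIDE_INT_1 << 48)) and -1
   satisfy aarch64_high_bits_all_ones_p, whereas 0x00ff000000000000 does
   not, because its negation is not a power of two.  */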
/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (ie called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (ie called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   plt stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}
/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for register 0 in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode cmp_mode = GET_MODE (x);
  machine_mode cc_mode;
  rtx cc_reg;

  if (cmp_mode == TImode)
    {
      gcc_assert (code == NE);

      cc_mode = CC_NZmode;
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);

      rtx x_lo = operand_subword (x, 0, 0, TImode);
      rtx y_lo = operand_subword (y, 0, 0, TImode);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));

      rtx x_hi = operand_subword (x, 1, 0, TImode);
      rtx y_hi = operand_subword (y, 1, 0, TImode);
      emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
			       GEN_INT (AARCH64_EQ)));
    }
  else
    {
      cc_mode = SELECT_CC_MODE (code, x, y);
      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
    }
  return cc_reg;
}
/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */

rtx
aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
				  machine_mode y_mode)
{
  if (y_mode == E_QImode || y_mode == E_HImode)
    {
      if (CONST_INT_P (y))
	{
	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
	  y_mode = SImode;
	}
      else
	{
	  rtx t, cc_reg;
	  machine_mode cc_mode;

	  t = gen_rtx_ZERO_EXTEND (SImode, y);
	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
	  cc_mode = CC_SWPmode;
	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
	  emit_set_insn (cc_reg, t);
	  return cc_reg;
	}
    }

  if (!aarch64_plus_operand (y, y_mode))
    y = force_reg (y_mode, y);

  return aarch64_gen_compare_reg (code, x, y);
}
/* Generate conditional branch to LABEL, comparing X to 0 using CODE.
   Return the jump instruction.  */

rtx
aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
				     rtx_code_label *label)
{
  if (aarch64_track_speculation)
    {
      /* Emit an explicit compare instruction, so that we can correctly
	 track the condition codes.  */
      rtx cc_reg = aarch64_gen_compare_reg (code, x, const0_rtx);
      x = gen_rtx_fmt_ee (code, GET_MODE (cc_reg), cc_reg, const0_rtx);
    }
  else
    x = gen_rtx_fmt_ee (code, VOIDmode, x, const0_rtx);

  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
  return gen_rtx_SET (pc_rtx, x);
}

/* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
   If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
   it branches to LABEL when the bit is clear.  */

rtx
aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
			     rtx_code_label *label)
{
  auto mode = GET_MODE (x);
  if (aarch64_track_speculation)
    {
      auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
      emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
      rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
      rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
      return gen_condjump (x, cc_reg, label);
    }
  return gen_aarch64_tb (code, mode, mode,
			 x, gen_int_mode (bitnum, mode), label);
}
/* Consider the operation:

     OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]

   where:

   - CODE is [SU]MAX or [SU]MIN
   - OPERANDS[2] and OPERANDS[3] are constant integers
   - OPERANDS[3] is a positive or negative shifted 12-bit immediate
   - all operands have mode MODE

   Decide whether it is possible to implement the operation using:

     SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
   or
     ADDS <tmp>, OPERANDS[1], OPERANDS[3]

   followed by:

     <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>

   where <insn> is one of CSEL, CSINV or CSINC.  Return true if so.
   If GENERATE_P is true, also update OPERANDS as follows:

     OPERANDS[4] = -OPERANDS[3]
     OPERANDS[5] = the rtl condition representing <cond>
     OPERANDS[6] = <tmp>
     OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC.  */
bool
aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
{
  signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
  rtx dst = operands[0];
  rtx maxmin_op = operands[2];
  rtx add_op = operands[3];
  machine_mode mode = GET_MODE (dst);

  /* max (x, y) - z == (x >= y + 1 ? x : y) - z
		    == (x >= y ? x : y) - z
		    == (x > y ? x : y) - z
		    == (x > y - 1 ? x : y) - z

     min (x, y) - z == (x <= y - 1 ? x : y) - z
		    == (x <= y ? x : y) - z
		    == (x < y ? x : y) - z
		    == (x < y + 1 ? x : y) - z

     Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
     which x is compared with z.  Set DIFF to y - z.  Thus the supported
     combinations are as follows, with DIFF being the value after the ":":

     max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1   [z == y + 1]
		    == x >= y ? x - y : 0              [z == y]
		    == x > y ? x - y : 0               [z == y]
		    == x > y - 1 ? x - (y - 1) : 1     [z == y - 1]

     min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1    [z == y - 1]
		    == x <= y ? x - y : 0              [z == y]
		    == x < y ? x - y : 0               [z == y]
		    == x < y + 1 ? x - (y + 1) : -1    [z == y + 1].  */
  auto maxmin_val = rtx_mode_t (maxmin_op, mode);
  auto add_val = rtx_mode_t (add_op, mode);
  auto sub_val = wi::neg (add_val);
  auto diff = wi::sub (maxmin_val, sub_val);
  if (!(diff == 0
	|| (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
	|| (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
    return false;

  if (!generate_p)
    return true;

  rtx_code cmp;
  switch (code)
    {
    case SMAX:
      cmp = diff == 1 ? GT : GE;
      break;
    case UMAX:
      cmp = diff == 1 ? GTU : GEU;
      break;
    case SMIN:
      cmp = diff == -1 ? LT : LE;
      break;
    case UMIN:
      cmp = diff == -1 ? LTU : LEU;
      break;
    default:
      gcc_unreachable ();
    }
  rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);

  operands[4] = immed_wide_int_const (sub_val, mode);
  operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
  if (can_create_pseudo_p ())
    operands[6] = gen_reg_rtx (mode);
  else
    operands[6] = dst;
  operands[7] = immed_wide_int_const (diff, mode);

  return true;
}
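
/* A worked example of the transformation above: for
     OPERANDS[0] = SMAX (OPERANDS[1], 7) + (-8)
   we have sub_val == 8 and diff == -1, which is accepted because
   7 < 8 (signed).  The expansion is

     subs    <tmp>, <op1>, #8
     csinv   <op0>, <tmp>, xzr, ge

   i.e. <op1> - 8 when <op1> >= 8, and -1 (== 7 - 8) otherwise,
   with OPERANDS[7] == -1 selecting CSINV.  */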
/* Build the SYMBOL_REF for __tls_get_addr.  */

static GTY(()) rtx tls_get_addr_libfunc;

rtx
aarch64_tls_get_addr (void)
{
  if (!tls_get_addr_libfunc)
    tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
  return tls_get_addr_libfunc;
}

/* Return the TLS model to use for ADDR.  */

static enum tls_model
tls_symbolic_operand_type (rtx addr)
{
  enum tls_model tls_kind = TLS_MODEL_NONE;
  poly_int64 offset;
  addr = strip_offset_and_salt (addr, &offset);
  if (SYMBOL_REF_P (addr))
    tls_kind = SYMBOL_REF_TLS_MODEL (addr);

  return tls_kind;
}
2896 /* We'll allow lo_sum's in addresses in our legitimate addresses
2897 so that combine would take care of combining addresses where
2898 necessary, but for generation purposes, we'll generate the address
2901 tmp = hi (symbol_ref); adrp x1, foo
2902 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2906 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2907 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2911 Load TLS symbol, depending on TLS mechanism and TLS access model.
2913 Global Dynamic - Traditional TLS:
2914 adrp tmp, :tlsgd:imm
2915 add dest, tmp, #:tlsgd_lo12:imm
2918 Global Dynamic - TLS Descriptors:
2919 adrp dest, :tlsdesc:imm
2920 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2921 add dest, dest, #:tlsdesc_lo12:imm
2928 adrp tmp, :gottprel:imm
2929 ldr dest, [tmp, #:gottprel_lo12:imm]
2934 add t0, tp, #:tprel_hi12:imm, lsl #12
2935 add t0, t0, #:tprel_lo12_nc:imm
2939 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
2940 enum aarch64_symbol_type type
)
2943 rtx tmp
= legitimize_pe_coff_symbol (imm
, true);
2946 emit_insn (gen_rtx_SET (dest
, tmp
));
2953 case SYMBOL_SMALL_ABSOLUTE
:
2955 /* In ILP32, the mode of dest can be either SImode or DImode. */
2957 machine_mode mode
= GET_MODE (dest
);
2959 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
2961 if (can_create_pseudo_p ())
2962 tmp_reg
= gen_reg_rtx (mode
);
2964 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, copy_rtx (imm
)));
2965 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
2969 case SYMBOL_TINY_ABSOLUTE
:
2970 emit_insn (gen_rtx_SET (dest
, imm
));
2973 case SYMBOL_SMALL_GOT_28K
:
2975 machine_mode mode
= GET_MODE (dest
);
2976 rtx gp_rtx
= pic_offset_table_rtx
;
2980 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2981 here before rtl expand. Tree IVOPT will generate rtl pattern to
2982 decide rtx costs, in which case pic_offset_table_rtx is not
2983 initialized. For that case no need to generate the first adrp
2984 instruction as the final cost for global variable access is
2988 /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2989 using the page base as GOT base, the first page may be wasted,
2990 in the worst scenario, there is only 28K space for GOT).
2992 The generate instruction sequence for accessing global variable
2995 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2997 Only one instruction needed. But we must initialize
2998 pic_offset_table_rtx properly. We generate initialize insn for
2999 every global access, and allow CSE to remove all redundant.
3001 The final instruction sequences will look like the following
3002 for multiply global variables access.
3004 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3006 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3007 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3008 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3011 rtx s
= gen_rtx_SYMBOL_REF (Pmode
, "_GLOBAL_OFFSET_TABLE_");
3012 crtl
->uses_pic_offset_table
= 1;
3013 emit_move_insn (gp_rtx
, gen_rtx_HIGH (Pmode
, s
));
3015 if (mode
!= GET_MODE (gp_rtx
))
3016 gp_rtx
= gen_lowpart (mode
, gp_rtx
);
3020 if (mode
== ptr_mode
)
3023 insn
= gen_ldr_got_small_28k_di (dest
, gp_rtx
, imm
);
3025 insn
= gen_ldr_got_small_28k_si (dest
, gp_rtx
, imm
);
3027 mem
= XVECEXP (SET_SRC (insn
), 0, 0);
3031 gcc_assert (mode
== Pmode
);
3033 insn
= gen_ldr_got_small_28k_sidi (dest
, gp_rtx
, imm
);
3034 mem
= XVECEXP (XEXP (SET_SRC (insn
), 0), 0, 0);
3037 /* The operand is expected to be MEM. Whenever the related insn
3038 pattern changed, above code which calculate mem should be
3040 gcc_assert (MEM_P (mem
));
3041 MEM_READONLY_P (mem
) = 1;
3042 MEM_NOTRAP_P (mem
) = 1;
3047 case SYMBOL_SMALL_GOT_4G
:
3048 emit_insn (gen_rtx_SET (dest
, imm
));
3051 case SYMBOL_SMALL_TLSGD
:
3054 /* The return type of __tls_get_addr is the C pointer type
3056 rtx result
= gen_rtx_REG (ptr_mode
, R0_REGNUM
);
3059 if (GET_MODE (dest
) != ptr_mode
)
3060 tmp_reg
= can_create_pseudo_p () ? gen_reg_rtx (ptr_mode
) : result
;
3063 if (ptr_mode
== SImode
)
3064 aarch64_emit_call_insn (gen_tlsgd_small_si (result
, imm
));
3066 aarch64_emit_call_insn (gen_tlsgd_small_di (result
, imm
));
3067 insns
= get_insns ();
3070 RTL_CONST_CALL_P (insns
) = 1;
3071 emit_libcall_block (insns
, tmp_reg
, result
, imm
);
3072 /* Convert back to the mode of the dest adding a zero_extend
3073 from SImode (ptr_mode) to DImode (Pmode). */
3074 if (dest
!= tmp_reg
)
3075 convert_move (dest
, tmp_reg
, true);
3079 case SYMBOL_SMALL_TLSDESC
:
3081 machine_mode mode
= GET_MODE (dest
);
3082 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
3085 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
3087 /* In ILP32, the got entry is always of SImode size. Unlike
3088 small GOT, the dest is fixed at reg 0. */
3090 emit_insn (gen_tlsdesc_small_si (imm
));
3092 emit_insn (gen_tlsdesc_small_di (imm
));
3093 tp
= aarch64_load_tp (NULL
);
3096 tp
= gen_lowpart (mode
, tp
);
3098 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
3100 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3104 case SYMBOL_SMALL_TLSIE
:
3106 /* In ILP32, the mode of dest can be either SImode or DImode,
3107 while the got entry is always of SImode size. The mode of
3108 dest depends on how dest is used: if dest is assigned to a
3109 pointer (e.g. in the memory), it has SImode; it may have
3110 DImode if dest is dereferenced to access the memeory.
3111 This is why we have to handle three different tlsie_small
3112 patterns here (two patterns for ILP32). */
3113 machine_mode mode
= GET_MODE (dest
);
3114 rtx tmp_reg
= gen_reg_rtx (mode
);
3115 rtx tp
= aarch64_load_tp (NULL
);
3117 if (mode
== ptr_mode
)
3120 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
3123 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
3124 tp
= gen_lowpart (mode
, tp
);
3129 gcc_assert (mode
== Pmode
);
3130 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
3133 emit_insn (gen_rtx_SET (dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
3135 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3139 case SYMBOL_TLSLE12
:
3140 case SYMBOL_TLSLE24
:
3141 case SYMBOL_TLSLE32
:
3142 case SYMBOL_TLSLE48
:
3144 machine_mode mode
= GET_MODE (dest
);
3145 rtx tp
= aarch64_load_tp (NULL
);
3148 tp
= gen_lowpart (mode
, tp
);
3152 case SYMBOL_TLSLE12
:
3153 emit_insn ((mode
== DImode
? gen_tlsle12_di
: gen_tlsle12_si
)
3156 case SYMBOL_TLSLE24
:
3157 emit_insn ((mode
== DImode
? gen_tlsle24_di
: gen_tlsle24_si
)
3160 case SYMBOL_TLSLE32
:
3161 emit_insn ((mode
== DImode
? gen_tlsle32_di
: gen_tlsle32_si
)
3163 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3166 case SYMBOL_TLSLE48
:
3167 emit_insn ((mode
== DImode
? gen_tlsle48_di
: gen_tlsle48_si
)
3169 emit_insn ((mode
== DImode
? gen_adddi3
: gen_addsi3
)
3177 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
3181 case SYMBOL_TINY_GOT
:
3184 machine_mode mode
= GET_MODE (dest
);
3186 if (mode
== ptr_mode
)
3187 insn
= gen_ldr_got_tiny (mode
, dest
, imm
);
3190 gcc_assert (mode
== Pmode
);
3191 insn
= gen_ldr_got_tiny_sidi (dest
, imm
);
3198 case SYMBOL_TINY_TLSIE
:
3200 machine_mode mode
= GET_MODE (dest
);
3201 rtx tp
= aarch64_load_tp (NULL
);
3203 if (mode
== ptr_mode
)
3206 emit_insn (gen_tlsie_tiny_di (dest
, imm
, tp
));
3209 tp
= gen_lowpart (mode
, tp
);
3210 emit_insn (gen_tlsie_tiny_si (dest
, imm
, tp
));
3215 gcc_assert (mode
== Pmode
);
3216 emit_insn (gen_tlsie_tiny_sidi (dest
, imm
, tp
));
3220 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
/* Emit a move from SRC to DEST.  Assume that the move expanders can
   handle all moves if !can_create_pseudo_p ().  The distinction is
   important because, unlike emit_move_insn, the move expanders know
   how to force Pmode objects into the constant pool even when the
   constant pool address is not itself legitimate.  */
rtx_insn *
aarch64_emit_move (rtx dest, rtx src)
{
  return (can_create_pseudo_p ()
	  ? emit_move_insn (dest, src)
	  : emit_move_insn_1 (dest, src));
}

/* Apply UNOPTAB to OP and store the result in DEST.  */

static void
aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
{
  rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */

static void
aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
{
  rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
			  OPTAB_DIRECT);
  if (dest != tmp)
    emit_move_insn (dest, tmp);
}

/* Split a move from SRC to DST into two moves of mode SINGLE_MODE.  */

void
aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
{
  machine_mode mode = GET_MODE (dst);

  rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
  rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
				  GET_MODE_SIZE (single_mode));
  rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
  rtx src1 = simplify_gen_subreg (single_mode, src, mode,
				  GET_MODE_SIZE (single_mode));

  /* At most one pairing may overlap.  */
  if (reg_overlap_mentioned_p (dst0, src1))
    {
      aarch64_emit_move (dst1, src1);
      aarch64_emit_move (dst0, src0);
    }
  else
    {
      aarch64_emit_move (dst0, src0);
      aarch64_emit_move (dst1, src1);
    }
}
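
/* As an illustration of the overlap handling above: splitting a TImode
   copy from the register pair {x1, x2} to {x2, x3} would clobber x2
   (the source of the high half) if the low halves were moved first,
   so the overlap check emits the high-half move before the low-half
   move in that case.  */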
/* Split a 128-bit move operation into two 64-bit move operations,
   taking care to handle partial overlap of register to register
   copies.  Special cases are needed when moving between GP regs and
   FP regs.  SRC can be a register, constant or memory; DST a register
   or memory.  If either operand is memory it must not have any side
   effects.  */
void
aarch64_split_128bit_move (rtx dst, rtx src)
{
  machine_mode mode = GET_MODE (dst);

  gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
  gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
  gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);

  if (REG_P (dst) && REG_P (src))
    {
      int src_regno = REGNO (src);
      int dst_regno = REGNO (dst);

      /* Handle FP <-> GP regs.  */
      if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
	{
	  rtx src_lo = gen_lowpart (word_mode, src);
	  rtx src_hi = gen_highpart (word_mode, src);

	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
	  return;
	}
      else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
	{
	  rtx dst_lo = gen_lowpart (word_mode, dst);
	  rtx dst_hi = gen_highpart (word_mode, dst);

	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
	  return;
	}
    }

  aarch64_split_double_move (dst, src, word_mode);
}

/* Return true if we should split a move from 128-bit value SRC
   to 128-bit register DEST.  */

bool
aarch64_split_128bit_move_p (rtx dst, rtx src)
{
  if (FP_REGNUM_P (REGNO (dst)))
    return REG_P (src) && !FP_REGNUM_P (REGNO (src));
  /* All moves to GPRs need to be split.  */
  return true;
}

/* Split a complex SIMD move.  */

void
aarch64_split_simd_move (rtx dst, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_mode = GET_MODE (dst);

  gcc_assert (VECTOR_MODE_P (dst_mode));

  if (REG_P (dst) && REG_P (src))
    {
      gcc_assert (VECTOR_MODE_P (src_mode));
      emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
    }
}
/* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
   The semantics are those of svreinterpret rather than those of subregs;
   see the comment at the head of aarch64-sve.md for details about the
   difference.  */

rtx
aarch64_sve_reinterpret (machine_mode mode, rtx x)
{
  if (GET_MODE (x) == mode)
    return x;

  /* can_change_mode_class must only return true if subregs and svreinterprets
     have the same semantics.  */
  if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
    return force_lowpart_subreg (mode, x, GET_MODE (x));

  rtx res = gen_reg_rtx (mode);
  x = force_reg (GET_MODE (x), x);
  emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
  return res;
}

bool
aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
			      machine_mode ymode, rtx y)
{
  rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
  gcc_assert (r != NULL);
  return rtx_equal_p (x, r);
}

/* Return TARGET if it is nonnull and a register of mode MODE.
   Otherwise, return a fresh register of mode MODE if we can,
   or TARGET reinterpreted as MODE if we can't.  */

static rtx
aarch64_target_reg (rtx target, machine_mode mode)
{
  if (target && REG_P (target) && GET_MODE (target) == mode)
    return target;
  if (!can_create_pseudo_p ())
    {
      gcc_assert (target);
      return gen_lowpart (mode, target);
    }
  return gen_reg_rtx (mode);
}

/* Return a register that contains the constant in BUILDER, given that
   the constant is a legitimate move operand.  Use TARGET as the register
   if it is nonnull and convenient.  */

static rtx
aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
{
  rtx src = builder.build ();
  target = aarch64_target_reg (target, GET_MODE (src));
  emit_insn (gen_rtx_SET (target, src));
  return target;
}
static rtx
aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
{
  if (can_create_pseudo_p ())
    return force_reg (mode, value);
  else
    {
      gcc_assert (x);
      aarch64_emit_move (x, value);
      return x;
    }
}

/* Return true if predicate value X is a constant in which every element
   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
   value, i.e. as a predicate in which all bits are significant.  */

static bool
aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
{
  if (!CONST_VECTOR_P (x))
    return false;

  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
					     GET_MODE_NUNITS (GET_MODE (x)));
  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 0; i < nelts; ++i)
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
      if (!CONST_INT_P (elt))
	return false;

      builder.quick_push (elt);
      for (unsigned int j = 1; j < factor; ++j)
	builder.quick_push (const0_rtx);
    }
  builder.finalize ();
  return true;
}
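
/* For example, a VNx8BImode constant (one significant bit per .H
   element) has FACTOR == 2 above, so each original element is pushed
   followed by one zero, giving the equivalent VNx16BImode value in
   which every bit is significant.  */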
/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
   widest predicate element size it can have (that is, the largest size
   for which each element would still be 0 or 1).  */

unsigned int
aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
{
  /* Start with the most optimistic assumption: that we only need
     one bit per pattern.  This is what we will use if only the first
     bit in each pattern is ever set.  */
  unsigned int mask = GET_MODE_SIZE (DImode);
  mask |= builder.npatterns ();

  /* Look for set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 1; i < nelts; ++i)
    if (INTVAL (builder.elt (i)) != 0)
      {
	if (i & 1)
	  return 1;
	mask |= i;
      }
  return mask & -mask;
}

/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
   return that predicate mode, otherwise return opt_machine_mode ().  */

opt_machine_mode
aarch64_ptrue_all_mode (rtx x)
{
  gcc_assert (GET_MODE (x) == VNx16BImode);
  if (!CONST_VECTOR_P (x)
      || !CONST_VECTOR_DUPLICATE_P (x)
      || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
      || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
    return opt_machine_mode ();

  unsigned int nelts = const_vector_encoded_nelts (x);
  for (unsigned int i = 1; i < nelts; ++i)
    if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
      return opt_machine_mode ();

  return aarch64_sve_pred_mode (nelts);
}
/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
   that the constant would have with predicate element size ELT_SIZE
   (ignoring the upper bits in each element) and return:

   * -1 if all bits are set
   * N if the predicate has N leading set bits followed by all clear bits
   * 0 if the predicate does not have any of these forms.  */

int
aarch64_partial_ptrue_length (rtx_vector_builder &builder,
			      unsigned int elt_size)
{
  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
     followed by set bits.  */
  if (builder.nelts_per_pattern () == 3)
    return 0;

  /* Skip over leading set bits.  */
  unsigned int nelts = builder.encoded_nelts ();
  unsigned int i = 0;
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) == 0)
      break;
  unsigned int vl = i / elt_size;

  /* Check for the all-true case.  */
  if (i == nelts)
    return -1;

  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
     repeating pattern of set bits followed by clear bits.  */
  if (builder.nelts_per_pattern () != 2)
    return 0;

  /* We have a "foreground" value and a duplicated "background" value.
     If the background might repeat and the last set bit belongs to it,
     we might have set bits followed by clear bits followed by set bits.  */
  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
    return 0;

  /* Make sure that the rest are all clear.  */
  for (; i < nelts; i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      return 0;

  return vl;
}
/* See if there is an svpattern that encodes an SVE predicate of mode
   PRED_MODE in which the first VL bits are set and the rest are clear.
   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
   A VL of -1 indicates an all-true vector.  */

aarch64_svpattern
aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
{
  if (vl < 0)
    return AARCH64_SV_ALL;

  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
    return AARCH64_NUM_SVPATTERNS;

  if (vl >= 1 && vl <= 8)
    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));

  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));

  int max_vl;
  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
    {
      if (vl == (max_vl / 3) * 3)
	return AARCH64_SV_MUL3;
      /* These would only trigger for non-power-of-2 lengths.  */
      if (vl == (max_vl & -4))
	return AARCH64_SV_MUL4;
      if (vl == (1 << floor_log2 (max_vl)))
	return AARCH64_SV_POW2;
      if (vl == max_vl)
	return AARCH64_SV_ALL;
    }
  return AARCH64_NUM_SVPATTERNS;
}
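
/* For example, with 256-bit vectors and VNx16BImode (one predicate bit
   per byte element) GET_MODE_NUNITS is 32, so a VL of 30 matches
   (32 / 3) * 3 and maps to AARCH64_SV_MUL3, while a VL of 5 maps
   directly to AARCH64_SV_VL5 and a VL of -1 to AARCH64_SV_ALL.  */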
/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
   bits has the lowest bit set and the upper bits clear.  This is the
   VNx16BImode equivalent of a PTRUE for controlling elements of
   ELT_SIZE bytes.  However, because the constant is VNx16BImode,
   all bits are significant, even the upper zeros.  */

rtx
aarch64_ptrue_all (unsigned int elt_size)
{
  rtx_vector_builder builder (VNx16BImode, elt_size, 1);
  builder.quick_push (const1_rtx);
  for (unsigned int i = 1; i < elt_size; ++i)
    builder.quick_push (const0_rtx);
  return builder.build ();
}
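
/* For example, aarch64_ptrue_all (2) builds the repeating constant
   {1, 0, 1, 0, ...}: the VNx16BImode image of a PTRUE that controls
   halfword (.H) elements, with the upper bit of each 2-bit group
   explicitly zero.  */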
/* Return an all-true predicate register of mode MODE.  */

rtx
aarch64_ptrue_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE.  */

rtx
aarch64_pfalse_reg (machine_mode mode)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));
  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
  return gen_lowpart (mode, reg);
}
/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
   for it.  PRED2[0] is the predicate for the instruction whose result
   is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
   for it.  Return true if we can prove that the two predicates are
   equivalent for PTEST purposes; that is, if we can replace PRED2[0]
   with PRED1[0] without changing behavior.  */

bool
aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
{
  machine_mode mode = GET_MODE (pred1[0]);
  gcc_assert (aarch64_sve_pred_mode_p (mode)
	      && mode == GET_MODE (pred2[0])
	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
	      && aarch64_sve_ptrue_flag (pred2[1], SImode));

  bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
  bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
  return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
   DATA_MODE, and return the result in a predicate of mode PRED_MODE.
   Use TARGET as the target register if nonnull and convenient.  */

static rtx
aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
			  machine_mode data_mode, rtx op1, rtx op2)
{
  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
  expand_operand ops[5];
  create_output_operand (&ops[0], target, pred_mode);
  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
  create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
  create_input_operand (&ops[3], op1, data_mode);
  create_input_operand (&ops[4], op2, data_mode);
  expand_insn (icode, 5, ops);
  return ops[0].value;
}

/* Use a comparison to convert integer vector SRC into MODE, which is
   the corresponding SVE predicate mode.  Use TARGET for the result
   if it's nonnull and convenient.  */

rtx
aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
{
  machine_mode src_mode = GET_MODE (src);
  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
				   src, CONST0_RTX (src_mode));
}
/* Return the assembly token for svprfop value PRFOP.  */

static const char *
svprfop_token (enum aarch64_svprfop prfop)
{
  switch (prfop)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPRFOP (CASE)
#undef CASE
    case AARCH64_NUM_SVPRFOPS:
      break;
    }
  gcc_unreachable ();
}

/* Return the assembly string for an SVE prefetch operation with
   mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
   and that SUFFIX is the format for the remaining operands.  */

char *
aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
			     const char *suffix)
{
  static char buffer[128];
  aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
  unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
				   mnemonic, svprfop_token (prfop), suffix);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Check whether we can calculate the number of elements in PATTERN
   at compile time, given that there are NELTS_PER_VQ elements per
   128-bit block.  Return the value if so, otherwise return -1.  */

HOST_WIDE_INT
aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
{
  unsigned int vl, const_vg;
  if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
    vl = 1 + (pattern - AARCH64_SV_VL1);
  else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
    vl = 16 << (pattern - AARCH64_SV_VL16);
  else if (aarch64_sve_vg.is_constant (&const_vg))
    {
      /* There are two vector granules per quadword.  */
      unsigned int nelts = (const_vg / 2) * nelts_per_vq;
      switch (pattern)
	{
	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
	case AARCH64_SV_MUL4: return nelts & -4;
	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
	case AARCH64_SV_ALL: return nelts;
	default: gcc_unreachable ();
	}
    }
  else
    return -1;

  /* There are two vector granules per quadword.  */
  poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
  if (known_le (vl, nelts_all))
    return vl;

  /* Requesting more elements than are available results in a PFALSE.  */
  if (known_gt (vl, nelts_all))
    return 0;

  return -1;
}
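
/* For example, with -msve-vector-bits=256 (so aarch64_sve_vg == 4),
   AARCH64_SV_ALL with NELTS_PER_VQ == 4 folds to (4 / 2) * 4 == 8,
   and AARCH64_SV_VL16 with the same element size folds to 0 because
   16 elements exceed the 8 that are available.  */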
/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
   by the number of 128-bit quadwords in an SVE vector.  */

static bool
aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
{
  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
  return (IN_RANGE (factor, 2, 16 * 16)
	  && (factor & 1) == 0
	  && factor <= 16 * (factor & -factor));
}

/* Return true if we can move VALUE into a register using a single
   CNT[BHWD] instruction.  */

bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
}

/* Likewise for rtx X.  */

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}
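
/* For example, a factor of 4 (one CNTW), 64 (CNTB with mul #4) or 256
   (CNTB with mul #16) is accepted above, whereas an odd factor such as
   3 has no single CNT[BHWD] encoding and is rejected.  */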
/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  PATTERN is the pattern to use.  FACTOR is the
   number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
   in each quadword.  If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  aarch64_svpattern pattern,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 whereever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (pattern == AARCH64_SV_ALL && factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
			prefix, suffix, operands, svpattern_token (pattern));
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
			prefix, suffix, operands, svpattern_token (pattern),
			factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx; we need to convert this into an "all"
   pattern with a multiplier.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
					   value.coeffs[1], 0);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  CNT_PAT[0..2] are the operands of the
   UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */

char *
aarch64_output_sve_cnt_pat_immediate (const char *prefix,
				      const char *operands, rtx *cnt_pat)
{
  aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
  unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
  unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
  return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
					   factor, nelts_per_vq);
}
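
/* Sample outputs of aarch64_output_sve_cnt_immediate, for PREFIX "cnt"
   and OPERANDS "%x0":
     pattern ALL, FACTOR 2,  NELTS_PER_VQ 0 -> "cntd\t%x0"
     pattern ALL, FACTOR 32, NELTS_PER_VQ 0 -> "cntb\t%x0, all, mul #2"
     pattern VL4, FACTOR 8,  NELTS_PER_VQ 8 -> "cnth\t%x0, vl4".  */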
/* Return true if we can add X using a single SVE INC or DEC instruction.  */

bool
aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && (aarch64_sve_cnt_immediate_p (value)
	      || aarch64_sve_cnt_immediate_p (-value)));
}

/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
   register operand 1.  */

char *
aarch64_output_sve_scalar_inc_dec (rtx offset)
{
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
  if (offset_value.coeffs[1] > 0)
    return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
					     offset_value.coeffs[1], 0);
  else
    return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
					     -offset_value.coeffs[1], 0);
}
/* Return true if a single RDVL instruction can multiply FACTOR by the
   number of 128-bit quadwords in an SVE vector.  This is also the
   range of ADDVL.  */

static bool
aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 16)
	  && IN_RANGE (factor, -32 * 16, 31 * 16));
}

/* Return true if ADDPL can be used to add FACTOR multiplied by the number
   of quadwords in an SVE vector.  */

static bool
aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
{
  return (multiple_p (factor, 2)
	  && IN_RANGE (factor, -32 * 2, 31 * 2));
}

/* Return true if we can move VALUE into a register using a single
   RDVL instruction.  */

static bool
aarch64_sve_rdvl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
}

/* Likewise for rtx X.  */

bool
aarch64_sve_rdvl_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
}

/* Return the asm string for moving RDVL immediate OFFSET into register
   operand 0.  */

char *
aarch64_output_sve_rdvl (rtx offset)
{
  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
  return buffer;
}
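
/* For example, an offset whose poly_int value is (32, 32) -- twice the
   length of an SVE vector in bytes -- has coeffs[1] == 32 and is
   printed as "rdvl\t%x0, #2" by the routine above.  */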
/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  return (aarch64_sve_rdvl_addvl_factor_p (factor)
	  || aarch64_sve_addpl_factor_p (factor));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
   to operand 1 and storing the result in operand 0.  */

char *
aarch64_output_sve_addvl_addpl (rtx offset)
{
  static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
  poly_int64 offset_value = rtx_to_poly_int64 (offset);
  gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));

  int factor = offset_value.coeffs[1];
  if ((factor & 15) == 0)
    snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
  else
    snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
  return buffer;
}
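
/* For example, an offset of (-16, -16) -- minus one vector length in
   bytes -- is printed as "addvl\t%x0, %x1, #-1" above, while (6, 6)
   -- three predicate lengths -- uses "addpl\t%x0, %x1, #3".  */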
3991 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3992    instruction.  If it is, store the number of elements in each vector
3993    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3994    factor in *FACTOR_OUT (if nonnull).  */
3997 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3998                                         unsigned int *nelts_per_vq_out)
4003   if (!const_vec_duplicate_p (x, &elt)
4004       || !poly_int_rtx_p (elt, &value))
4007   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4008   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4009     /* There's no vector INCB.  */
4012   HOST_WIDE_INT factor = value.coeffs[0];
4013   if (value.coeffs[1] != factor)
4016   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
4017   if ((factor % nelts_per_vq) != 0
4018       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4022     *factor_out = factor;
4023   if (nelts_per_vq_out)
4024     *nelts_per_vq_out = nelts_per_vq;
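/* For example, a duplicated element whose value is 16 times the number of
   quadwords, in a vector of 16-bit elements, gives NELTS_PER_VQ == 8 and
   FACTOR == 16; that combination corresponds to an "inch ..., all, mul #2"
   style increment (2 * CNTH == 16 per quadword).  */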
4028 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4032 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4034   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4037 /* Return the asm template for an SVE vector INC or DEC instruction.
4038    OPERANDS gives the operands before the vector count and X is the
4039    value of the vector count operand itself.  */
4042 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4045   unsigned int nelts_per_vq;
4046   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4049     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4050                                              -factor, nelts_per_vq);
4052     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4053                                              factor, nelts_per_vq);
4056 /* Return a constant that represents FACTOR multiplied by the
4057    number of 128-bit quadwords in an SME vector.  ISA_MODE is the
4058    ISA mode in which the calculation is being performed.  */
4061 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
4062                           aarch64_isa_mode isa_mode)
4064   gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
4065   if (isa_mode & AARCH64_ISA_MODE_SM_ON)
4066     /* We're in streaming mode, so we can use normal poly-int values.  */
4067     return gen_int_mode ({ factor, factor }, mode);
4069   rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
4070   rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
4071   return gen_rtx_CONST (mode, unspec);
4074 /* Return true if X is a constant that represents some number X
4075    multiplied by the number of quadwords in an SME vector.  Store this X
4076    in *FACTOR if so.  */
4079 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
4081   if (!TARGET_SME || GET_CODE (x) != CONST)
4085   if (GET_CODE (x) != UNSPEC
4086       || XINT (x, 1) != UNSPEC_SME_VQ
4087       || XVECLEN (x, 0) != 1)
4090   x = XVECEXP (x, 0, 0);
4091   if (!CONST_INT_P (x))
4094   *factor = INTVAL (x);
4098 /* Return true if X is a constant that represents some number Y
4099    multiplied by the number of quadwords in an SME vector, and if
4100    that Y is in the range of RDSVL.  */
4103 aarch64_rdsvl_immediate_p (const_rtx x)
4105   HOST_WIDE_INT factor;
4106   return (aarch64_sme_vq_unspec_p (x, &factor)
4107           && aarch64_sve_rdvl_addvl_factor_p (factor));
4110 /* Return the asm string for an RDSVL instruction that calculates X,
4111    which is a constant that satisfies aarch64_rdsvl_immediate_p.  */
4114 aarch64_output_rdsvl (const_rtx x)
4116   gcc_assert (aarch64_rdsvl_immediate_p (x));
4117   static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
4118   x = XVECEXP (XEXP (x, 0), 0, 0);
4119   snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
4120             (int) INTVAL (x) / 16);
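/* For example, a wrapped factor of 32 is printed as "rdsvl\t%x0, #2":
   RDSVL scales its immediate by the streaming vector length in bytes,
   which is 16 for every quadword of SVL.  */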
4124 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL.  */
4127 aarch64_addsvl_addspl_immediate_p (const_rtx x)
4129   HOST_WIDE_INT factor;
4130   return (aarch64_sme_vq_unspec_p (x, &factor)
4131           && (aarch64_sve_rdvl_addvl_factor_p (factor)
4132               || aarch64_sve_addpl_factor_p (factor)));
4135 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4136    Return the asm string for the associated instruction.  */
4139 aarch64_output_addsvl_addspl (rtx x)
4141   static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4142   HOST_WIDE_INT factor;
4143   if (!aarch64_sme_vq_unspec_p (x, &factor))
4145   if (aarch64_sve_rdvl_addvl_factor_p (factor))
4146     snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4148   else if (aarch64_sve_addpl_factor_p (factor))
4149     snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4156 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
4158 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4160   0x0000000100000001ull,
4161   0x0001000100010001ull,
4162   0x0101010101010101ull,
4163   0x1111111111111111ull,
4164   0x5555555555555555ull,
4169 /* Return true if 64-bit VAL is a valid bitmask immediate.  */
4171 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4173   unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4176   /* Check for a single sequence of one bits and return quickly if so.
4177      The special cases of all ones and all zeroes return false.  */
4178   tmp = val + (val & -val);
4180   if (tmp == (tmp & -tmp))
4181     return (val + 1) > 1;
4183   /* Invert if the immediate doesn't start with a zero bit - this means we
4184      only need to search for sequences of one bits.  */
4188   /* Find the first set bit and set tmp to val with the first sequence of one
4189      bits removed.  Return success if there is a single sequence of ones.  */
4190   first_one = val & -val;
4191   tmp = val & (val + first_one);
4196   /* Find the next set bit and compute the difference in bit position.  */
4197   next_one = tmp & -tmp;
4198   bits = clz_hwi (first_one) - clz_hwi (next_one);
4201   /* Check the bit position difference is a power of 2, and that the first
4202      sequence of one bits fits within 'bits' bits.  */
4203   if ((mask >> bits) != 0 || bits != (bits & -bits))
4206   /* Check the sequence of one bits is repeated 64/bits times.  */
4207   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
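/* Worked example: VAL == 0x00ff00ff00ff00ff is accepted.  The lowest run
   of ones is 8 bits wide and the runs repeat every 16 bits, so the final
   test reduces to 0xff * bitmask_imm_mul[1] (0x0001000100010001) == VAL.
   By contrast, 0x00ff00ff00000000 fails, because the run is not repeated
   across all four 16-bit groups.  */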
4211 /* Return true if VAL is a valid bitmask immediate for MODE.  */
4213 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4216     return aarch64_bitmask_imm (val);
4219     return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4221   /* Replicate small immediates to fit 64 bits.  */
4222   int size = GET_MODE_UNIT_PRECISION (mode);
4223   val &= (HOST_WIDE_INT_1U << size) - 1;
4224   val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4226   return aarch64_bitmask_imm (val);
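/* For example, HImode VAL == 0x003c is replicated to 0x003c003c003c003c
   (a run of four ones repeating every 16 bits), which the 64-bit check
   above then accepts.  */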
4230 /* Return true if the immediate VAL can be a bitfield immediate
4231    by changing the given MASK bits in VAL to zeroes, ones or bits
4232    from the other half of VAL.  Return the new immediate in VAL2.  */
4234 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4235                        unsigned HOST_WIDE_INT &val2,
4236                        unsigned HOST_WIDE_INT mask)
4239   if (val2 != val && aarch64_bitmask_imm (val2))
4242   if (val2 != val && aarch64_bitmask_imm (val2))
4245   val2 = val | (((val >> 32) | (val << 32)) & mask);
4246   if (val2 != val && aarch64_bitmask_imm (val2))
4248   val2 = val | (((val >> 16) | (val << 48)) & mask);
4249   if (val2 != val && aarch64_bitmask_imm (val2))
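/* For example, VAL == 0x0000ffff0000fffe with MASK == 0xffff: copying the
   masked bits from the other 32-bit half gives VAL2 == 0x0000ffff0000ffff,
   a valid bitmask immediate (16 ones repeating every 32 bits), so the
   check succeeds.  */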
4255 /* Return true if VAL is a valid MOVZ immediate.  */
4257 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4259   return (val >> (ctz_hwi (val) & 48)) < 65536;
4263 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ.  */
4265 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4267   return aarch64_is_movz (val) || aarch64_is_movz (~val)
4268          || aarch64_bitmask_imm (val);
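/* For example, 0x0012000000000000 keeps all of its set bits within the
   16-bit field at bit 48: ctz_hwi gives 49, which is masked down to a
   shift of 48, and 0x12 < 65536, so the value is a MOVZ immediate
   (MOVZ Xd, #0x12, LSL #48).  */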
4272 /* Return true if VAL is an immediate that can be created by a single
4275 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4277   gcc_assert (mode == SImode || mode == DImode);
4282   unsigned HOST_WIDE_INT mask =
4283     (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4285   if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4288   val = (val & mask) | ((val << 32) & ~mask);
4289   return aarch64_bitmask_imm (val);
4294 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4298   unsigned HOST_WIDE_INT val, val2, val3, mask;
4299   int one_match, zero_match;
4302   gcc_assert (mode == SImode || mode == DImode);
4306   if (aarch64_move_imm (val, mode))
4309         emit_insn (gen_rtx_SET (dest, imm));
4313   if ((val >> 32) == 0 || mode == SImode)
4317           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4319             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4320                                        GEN_INT ((val >> 16) & 0xffff)));
4322             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4323                                        GEN_INT ((val >> 16) & 0xffff)));
4328   /* Remaining cases are all for DImode.  */
4331   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4332     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4333   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4334     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4336   /* Try a bitmask immediate and a movk to generate the immediate
4337      in 2 instructions.  */
4339   if (zero_match < 2 && one_match < 2)
4341       for (i = 0; i < 64; i += 16)
4343         if (aarch64_check_bitmask (val, val2, mask << i))
4346           val2 = val & ~(mask << i);
4347           if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4355           emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4356           emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4357                                      GEN_INT ((val >> i) & 0xffff)));
4362       /* Try 2 bitmask immediates which are xor'd together.  */
4363       for (i = 0; i < 64; i += 16)
4365           val2 = (val >> i) & mask;
4368           if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4376           emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4377           emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4383   /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
4384   if (zero_match + one_match == 0)
4386       for (i = 0; i < 48; i += 16)
4387         for (int j = i + 16; j < 64; j += 16)
4388           if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4392               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4393               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4394                                          GEN_INT ((val >> i) & 0xffff)));
4395               emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4396                                          GEN_INT ((val >> j) & 0xffff)));
4401   /* Try shifting and inserting the bottom 32-bits into the top bits.  */
4402   val2 = val & 0xffffffff;
4404   val3 = val2 | (val3 << 32);
4405   for (i = 17; i < 48; i++)
4406     if ((val2 | (val2 << i)) == val)
4410           emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4411           emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4412                                      GEN_INT (val2 >> 16)));
4413           emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4417     else if ((val3 & ~(val3 << i)) == val)
4421           emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4422           emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4423                                      GEN_INT (val2 >> 16)));
4424           emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4431   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4432      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
4433      otherwise skip zero bits.  */
4437   val2 = one_match > zero_match ? ~val : val;
4438   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4441     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4442                                            ? (val | ~(mask << i))
4443                                            : (val & (mask << i)))));
4444   for (i += 16; i < 64; i += 16)
4446       if ((val2 & (mask << i)) == 0)
4449       emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4450                                  GEN_INT ((val >> i) & 0xffff)));
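/* For example, 0x5555555555555554 is not itself a bitmask immediate, but
   aarch64_check_bitmask finds VAL2 == 0x5555555555555555 by copying bits
   from the rotated value, so the constant is built in two instructions:
   a bitmask move of VAL2 followed by MOVK #0x5554 into bits [15:0].  */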
4457 /* Return whether imm is a 128-bit immediate which is simple enough to
4460 aarch64_mov128_immediate (rtx imm)
4462   if (CONST_INT_P (imm))
4465   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4467   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4468   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4470   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4471          + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4475 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4476    a left shift of 0 or 12 bits.  */
4478 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4480   return val < 4096 || (val & 0xfff000) == val;
4483 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4484    that can be created with a left shift of 0 or 12.  */
4485 static HOST_WIDE_INT
4486 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4488   /* Check to see if the value fits in 24 bits, as that is the maximum we can
4489      handle correctly.  */
4490   gcc_assert (val < 0x1000000);
4495   return val & 0xfff000;
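/* For example, 0x123456 is not directly encodable and is clamped to
   0x123000, which can be encoded as #0x123, LSL #12.  */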
4501      X = (X & AND_VAL) | IOR_VAL;
4503    can be implemented using:
4505      MOVK X, #(IOR_VAL >> shift), LSL #shift
4507    Return the shift if so, otherwise return -1.  */
4509 aarch64_movk_shift (const wide_int_ref &and_val,
4510                     const wide_int_ref &ior_val)
4512   unsigned int precision = and_val.get_precision ();
4513   unsigned HOST_WIDE_INT mask = 0xffff;
4514   for (unsigned int shift = 0; shift < precision; shift += 16)
4516       if (and_val == ~mask && (ior_val & mask) == ior_val)
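/* For example, AND_VAL == 0xffffffff0000ffff (i.e. ~(0xffff << 16)) and
   IOR_VAL == 0x12340000 give a shift of 16: the update is then a single
   MOVK X, #0x1234, LSL #16.  */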
4523 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4524    Assumed precondition: VAL_IN is not zero.  */
4526 unsigned HOST_WIDE_INT
4527 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4529   int lowest_bit_set = ctz_hwi (val_in);
4530   int highest_bit_set = floor_log2 (val_in);
4531   gcc_assert (val_in != 0);
4533   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4534           (HOST_WIDE_INT_1U << lowest_bit_set));
4537 /* Create constant where bits outside of lowest bit set to highest bit set
4540 unsigned HOST_WIDE_INT
4541 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4543   return val_in | ~aarch64_and_split_imm1 (val_in);
4546 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
4549 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4551   scalar_int_mode int_mode;
4552   if (!is_a <scalar_int_mode> (mode, &int_mode))
4555   if (aarch64_bitmask_imm (val_in, int_mode))
4558   if (aarch64_move_imm (val_in, int_mode))
4561   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4563   return aarch64_bitmask_imm (imm2, int_mode);
4566 /* Return the number of temporary registers that aarch64_add_offset_1
4567    would need to add OFFSET to a register.  */
4570 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4572   return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4575 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
4576    a non-polynomial OFFSET.  MODE is the mode of the addition.
4577    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4578    be set and CFA adjustments added to the generated instructions.
4580    TEMP1, if nonnull, is a register of mode MODE that can be used as a
4581    temporary if register allocation is already complete.  This temporary
4582    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
4583    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4584    the immediate again.
4586    Since this function may be used to adjust the stack pointer, we must
4587    ensure that it cannot cause transient stack deallocation (for example
4588    by first incrementing SP and then decrementing when adjusting by a
4589    large immediate).  */
4592 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4593                       rtx src, HOST_WIDE_INT offset, rtx temp1,
4594                       bool frame_related_p, bool emit_move_imm)
4596   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4597   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4599   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4604       if (!rtx_equal_p (dest, src))
4606           insn = emit_insn (gen_rtx_SET (dest, src));
4607           RTX_FRAME_RELATED_P (insn) = frame_related_p;
4612   /* Single instruction adjustment.  */
4613   if (aarch64_uimm12_shift (moffset))
4615       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4616       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4620   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4623      a) the offset cannot be loaded by a 16-bit move or
4624      b) there is no spare register into which we can move it.  */
4625   if (moffset < 0x1000000
4626       && ((!temp1 && !can_create_pseudo_p ())
4627           || !aarch64_move_imm (moffset, mode)))
4629       HOST_WIDE_INT low_off = moffset & 0xfff;
4631       low_off = offset < 0 ? -low_off : low_off;
4632       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4633       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4634       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4635       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4639   /* Emit a move immediate if required and an addition/subtraction.  */
4642       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4643       temp1 = aarch64_force_temporary (mode, temp1,
4644                                        gen_int_mode (moffset, mode));
4646   insn = emit_insn (offset < 0
4647                     ? gen_sub3_insn (dest, src, temp1)
4648                     : gen_add3_insn (dest, src, temp1));
4649   if (frame_related_p)
4651       RTX_FRAME_RELATED_P (insn) = frame_related_p;
4652       rtx adj = plus_constant (mode, src, offset);
4653       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
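/* For example, OFFSET == 0x123456 cannot be loaded by a single move
   immediate, so it is added in two steps that each use a valid 12-bit
   immediate: ADD DEST, SRC, #0x456 followed by ADD DEST, DEST, #0x123,
   LSL #12.  Both steps move in the same direction, so the adjustment
   never transiently deallocates stack.  */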
4657 /* Return the number of temporary registers that aarch64_add_offset
4658    would need to move OFFSET into a register or add OFFSET to a register;
4659    ADD_P is true if we want the latter rather than the former.  */
4662 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4664   /* This follows the same structure as aarch64_add_offset.  */
4665   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4668   unsigned int count = 0;
4669   HOST_WIDE_INT factor = offset.coeffs[1];
4670   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4671   poly_int64 poly_offset (factor, factor);
4672   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4673     /* Need one register for the ADDVL/ADDPL result.  */
4675   else if (factor != 0)
4677       factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4678       if (!IN_RANGE (factor, -32, 31))
4679         /* Need one register for the CNT or RDVL result and one for the
4680            multiplication factor.  If necessary, the second temporary
4681            can be reused for the constant part of the offset.  */
4683       /* Need one register for the CNT or RDVL result (which might then
4687   return count + aarch64_add_offset_1_temporaries (constant);
4690 /* If X can be represented as a poly_int64, return the number
4691    of temporaries that are required to add it to a register.
4692    Return -1 otherwise.  */
4695 aarch64_add_offset_temporaries (rtx x)
4698   if (!poly_int_rtx_p (x, &offset))
4700   return aarch64_offset_temporaries (true, offset);
4703 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4704 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4705 be set and CFA adjustments added to the generated instructions.
4707 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4708 temporary if register allocation is already complete. This temporary
4709 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4710 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4711 false to avoid emitting the immediate again.
4713 TEMP2, if nonnull, is a second temporary register that doesn't
4714 overlap either DEST or REG.
4716 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of OFFSET
4717 is measured relative to the SME vector length instead of the current
4718 prevailing vector length. It is 0 otherwise.
4720 Since this function may be used to adjust the stack pointer, we must
4721 ensure that it cannot cause transient stack deallocation (for example
4722 by first incrementing SP and then decrementing when adjusting by a
4723 large immediate). */
4726 aarch64_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4727 poly_int64 offset
, rtx temp1
, rtx temp2
,
4728 aarch64_isa_mode force_isa_mode
,
4729 bool frame_related_p
, bool emit_move_imm
= true)
4731 gcc_assert (emit_move_imm
|| temp1
!= NULL_RTX
);
4732 gcc_assert (temp1
== NULL_RTX
|| !reg_overlap_mentioned_p (temp1
, src
));
4733 gcc_assert (temp1
== NULL_RTX
4735 || !reg_overlap_mentioned_p (temp1
, dest
));
4736 gcc_assert (temp2
== NULL_RTX
|| !reg_overlap_mentioned_p (dest
, temp2
));
4738 /* Try using ADDVL or ADDPL to add the whole value. */
4739 if (src
!= const0_rtx
&& aarch64_sve_addvl_addpl_immediate_p (offset
))
4741 gcc_assert (offset
.coeffs
[0] == offset
.coeffs
[1]);
4743 if (force_isa_mode
== 0)
4744 offset_rtx
= gen_int_mode (offset
, mode
);
4746 offset_rtx
= aarch64_sme_vq_immediate (mode
, offset
.coeffs
[0], 0);
4747 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4748 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
4749 if (frame_related_p
&& (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
))
4750 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4751 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4756 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4757 SVE vector register, over and above the minimum size of 128 bits.
4758 This is equivalent to half the value returned by CNTD with a
4759 vector shape of ALL. */
4760 HOST_WIDE_INT factor
= offset
.coeffs
[1];
4761 HOST_WIDE_INT constant
= offset
.coeffs
[0] - factor
;
4763 /* Try using ADDVL or ADDPL to add the VG-based part. */
4764 poly_int64
poly_offset (factor
, factor
);
4765 if (src
!= const0_rtx
4766 && aarch64_sve_addvl_addpl_immediate_p (poly_offset
))
4769 if (force_isa_mode
== 0)
4770 offset_rtx
= gen_int_mode (poly_offset
, mode
);
4772 offset_rtx
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4773 if (frame_related_p
)
4775 rtx_insn
*insn
= emit_insn (gen_add3_insn (dest
, src
, offset_rtx
));
4776 RTX_FRAME_RELATED_P (insn
) = true;
4777 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4778 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4779 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4785 rtx addr
= gen_rtx_PLUS (mode
, src
, offset_rtx
);
4786 src
= aarch64_force_temporary (mode
, temp1
, addr
);
4791 /* Otherwise use a CNT-based sequence. */
4792 else if (factor
!= 0)
4794 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4795 with negative shifts indicating a shift right. */
4796 HOST_WIDE_INT low_bit
= least_bit_hwi (factor
);
4797 HOST_WIDE_INT rel_factor
= factor
/ low_bit
;
4798 int shift
= exact_log2 (low_bit
) - 4;
4799 gcc_assert (shift
>= -4 && (rel_factor
& 1) != 0);
4801 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4802 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4804 We can avoid a multiplication if REL_FACTOR is in the range
4805 of RDVL, although there are then various optimizations that
4806 we can try on top. */
4807 rtx_code code
= PLUS
;
4809 if (IN_RANGE (rel_factor
, -32, 31))
4811 if (force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4813 /* Try to use an unshifted RDSVL, otherwise fall back on
4814 a shifted RDSVL #1. */
4815 if (aarch64_sve_rdvl_addvl_factor_p (factor
))
4818 factor
= rel_factor
* 16;
4819 val
= aarch64_sme_vq_immediate (mode
, factor
, 0);
4821 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4822 else if (aarch64_sve_cnt_factor_p (factor
)
4823 || aarch64_sve_rdvl_addvl_factor_p (factor
))
4825 val
= gen_int_mode (poly_int64 (factor
, factor
), mode
);
4828 /* Try to subtract an unshifted CNT[BHWD]. */
4829 else if (aarch64_sve_cnt_factor_p (-factor
))
4832 val
= gen_int_mode (poly_int64 (-factor
, -factor
), mode
);
4835 /* If subtraction is free, prefer to load a positive constant.
4836 In the best case this will fit a shifted CNTB. */
4837 else if (src
!= const0_rtx
&& rel_factor
< 0)
4840 val
= gen_int_mode (-rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4842 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4844 val
= gen_int_mode (rel_factor
* BYTES_PER_SVE_VECTOR
, mode
);
4848 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4849 since it should increase the chances of being able to use
4850 a shift and add sequence for the multiplication.
4851 If CNTB << SHIFT is out of range, stick with the current
4853 if (force_isa_mode
== 0
4854 && IN_RANGE (low_bit
, 2, 16 * 16))
4856 val
= gen_int_mode (poly_int64 (low_bit
, low_bit
), mode
);
4859 else if ((force_isa_mode
& AARCH64_ISA_MODE_SM_ON
)
4860 && aarch64_sve_rdvl_addvl_factor_p (low_bit
))
4862 val
= aarch64_sme_vq_immediate (mode
, low_bit
, 0);
4866 val
= gen_int_mode (BYTES_PER_SVE_VECTOR
, mode
);
4868 val
= aarch64_force_temporary (mode
, temp1
, val
);
4870 /* Prefer to multiply by a positive factor and subtract rather
4871 than multiply by a negative factor and add, since positive
4872 values are usually easier to move. */
4873 if (rel_factor
< 0 && src
!= const0_rtx
)
4875 rel_factor
= -rel_factor
;
4879 if (can_create_pseudo_p ())
4881 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4882 val
= expand_mult (mode
, val
, coeff1
, NULL_RTX
, true, true);
4886 rtx coeff1
= gen_int_mode (rel_factor
, mode
);
4887 coeff1
= aarch64_force_temporary (mode
, temp2
, coeff1
);
4888 val
= gen_rtx_MULT (mode
, val
, coeff1
);
4892 /* Multiply by 2 ** SHIFT. */
4895 val
= aarch64_force_temporary (mode
, temp1
, val
);
4896 val
= gen_rtx_ASHIFT (mode
, val
, GEN_INT (shift
));
4900 val
= aarch64_force_temporary (mode
, temp1
, val
);
4901 val
= gen_rtx_ASHIFTRT (mode
, val
, GEN_INT (-shift
));
4904 /* Add the result to SRC or subtract the result from SRC. */
4905 if (src
!= const0_rtx
)
4907 val
= aarch64_force_temporary (mode
, temp1
, val
);
4908 val
= gen_rtx_fmt_ee (code
, mode
, src
, val
);
4910 else if (code
== MINUS
)
4912 val
= aarch64_force_temporary (mode
, temp1
, val
);
4913 val
= gen_rtx_NEG (mode
, val
);
4916 if (constant
== 0 || frame_related_p
)
4918 rtx_insn
*insn
= emit_insn (gen_rtx_SET (dest
, val
));
4919 if (frame_related_p
)
4921 RTX_FRAME_RELATED_P (insn
) = true;
4922 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
4923 gen_rtx_SET (dest
, plus_constant (Pmode
, src
,
4932 src
= aarch64_force_temporary (mode
, temp1
, val
);
4937 emit_move_imm
= true;
4940 aarch64_add_offset_1 (mode
, dest
, src
, constant
, temp1
,
4941 frame_related_p
, emit_move_imm
);
4944 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4945 than a poly_int64. */
4948 aarch64_split_add_offset (scalar_int_mode mode
, rtx dest
, rtx src
,
4949 rtx offset_rtx
, rtx temp1
, rtx temp2
)
4951 aarch64_add_offset (mode
, dest
, src
, rtx_to_poly_int64 (offset_rtx
),
4952 temp1
, temp2
, 0, false);
4955 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4956 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4957 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4958 contains abs (DELTA). */
4961 aarch64_add_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4962 aarch64_isa_mode force_isa_mode
, bool emit_move_imm
)
4964 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, delta
,
4965 temp1
, temp2
, force_isa_mode
, true, emit_move_imm
);
4968 /* Subtract DELTA from the stack pointer, marking the instructions
4969 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4970 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4973 aarch64_sub_sp (rtx temp1
, rtx temp2
, poly_int64 delta
,
4974 aarch64_isa_mode force_isa_mode
,
4975 bool frame_related_p
, bool emit_move_imm
= true)
4977 aarch64_add_offset (Pmode
, stack_pointer_rtx
, stack_pointer_rtx
, -delta
,
4978 temp1
, temp2
, force_isa_mode
, frame_related_p
,
4982 /* A streaming-compatible function needs to switch temporarily to the known
4983 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4984 the runtime state of PSTATE.SM in the streaming-compatible code, before
4985 the start of the switch to LOCAL_MODE.
4987 Emit instructions to branch around the mode switch if PSTATE.SM already
4988 matches LOCAL_MODE. Return the label that the branch jumps to. */
4991 aarch64_guard_switch_pstate_sm (rtx old_svcr
, aarch64_isa_mode local_mode
)
4993 local_mode
&= AARCH64_ISA_MODE_SM_STATE
;
4994 gcc_assert (local_mode
!= 0);
4995 auto already_ok_cond
= (local_mode
& AARCH64_ISA_MODE_SM_ON
? NE
: EQ
);
4996 auto *label
= gen_label_rtx ();
4997 auto branch
= aarch64_gen_test_and_branch (already_ok_cond
, old_svcr
, 0,
4999 auto *jump
= emit_jump_insn (branch
);
5000 JUMP_LABEL (jump
) = label
;
5004 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
5005 state in NEW_MODE. This is known to involve either an SMSTART SM or
5009 aarch64_switch_pstate_sm (aarch64_isa_mode old_mode
, aarch64_isa_mode new_mode
)
5011 old_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5012 new_mode
&= AARCH64_ISA_MODE_SM_STATE
;
5013 gcc_assert (old_mode
!= new_mode
);
5015 if ((new_mode
& AARCH64_ISA_MODE_SM_ON
)
5016 || (!new_mode
&& (old_mode
& AARCH64_ISA_MODE_SM_OFF
)))
5017 emit_insn (gen_aarch64_smstart_sm ());
5019 emit_insn (gen_aarch64_smstop_sm ());
5022 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
5023 FP and predicate registers. This class emits code to preserve any
5024 necessary registers around the mode switch.
5026 The class uses four approaches to saving and restoring contents, enumerated
5029 - GPR: save and restore the contents of FP registers using GPRs.
5030 This is used if the FP register contains no more than 64 significant
5031 bits. The registers used are FIRST_GPR onwards.
5033 - MEM_128: save and restore 128-bit SIMD registers using memory.
5035 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
5037 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
5039 The save slots within each memory group are consecutive, with the
5040 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
5042 There will only be two mode switches for each use of SME, so they should
5043 not be particularly performance-sensitive. It's also rare for SIMD, SVE
5044 or predicate registers to be live across mode switches. We therefore
5045 don't preallocate the save slots but instead allocate them locally on
5046 demand. This makes the code emitted by the class self-contained. */
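/* For instance, a 64-bit return value that is live in V0 across the switch
   can be kept in X10 and moved back afterwards (the GPR group), a full
   128-bit value in V1 gets a MEM_128 stack slot, and a live predicate in
   P0 gets a MEM_SVE_PRED slot below the MEM_SVE_DATA area.  */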
5048 class aarch64_sme_mode_switch_regs
5051 static const unsigned int FIRST_GPR
= R10_REGNUM
;
5053 void add_reg (machine_mode
, unsigned int);
5054 void add_call_args (rtx_call_insn
*);
5055 void add_call_result (rtx_call_insn
*);
5056 void add_call_preserved_reg (unsigned int);
5057 void add_call_preserved_regs (bitmap
);
5059 void emit_prologue ();
5060 void emit_epilogue ();
5062 /* The number of GPRs needed to save FP registers, starting from
5064 unsigned int num_gprs () { return m_group_count
[GPR
]; }
5067 enum sequence
{ PROLOGUE
, EPILOGUE
};
5068 enum group_type
{ GPR
, MEM_128
, MEM_SVE_PRED
, MEM_SVE_DATA
, NUM_GROUPS
};
5070 /* Information about the save location for one FP, SIMD, SVE data, or
5071 SVE predicate register. */
5072 struct save_location
{
5073 /* The register to be saved. */
5076 /* Which group the save location belongs to. */
5079 /* A zero-based index of the register within the group. */
5083 unsigned int sve_data_headroom ();
5084 rtx
get_slot_mem (machine_mode
, poly_int64
);
5085 void emit_stack_adjust (sequence
, poly_int64
);
5086 void emit_mem_move (sequence
, const save_location
&, poly_int64
);
5088 void emit_gpr_moves (sequence
);
5089 void emit_mem_128_moves (sequence
);
5090 void emit_sve_sp_adjust (sequence
);
5091 void emit_sve_pred_moves (sequence
);
5092 void emit_sve_data_moves (sequence
);
5094 /* All save locations, in no particular order. */
5095 auto_vec
<save_location
, 12> m_save_locations
;
5097 /* The number of registers in each group. */
5098 unsigned int m_group_count
[NUM_GROUPS
] = {};
5101 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
5105 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode
, unsigned int regno
)
5107 if (!FP_REGNUM_P (regno
) && !PR_REGNUM_P (regno
))
5110 unsigned int end_regno
= end_hard_regno (mode
, regno
);
5111 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
5112 gcc_assert ((vec_flags
& VEC_STRUCT
) || end_regno
== regno
+ 1);
5113 for (; regno
< end_regno
; regno
++)
5115 /* Force the mode of SVE saves and restores even for single registers.
5116 This is necessary because big-endian targets only allow LDR Z and
5117 STR Z to be used with byte modes. */
5118 machine_mode submode
= mode
;
5119 if (vec_flags
& VEC_SVE_PRED
)
5120 submode
= VNx16BImode
;
5121 else if (vec_flags
& VEC_SVE_DATA
)
5122 submode
= SVE_BYTE_MODE
;
5123 else if (vec_flags
& VEC_STRUCT
)
5125 if (vec_flags
& VEC_PARTIAL
)
5128 submode
= V16QImode
;
5131 loc
.reg
= gen_rtx_REG (submode
, regno
);
5132 if (vec_flags
& VEC_SVE_PRED
)
5134 gcc_assert (PR_REGNUM_P (regno
));
5135 loc
.group
= MEM_SVE_PRED
;
5139 gcc_assert (FP_REGNUM_P (regno
));
5140 if (known_le (GET_MODE_SIZE (submode
), 8))
5142 else if (known_eq (GET_MODE_SIZE (submode
), 16))
5143 loc
.group
= MEM_128
;
5145 loc
.group
= MEM_SVE_DATA
;
5147 loc
.index
= m_group_count
[loc
.group
]++;
5148 m_save_locations
.quick_push (loc
);
5152 /* Record that the arguments to CALL_INSN need to be preserved around
5156 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn
*call_insn
)
5158 for (rtx node
= CALL_INSN_FUNCTION_USAGE (call_insn
);
5159 node
; node
= XEXP (node
, 1))
5161 rtx item
= XEXP (node
, 0);
5162 if (GET_CODE (item
) != USE
)
5164 item
= XEXP (item
, 0);
5167 add_reg (GET_MODE (item
), REGNO (item
));
5171 /* Record that the return value from CALL_INSN (if any) needs to be
5172 preserved around the mode switch. */
5175 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn
*call_insn
)
5177 rtx pat
= PATTERN (call_insn
);
5178 gcc_assert (GET_CODE (pat
) == PARALLEL
);
5179 pat
= XVECEXP (pat
, 0, 0);
5180 if (GET_CODE (pat
) == CALL
)
5182 rtx dest
= SET_DEST (pat
);
5183 if (GET_CODE (dest
) == PARALLEL
)
5184 for (int i
= 0; i
< XVECLEN (dest
, 0); ++i
)
5186 rtx x
= XVECEXP (dest
, 0, i
);
5187 gcc_assert (GET_CODE (x
) == EXPR_LIST
);
5188 rtx reg
= XEXP (x
, 0);
5189 add_reg (GET_MODE (reg
), REGNO (reg
));
5192 add_reg (GET_MODE (dest
), REGNO (dest
));
5195 /* REGNO is a register that is call-preserved under the current function's ABI.
5196 Record that it must be preserved around the mode switch. */
5199 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno
)
5201 if (FP_REGNUM_P (regno
))
5202 switch (crtl
->abi
->id ())
5205 add_reg (VNx16QImode
, regno
);
5208 add_reg (V16QImode
, regno
);
5210 case ARM_PCS_AAPCS64
:
5211 add_reg (DImode
, regno
);
5216 else if (PR_REGNUM_P (regno
))
5217 add_reg (VNx16BImode
, regno
);
5220 /* The hard registers in REGS are call-preserved under the current function's
5221 ABI. Record that they must be preserved around the mode switch. */
5224 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs
)
5228 EXECUTE_IF_SET_IN_BITMAP (regs
, 0, regno
, bi
)
5229 if (HARD_REGISTER_NUM_P (regno
))
5230 add_call_preserved_reg (regno
);
5235 /* Emit code to save registers before the mode switch. */
5238 aarch64_sme_mode_switch_regs::emit_prologue ()
5240 emit_sve_sp_adjust (PROLOGUE
);
5241 emit_sve_pred_moves (PROLOGUE
);
5242 emit_sve_data_moves (PROLOGUE
);
5243 emit_mem_128_moves (PROLOGUE
);
5244 emit_gpr_moves (PROLOGUE
);
5247 /* Emit code to restore registers after the mode switch. */
5250 aarch64_sme_mode_switch_regs::emit_epilogue ()
5252 emit_gpr_moves (EPILOGUE
);
5253 emit_mem_128_moves (EPILOGUE
);
5254 emit_sve_pred_moves (EPILOGUE
);
5255 emit_sve_data_moves (EPILOGUE
);
5256 emit_sve_sp_adjust (EPILOGUE
);
5259 /* The SVE predicate registers are stored below the SVE data registers,
5260 with the predicate save area being padded to a data-register-sized
5261 boundary. Return the size of this padded area as a whole number
5262 of data register slots. */
5265 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5267 return CEIL (m_group_count
[MEM_SVE_PRED
], 8);
5270 /* Return a memory reference of mode MODE to OFFSET bytes from the
5274 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode
,
5277 rtx addr
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
5278 return gen_rtx_MEM (mode
, addr
);
5281 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5284 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq
,
5287 if (seq
== PROLOGUE
)
5289 emit_insn (gen_rtx_SET (stack_pointer_rtx
,
5290 plus_constant (Pmode
, stack_pointer_rtx
, size
)));
5293 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5294 the stack pointer. SEQ chooses between saving and restoring. */
5297 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq
,
5298 const save_location
&loc
,
5301 rtx mem
= get_slot_mem (GET_MODE (loc
.reg
), offset
);
5302 if (seq
== PROLOGUE
)
5303 emit_move_insn (mem
, loc
.reg
);
5305 emit_move_insn (loc
.reg
, mem
);
5308 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5309 saving and restoring. */
5312 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq
)
5314 for (auto &loc
: m_save_locations
)
5315 if (loc
.group
== GPR
)
5317 gcc_assert (loc
.index
< 8);
5318 rtx gpr
= gen_rtx_REG (GET_MODE (loc
.reg
), FIRST_GPR
+ loc
.index
);
5319 if (seq
== PROLOGUE
)
5320 emit_move_insn (gpr
, loc
.reg
);
5322 emit_move_insn (loc
.reg
, gpr
);
5326 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5327 between saving and restoring. */
5330 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq
)
5332 HOST_WIDE_INT count
= m_group_count
[MEM_128
];
5336 auto sp
= stack_pointer_rtx
;
5337 auto sp_adjust
= (seq
== PROLOGUE
? -count
: count
) * 16;
5339 /* Pick a common mode that supports LDR & STR with pre/post-modification
5340 and LDP & STP with pre/post-modification. */
5343 /* An instruction pattern that should be emitted at the end. */
5344 rtx last_pat
= NULL_RTX
;
5346 /* A previous MEM_128 location that hasn't been handled yet. */
5347 save_location
*prev_loc
= nullptr;
5349 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5350 for (auto &loc
: m_save_locations
)
5351 if (loc
.group
== MEM_128
)
5358 gcc_assert (loc
.index
== prev_loc
->index
+ 1);
5360 /* The offset of the base of the save area from the current
5362 HOST_WIDE_INT bias
= 0;
5363 if (prev_loc
->index
== 0 && seq
== PROLOGUE
)
5366 /* Get the two sets in the LDP/STP. */
5368 gen_rtx_REG (mode
, REGNO (prev_loc
->reg
)),
5369 get_slot_mem (mode
, prev_loc
->index
* 16 + bias
),
5370 gen_rtx_REG (mode
, REGNO (loc
.reg
)),
5371 get_slot_mem (mode
, loc
.index
* 16 + bias
)
5373 unsigned int lhs
= (seq
== PROLOGUE
);
5374 rtx set1
= gen_rtx_SET (ops
[lhs
], ops
[1 - lhs
]);
5375 rtx set2
= gen_rtx_SET (ops
[lhs
+ 2], ops
[3 - lhs
]);
5377 /* Combine the sets with any stack allocation/deallocation. */
5379 if (prev_loc
->index
== 0)
5381 rtx plus_sp
= plus_constant (Pmode
, sp
, sp_adjust
);
5382 rtvec vec
= gen_rtvec (3, gen_rtx_SET (sp
, plus_sp
), set1
, set2
);
5383 pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
5385 else if (seq
== PROLOGUE
)
5386 pat
= aarch64_gen_store_pair (ops
[1], ops
[0], ops
[2]);
5388 pat
= aarch64_gen_load_pair (ops
[0], ops
[2], ops
[1]);
5390 /* Queue a deallocation to the end, otherwise emit the
5392 if (seq
== EPILOGUE
&& prev_loc
->index
== 0)
5399 /* Handle any leftover LDR/STR. */
5402 rtx reg
= gen_rtx_REG (mode
, REGNO (prev_loc
->reg
));
5404 if (prev_loc
->index
!= 0)
5405 addr
= plus_constant (Pmode
, sp
, prev_loc
->index
* 16);
5406 else if (seq
== PROLOGUE
)
5408 rtx allocate
= plus_constant (Pmode
, sp
, -count
* 16);
5409 addr
= gen_rtx_PRE_MODIFY (Pmode
, sp
, allocate
);
5413 rtx deallocate
= plus_constant (Pmode
, sp
, count
* 16);
5414 addr
= gen_rtx_POST_MODIFY (Pmode
, sp
, deallocate
);
5416 rtx mem
= gen_rtx_MEM (mode
, addr
);
5417 if (seq
== PROLOGUE
)
5418 emit_move_insn (mem
, reg
);
5420 emit_move_insn (reg
, mem
);
5424 emit_insn (last_pat
);
5427 /* Allocate or deallocate the stack space needed by the SVE groups.
5428 SEQ chooses between allocating and deallocating. */
5431 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq
)
5433 if (unsigned int count
= m_group_count
[MEM_SVE_DATA
] + sve_data_headroom ())
5434 emit_stack_adjust (seq
, count
* BYTES_PER_SVE_VECTOR
);
5437 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5441 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq
)
5443 for (auto &loc
: m_save_locations
)
5444 if (loc
.group
== MEM_SVE_DATA
)
5446 auto index
= loc
.index
+ sve_data_headroom ();
5447 emit_mem_move (seq
, loc
, index
* BYTES_PER_SVE_VECTOR
);
5451 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5455 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq
)
5457 for (auto &loc
: m_save_locations
)
5458 if (loc
.group
== MEM_SVE_PRED
)
5459 emit_mem_move (seq
, loc
, loc
.index
* BYTES_PER_SVE_PRED
);
5462 /* Set DEST to (vec_series BASE STEP). */
5465 aarch64_expand_vec_series (rtx dest
, rtx base
, rtx step
)
5467 machine_mode mode
= GET_MODE (dest
);
5468 scalar_mode inner
= GET_MODE_INNER (mode
);
5470 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5471 if (!aarch64_sve_index_immediate_p (base
))
5472 base
= force_reg (inner
, base
);
5473 if (!aarch64_sve_index_immediate_p (step
))
5474 step
= force_reg (inner
, step
);
5476 emit_set_insn (dest
, gen_rtx_VEC_SERIES (mode
, base
, step
));
5479 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5480 register of mode MODE. Use TARGET for the result if it's nonnull
5483 The two vector modes must have the same element mode. The behavior
5484 is to duplicate architectural lane N of SRC into architectural lanes
5485 N + I * STEP of the result. On big-endian targets, architectural
5486 lane 0 of an Advanced SIMD vector is the last element of the vector
5487 in memory layout, so for big-endian targets this operation has the
5488 effect of reversing SRC before duplicating it. Callers need to
5489 account for this. */
5492 aarch64_expand_sve_dupq (rtx target
, machine_mode mode
, rtx src
)
5494 machine_mode src_mode
= GET_MODE (src
);
5495 gcc_assert (GET_MODE_INNER (mode
) == GET_MODE_INNER (src_mode
));
5496 insn_code icode
= (BYTES_BIG_ENDIAN
5497 ? code_for_aarch64_vec_duplicate_vq_be (mode
)
5498 : code_for_aarch64_vec_duplicate_vq_le (mode
));
5501 expand_operand ops
[3];
5502 create_output_operand (&ops
[i
++], target
, mode
);
5503 create_output_operand (&ops
[i
++], src
, src_mode
);
5504 if (BYTES_BIG_ENDIAN
)
5506 /* Create a PARALLEL describing the reversal of SRC. */
5507 unsigned int nelts_per_vq
= 128 / GET_MODE_UNIT_BITSIZE (mode
);
5508 rtx sel
= aarch64_gen_stepped_int_parallel (nelts_per_vq
,
5509 nelts_per_vq
- 1, -1);
5510 create_fixed_operand (&ops
[i
++], sel
);
5512 expand_insn (icode
, i
, ops
);
5513 return ops
[0].value
;
5516 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5517 the memory image into DEST. Return true on success. */
5520 aarch64_expand_sve_ld1rq (rtx dest
, rtx src
)
5522 src
= force_const_mem (GET_MODE (src
), src
);
5526 /* Make sure that the address is legitimate. */
5527 if (!aarch64_sve_ld1rq_operand_p (src
))
5529 rtx addr
= force_reg (Pmode
, XEXP (src
, 0));
5530 src
= replace_equiv_address (src
, addr
);
5533 machine_mode mode
= GET_MODE (dest
);
5534 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5535 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
5536 emit_insn (gen_aarch64_sve_ld1rq (mode
, dest
, src
, ptrue
));
5540 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5541 by N "background" values. Try to move it into TARGET using:
5543 PTRUE PRED.<T>, VL<N>
5544 MOV TRUE.<T>, #<foreground>
5545 MOV FALSE.<T>, #<background>
5546 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5548 The PTRUE is always a single instruction but the MOVs might need a
5549 longer sequence. If the background value is zero (as it often is),
5550 the sequence can sometimes collapse to a PTRUE followed by a
5551 zero-predicated move.
5553 Return the target on success, otherwise return null. */
5556 aarch64_expand_sve_const_vector_sel (rtx target
, rtx src
)
5558 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src
) == 2);
5560 /* Make sure that the PTRUE is valid. */
5561 machine_mode mode
= GET_MODE (src
);
5562 machine_mode pred_mode
= aarch64_sve_pred_mode (mode
);
5563 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5564 if (aarch64_svpattern_for_vl (pred_mode
, npatterns
)
5565 == AARCH64_NUM_SVPATTERNS
)
5568 rtx_vector_builder
pred_builder (pred_mode
, npatterns
, 2);
5569 rtx_vector_builder
true_builder (mode
, npatterns
, 1);
5570 rtx_vector_builder
false_builder (mode
, npatterns
, 1);
5571 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5573 true_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5574 pred_builder
.quick_push (CONST1_RTX (BImode
));
5576 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5578 false_builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
+ npatterns
));
5579 pred_builder
.quick_push (CONST0_RTX (BImode
));
5581 expand_operand ops
[4];
5582 create_output_operand (&ops
[0], target
, mode
);
5583 create_input_operand (&ops
[1], true_builder
.build (), mode
);
5584 create_input_operand (&ops
[2], false_builder
.build (), mode
);
5585 create_input_operand (&ops
[3], pred_builder
.build (), pred_mode
);
5586 expand_insn (code_for_vcond_mask (mode
, mode
), 4, ops
);
5590 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5591 SVE data mode and isn't a legitimate constant. Use TARGET for the
5592 result if convenient.
5594 The returned register can have whatever mode seems most natural
5595 given the contents of SRC. */
5598 aarch64_expand_sve_const_vector (rtx target
, rtx src
)
5600 machine_mode mode
= GET_MODE (src
);
5601 unsigned int npatterns
= CONST_VECTOR_NPATTERNS (src
);
5602 unsigned int nelts_per_pattern
= CONST_VECTOR_NELTS_PER_PATTERN (src
);
5603 scalar_mode elt_mode
= GET_MODE_INNER (mode
);
5604 unsigned int elt_bits
= GET_MODE_BITSIZE (elt_mode
);
5605 unsigned int container_bits
= aarch64_sve_container_bits (mode
);
5606 unsigned int encoded_bits
= npatterns
* nelts_per_pattern
* container_bits
;
5608 if (nelts_per_pattern
== 1
5609 && encoded_bits
<= 128
5610 && container_bits
!= elt_bits
)
5612 /* We have a partial vector mode and a constant whose full-vector
5613 equivalent would occupy a repeating 128-bit sequence. Build that
5614 full-vector equivalent instead, so that we have the option of
5615 using LD1RQ and Advanced SIMD operations. */
5616 unsigned int repeat
= container_bits
/ elt_bits
;
5617 machine_mode full_mode
= aarch64_full_sve_mode (elt_mode
).require ();
5618 rtx_vector_builder
builder (full_mode
, npatterns
* repeat
, 1);
5619 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5620 for (unsigned int j
= 0; j
< repeat
; ++j
)
5621 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, i
));
5622 target
= aarch64_target_reg (target
, full_mode
);
5623 return aarch64_expand_sve_const_vector (target
, builder
.build ());
5626 if (nelts_per_pattern
== 1 && encoded_bits
== 128)
5628 /* The constant is a duplicated quadword but can't be narrowed
5629 beyond a quadword. Get the memory image of the first quadword
5630 as a 128-bit vector and try using LD1RQ to load it from memory.
5632 The effect for both endiannesses is to load memory lane N into
5633 architectural lanes N + I * STEP of the result. On big-endian
5634 targets, the layout of the 128-bit vector in an Advanced SIMD
5635 register would be different from its layout in an SVE register,
5636 but this 128-bit vector is a memory value only. */
5637 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5638 rtx vq_value
= simplify_gen_subreg (vq_mode
, src
, mode
, 0);
5639 if (vq_value
&& aarch64_expand_sve_ld1rq (target
, vq_value
))
5643 if (nelts_per_pattern
== 1 && encoded_bits
< 128)
5645 /* The vector is a repeating sequence of 64 bits or fewer.
5646 See if we can load them using an Advanced SIMD move and then
5647 duplicate it to fill a vector. This is better than using a GPR
5648 move because it keeps everything in the same register file. */
5649 machine_mode vq_mode
= aarch64_vq_mode (elt_mode
).require ();
5650 rtx_vector_builder
builder (vq_mode
, npatterns
, 1);
5651 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5653 /* We want memory lane N to go into architectural lane N,
5654 so reverse for big-endian targets. The DUP .Q pattern
5655 has a compensating reverse built-in. */
5656 unsigned int srci
= BYTES_BIG_ENDIAN
? npatterns
- i
- 1 : i
;
5657 builder
.quick_push (CONST_VECTOR_ENCODED_ELT (src
, srci
));
5659 rtx vq_src
= builder
.build ();
5660 if (aarch64_simd_valid_immediate (vq_src
, NULL
))
5662 vq_src
= force_reg (vq_mode
, vq_src
);
5663 return aarch64_expand_sve_dupq (target
, mode
, vq_src
);
5666 /* Get an integer representation of the repeating part of Advanced
5667 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5668 which for big-endian targets is lane-swapped wrt a normal
5669 Advanced SIMD vector. This means that for both endiannesses,
5670 memory lane N of SVE vector SRC corresponds to architectural
5671 lane N of a register holding VQ_SRC. This in turn means that
5672 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5673 as a single 128-bit value) and thus that memory lane 0 of SRC is
5674 in the lsb of the integer. Duplicating the integer therefore
5675 ensures that memory lane N of SRC goes into architectural lane
5676 N + I * INDEX of the SVE register. */
5677 scalar_mode int_mode
= int_mode_for_size (encoded_bits
, 0).require ();
5678 rtx elt_value
= simplify_gen_subreg (int_mode
, vq_src
, vq_mode
, 0);
5681 /* Pretend that we had a vector of INT_MODE to start with. */
5682 elt_mode
= int_mode
;
5683 mode
= aarch64_full_sve_mode (int_mode
).require ();
5685 /* If the integer can be moved into a general register by a
5686 single instruction, do that and duplicate the result. */
5687 if (CONST_INT_P (elt_value
)
5688 && aarch64_move_imm (INTVAL (elt_value
),
5689 encoded_bits
<= 32 ? SImode
: DImode
))
5691 elt_value
= force_reg (elt_mode
, elt_value
);
5692 return expand_vector_broadcast (mode
, elt_value
);
5695 else if (npatterns
== 1)
5696 /* We're duplicating a single value, but can't do better than
5697 force it to memory and load from there. This handles things
5698 like symbolic constants. */
5699 elt_value
= CONST_VECTOR_ENCODED_ELT (src
, 0);
5703 /* Load the element from memory if we can, otherwise move it into
5704 a register and use a DUP. */
5705 rtx op
= force_const_mem (elt_mode
, elt_value
);
5707 op
= force_reg (elt_mode
, elt_value
);
5708 return expand_vector_broadcast (mode
, op
);
5712 /* Try using INDEX. */
5714 if (const_vec_series_p (src
, &base
, &step
))
5716 aarch64_expand_vec_series (target
, base
, step
);
5720 /* From here on, it's better to force the whole constant to memory
5722 if (GET_MODE_NUNITS (mode
).is_constant ())
5725 if (nelts_per_pattern
== 2)
5726 if (rtx res
= aarch64_expand_sve_const_vector_sel (target
, src
))
5729 /* Expand each pattern individually. */
5730 gcc_assert (npatterns
> 1);
5731 rtx_vector_builder builder
;
5732 auto_vec
<rtx
, 16> vectors (npatterns
);
5733 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5735 builder
.new_vector (mode
, 1, nelts_per_pattern
);
5736 for (unsigned int j
= 0; j
< nelts_per_pattern
; ++j
)
5737 builder
.quick_push (CONST_VECTOR_ELT (src
, i
+ j
* npatterns
));
5738 vectors
.quick_push (force_reg (mode
, builder
.build ()));
5741 /* Use permutes to interleave the separate vectors. */
5742 while (npatterns
> 1)
5745 for (unsigned int i
= 0; i
< npatterns
; ++i
)
5747 rtx tmp
= (npatterns
== 1 ? target
: gen_reg_rtx (mode
));
5748 rtvec v
= gen_rtvec (2, vectors
[i
], vectors
[i
+ npatterns
]);
5749 emit_set_insn (tmp
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
5753 gcc_assert (vectors
[0] == target
);
5757 /* Use WHILE to set a predicate register of mode MODE in which the first
5758 VL bits are set and the rest are clear. Use TARGET for the register
5759 if it's nonnull and convenient. */
5762 aarch64_sve_move_pred_via_while (rtx target
, machine_mode mode
,
5765 rtx limit
= force_reg (DImode
, gen_int_mode (vl
, DImode
));
5766 target
= aarch64_target_reg (target
, mode
);
5767 emit_insn (gen_while (UNSPEC_WHILELO
, DImode
, mode
,
5768 target
, const0_rtx
, limit
));
5773 aarch64_expand_sve_const_pred_1 (rtx
, rtx_vector_builder
&, bool);
5775 /* BUILDER is a constant predicate in which the index of every set bit
5776 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5777 by inverting every element at a multiple of ELT_SIZE and EORing the
5778 result with an ELT_SIZE PTRUE.
5780 Return a register that contains the constant on success, otherwise
5781 return null. Use TARGET as the register if it is nonnull and
5785 aarch64_expand_sve_const_pred_eor (rtx target
, rtx_vector_builder
&builder
,
5786 unsigned int elt_size
)
5788 /* Invert every element at a multiple of ELT_SIZE, keeping the
5790 rtx_vector_builder
inv_builder (VNx16BImode
, builder
.npatterns (),
5791 builder
.nelts_per_pattern ());
5792 for (unsigned int i
= 0; i
< builder
.encoded_nelts (); ++i
)
5793 if ((i
& (elt_size
- 1)) == 0 && INTVAL (builder
.elt (i
)) == 0)
5794 inv_builder
.quick_push (const1_rtx
);
5796 inv_builder
.quick_push (const0_rtx
);
5797 inv_builder
.finalize ();
5799 /* See if we can load the constant cheaply. */
5800 rtx inv
= aarch64_expand_sve_const_pred_1 (NULL_RTX
, inv_builder
, false);
5804 /* EOR the result with an ELT_SIZE PTRUE. */
5805 rtx mask
= aarch64_ptrue_all (elt_size
);
5806 mask
= force_reg (VNx16BImode
, mask
);
5807 inv
= gen_lowpart (VNx16BImode
, inv
);
5808 target
= aarch64_target_reg (target
, VNx16BImode
);
5809 emit_insn (gen_aarch64_pred_z (XOR
, VNx16BImode
, target
, mask
, inv
, mask
));
/* BUILDER is a constant predicate in which the index of every set bit
   is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
   using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
   register on success, otherwise return null.  Use TARGET as the register
   if nonnull and convenient.  */

static rtx
aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
                                   unsigned int elt_size,
                                   unsigned int permute_size)
{
  /* We're going to split the constant into two new constants A and B,
     with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
     and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:

     A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
     B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }

     where _ indicates elements that will be discarded by the permute.

     First calculate the ELT_SIZEs for A and B.  */
  unsigned int a_elt_size = GET_MODE_SIZE (DImode);
  unsigned int b_elt_size = GET_MODE_SIZE (DImode);
  for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
    if (INTVAL (builder.elt (i)) != 0)
      {
        if (i & permute_size)
          b_elt_size |= i - permute_size;
        else
          a_elt_size |= i;
      }
  a_elt_size &= -a_elt_size;
  b_elt_size &= -b_elt_size;

  /* Now construct the vectors themselves.  */
  rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
                                builder.nelts_per_pattern ());
  unsigned int nelts = builder.encoded_nelts ();
  for (unsigned int i = 0; i < nelts; ++i)
    if (i & (elt_size - 1))
      {
        a_builder.quick_push (const0_rtx);
        b_builder.quick_push (const0_rtx);
      }
    else if ((i & permute_size) == 0)
      {
        /* The A and B elements are significant.  */
        a_builder.quick_push (builder.elt (i));
        b_builder.quick_push (builder.elt (i + permute_size));
      }
    else
      {
        /* The A and B elements are going to be discarded, so pick whatever
           is likely to give a nice constant.  We are targeting element
           sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
           with the aim of each being a sequence of ones followed by
           a sequence of zeros.  So:

           * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
             duplicate the last X_ELT_SIZE element, to extend the
             current sequence of ones or zeros.

           * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
             zero, so that the constant really does have X_ELT_SIZE and
             not a smaller size.  */
        if (a_elt_size > permute_size)
          a_builder.quick_push (const0_rtx);
        else
          a_builder.quick_push (a_builder.elt (i - a_elt_size));
        if (b_elt_size > permute_size)
          b_builder.quick_push (const0_rtx);
        else
          b_builder.quick_push (b_builder.elt (i - b_elt_size));
      }
  a_builder.finalize ();
  b_builder.finalize ();

  /* Try loading A into a register.  */
  rtx_insn *last = get_last_insn ();
  rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
  if (!a)
    return NULL_RTX;

  /* Try loading B into a register.  */
  rtx b = a;
  if (a_builder != b_builder)
    {
      b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
      if (!b)
        {
          delete_insns_since (last);
          return NULL_RTX;
        }
    }

  /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
     operands but permutes them as though they had mode MODE.  */
  machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
  target = aarch64_target_reg (target, GET_MODE (a));
  rtx type_reg = CONST0_RTX (mode);
  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
  return target;
}
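/* Illustrative sketch, not part of the GCC sources: the split above can
   be modelled on plain arrays.  With ELT_SIZE == 1 and PERMUTE_SIZE == 4,
   the significant elements of a 16-element constant ORIG are distributed as

     for (unsigned int i = 0; i < 16; i += 8)
       for (unsigned int j = 0; j < 4; ++j)
         {
           a[i + j] = orig[i + j];       // chunks 0-3, 8-11, ...
           b[i + j] = orig[i + j + 4];   // chunks 4-7, 12-15, ...
         }

   and the closing TRN1 (at .S granularity, i.e. 4-byte data chunks)
   interleaves the even-numbered chunks of A and B:

     result = { a[0..3], b[0..3], a[8..11], b[8..11], ... }

   which restores the original order.  The don't-care chunks are chosen so
   that A and B individually have a good chance of being loadable by the
   simpler PTRUE/EOR methods.  */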
/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
   constant in BUILDER into an SVE predicate register.  Return the register
   on success, otherwise return null.  Use TARGET for the register if
   nonnull and convenient.

   ALLOW_RECURSE_P is true if we can use methods that would call this
   function recursively.  */

static rtx
aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
                                 bool allow_recurse_p)
{
  if (builder.encoded_nelts () == 1)
    /* A PFALSE or a PTRUE .B ALL.  */
    return aarch64_emit_set_immediate (target, builder);

  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
    {
      /* If we can load the constant using PTRUE, use it as-is.  */
      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
        return aarch64_emit_set_immediate (target, builder);

      /* Otherwise use WHILE to set the first VL bits.  */
      return aarch64_sve_move_pred_via_while (target, mode, vl);
    }

  if (!allow_recurse_p)
    return NULL_RTX;

  /* Try inverting the vector in element size ELT_SIZE and then EORing
     the result with an ELT_SIZE PTRUE.  */
  if (INTVAL (builder.elt (0)) == 0)
    if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
                                                     elt_size))
      return res;

  /* Try using TRN1 to permute two simpler constants.  */
  for (unsigned int i = elt_size; i <= 8; i *= 2)
    if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
                                                     elt_size, i))
      return res;

  return NULL_RTX;
}
/* Return an SVE predicate register that contains the VNx16BImode
   constant in BUILDER, without going through the move expanders.

   The returned register can have whatever mode seems most natural
   given the contents of BUILDER.  Use TARGET for the result if
   convenient.  */

static rtx
aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
{
  /* Try loading the constant using pure predicate operations.  */
  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
    return res;

  /* Try forcing the constant to memory.  */
  if (builder.full_nelts ().is_constant ())
    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
      {
        target = aarch64_target_reg (target, VNx16BImode);
        emit_move_insn (target, mem);
        return target;
      }

  /* The last resort is to load the constant as an integer and then
     compare it against zero.  Use -1 for set bits in order to increase
     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
                                  builder.nelts_per_pattern ());
  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
    int_builder.quick_push (INTVAL (builder.elt (i))
                            ? constm1_rtx : const0_rtx);
  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
                                           int_builder.build ());
}
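/* Illustrative sketch, not part of the GCC sources: the last-resort path
   above materialises the predicate from a byte vector of 0 and -1 values
   and a compare under an all-true predicate, roughly:

       mov      z0.s, #0xffff          // bytes { -1, -1, 0, 0, ... } via DUPM
       ptrue    p1.b
       cmpne    p0.b, p1/z, z0.b, #0   // set predicate bits where byte != 0

   The constant shown is only an example; using -1 rather than 1 for the
   set bytes gives the vector constant a better chance of matching a DUPM
   bitmask immediate or an Advanced SIMD byte mask.  */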
6001 /* Set DEST to immediate IMM. */
6004 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
6006 machine_mode mode
= GET_MODE (dest
);
6008 /* Check on what type of symbol it is. */
6009 scalar_int_mode int_mode
;
6010 if ((SYMBOL_REF_P (imm
)
6011 || LABEL_REF_P (imm
)
6012 || GET_CODE (imm
) == CONST
6013 || GET_CODE (imm
) == CONST_POLY_INT
)
6014 && is_a
<scalar_int_mode
> (mode
, &int_mode
))
6018 HOST_WIDE_INT const_offset
;
6019 enum aarch64_symbol_type sty
;
6021 /* If we have (const (plus symbol offset)), separate out the offset
6022 before we start classifying the symbol. */
6023 rtx base
= strip_offset (imm
, &offset
);
6025 /* We must always add an offset involving VL separately, rather than
6026 folding it into the relocation. */
6027 if (!offset
.is_constant (&const_offset
))
6031 aarch64_report_sve_required ();
6034 if (base
== const0_rtx
6035 && (aarch64_sve_cnt_immediate_p (offset
)
6036 || aarch64_sve_rdvl_immediate_p (offset
)))
6037 emit_insn (gen_rtx_SET (dest
, imm
));
6040 /* Do arithmetic on 32-bit values if the result is smaller
6042 if (partial_subreg_p (int_mode
, SImode
))
6044 /* It is invalid to do symbol calculations in modes
6045 narrower than SImode. */
6046 gcc_assert (base
== const0_rtx
);
6047 dest
= gen_lowpart (SImode
, dest
);
6050 if (base
!= const0_rtx
)
6052 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6053 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6054 NULL_RTX
, NULL_RTX
, 0, false);
6057 aarch64_add_offset (int_mode
, dest
, base
, offset
,
6058 dest
, NULL_RTX
, 0, false);
6063 if (aarch64_rdsvl_immediate_p (base
))
6065 /* We could handle non-constant offsets if they are ever
6067 gcc_assert (const_offset
== 0);
6068 emit_insn (gen_rtx_SET (dest
, imm
));
6072 sty
= aarch64_classify_symbol (base
, const_offset
);
6075 case SYMBOL_FORCE_TO_MEM
:
6076 if (int_mode
!= ptr_mode
)
6077 imm
= convert_memory_address (ptr_mode
, imm
);
6079 if (const_offset
!= 0
6080 && targetm
.cannot_force_const_mem (ptr_mode
, imm
))
6082 gcc_assert (can_create_pseudo_p ());
6083 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6084 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6085 NULL_RTX
, NULL_RTX
, 0, false);
6089 mem
= force_const_mem (ptr_mode
, imm
);
6092 /* If we aren't generating PC relative literals, then
6093 we need to expand the literal pool access carefully.
6094 This is something that needs to be done in a number
6095 of places, so could well live as a separate function. */
6096 if (!aarch64_pcrelative_literal_loads
)
6098 gcc_assert (can_create_pseudo_p ());
6099 base
= gen_reg_rtx (ptr_mode
);
6100 aarch64_expand_mov_immediate (base
, XEXP (mem
, 0));
6101 if (ptr_mode
!= Pmode
)
6102 base
= convert_memory_address (Pmode
, base
);
6103 mem
= gen_rtx_MEM (ptr_mode
, base
);
6106 if (int_mode
!= ptr_mode
)
6107 mem
= gen_rtx_ZERO_EXTEND (int_mode
, mem
);
6109 emit_insn (gen_rtx_SET (dest
, mem
));
6113 case SYMBOL_SMALL_TLSGD
:
6114 case SYMBOL_SMALL_TLSDESC
:
6115 case SYMBOL_SMALL_TLSIE
:
6116 case SYMBOL_SMALL_GOT_28K
:
6117 case SYMBOL_SMALL_GOT_4G
:
6118 case SYMBOL_TINY_GOT
:
6119 case SYMBOL_TINY_TLSIE
:
6120 if (const_offset
!= 0)
6122 gcc_assert(can_create_pseudo_p ());
6123 base
= aarch64_force_temporary (int_mode
, dest
, base
);
6124 aarch64_add_offset (int_mode
, dest
, base
, const_offset
,
6125 NULL_RTX
, NULL_RTX
, 0, false);
6130 case SYMBOL_SMALL_ABSOLUTE
:
6131 case SYMBOL_TINY_ABSOLUTE
:
6132 case SYMBOL_TLSLE12
:
6133 case SYMBOL_TLSLE24
:
6134 case SYMBOL_TLSLE32
:
6135 case SYMBOL_TLSLE48
:
6136 aarch64_load_symref_appropriately (dest
, imm
, sty
);
6144 if (!CONST_INT_P (imm
))
6146 if (aarch64_sve_pred_mode_p (mode
))
6148 /* Only the low bit of each .H, .S and .D element is defined,
6149 so we can set the upper bits to whatever we like. If the
6150 predicate is all-true in MODE, prefer to set all the undefined
6151 bits as well, so that we can share a single .B predicate for
6153 if (imm
== CONSTM1_RTX (mode
))
6154 imm
= CONSTM1_RTX (VNx16BImode
);
6156 /* All methods for constructing predicate modes wider than VNx16BI
6157 will set the upper bits of each element to zero. Expose this
6158 by moving such constants as a VNx16BI, so that all bits are
6159 significant and so that constants for different modes can be
6160 shared. The wider constant will still be available as a
6162 rtx_vector_builder builder
;
6163 if (aarch64_get_sve_pred_bits (builder
, imm
))
6165 rtx res
= aarch64_expand_sve_const_pred (dest
, builder
);
6167 emit_move_insn (dest
, gen_lowpart (mode
, res
));
6172 if (GET_CODE (imm
) == HIGH
6173 || aarch64_simd_valid_immediate (imm
, NULL
))
6175 emit_insn (gen_rtx_SET (dest
, imm
));
6179 if (CONST_VECTOR_P (imm
) && aarch64_sve_data_mode_p (mode
))
6180 if (rtx res
= aarch64_expand_sve_const_vector (dest
, imm
))
6183 emit_insn (gen_aarch64_sve_reinterpret (mode
, dest
, res
));
6187 rtx mem
= force_const_mem (mode
, imm
);
6189 emit_move_insn (dest
, mem
);
6193 aarch64_internal_mov_immediate (dest
, imm
, true, mode
);
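/* Illustrative sketch, not part of the GCC sources: for a typical
   small-code-model symbolic constant such as &foo + 12, the function above
   classifies the symbol and aarch64_load_symref_appropriately splits the
   move into an ADRP/ADD pair, roughly:

       adrp     x0, foo+12             // page of the address
       add      x0, x0, :lo12:foo+12   // low 12 bits of the address

   whereas a plain CONST_INT falls through to
   aarch64_internal_mov_immediate and becomes MOVZ/MOVN/MOVK sequences.
   Register numbers here are purely for illustration.  */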
/* Return the MEM rtx that provides the canary value that should be used
   for stack-smashing protection.  MODE is the mode of the memory.
   For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
   (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
   indicates whether the caller is performing a SET or a TEST operation.  */

rtx
aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
                                  aarch64_salt_type salt_type)
{
  rtx addr;
  if (aarch64_stack_protector_guard == SSP_GLOBAL)
    {
      gcc_assert (MEM_P (decl_rtl));
      addr = XEXP (decl_rtl, 0);
      poly_int64 offset;
      rtx base = strip_offset_and_salt (addr, &offset);
      if (!SYMBOL_REF_P (base))
        return decl_rtl;

      rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
      addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
      addr = gen_rtx_CONST (Pmode, addr);
      addr = plus_constant (Pmode, addr, offset);
    }
  else
    {
      /* Calculate the address from the system register.  */
      rtx salt = GEN_INT (salt_type);
      addr = gen_reg_rtx (mode);
      if (mode == DImode)
        emit_insn (gen_reg_stack_protect_address_di (addr, salt));
      else
        {
          emit_insn (gen_reg_stack_protect_address_si (addr, salt));
          addr = convert_memory_address (Pmode, addr);
        }
      addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
    }
  return gen_rtx_MEM (mode, force_reg (Pmode, addr));
}
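/* Illustrative sketch, not part of the GCC sources: with the system-register
   guard (for example -mstack-protector-guard=sysreg
   -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=16,
   values chosen only for illustration), the MEM returned above typically
   expands to something like:

       mrs      x1, sp_el0             // reg_stack_protect_address_*
       ldr      x2, [x1, #16]          // canary at GUARD_REG + OFFSET

   In the SSP_GLOBAL case the canary is simply loaded from
   __stack_chk_guard; the UNSPEC_SALT_ADDR wrapper keeps the SET and TEST
   addresses distinct so that the canary load cannot be reused between
   the two operations.  */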
/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
   that is known to contain PTRUE.  */

void
aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
{
  expand_operand ops[3];
  machine_mode mode = GET_MODE (dest);
  create_output_operand (&ops[0], dest, mode);
  create_input_operand (&ops[1], pred, GET_MODE (pred));
  create_input_operand (&ops[2], src, mode);
  temporary_volatile_ok v (true);
  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
}
/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
   operand is in memory.  In this case we need to use the predicated LD1
   and ST1 instead of LDR and STR, both for correctness on big-endian
   targets and because LD1 and ST1 support a wider range of addressing modes.
   PRED_MODE is the mode of the predicate.

   See the comment at the head of aarch64-sve.md for details about the
   big-endian handling.  */

void
aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
{
  machine_mode mode = GET_MODE (dest);
  rtx ptrue = aarch64_ptrue_reg (pred_mode);
  if (!register_operand (src, mode)
      && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        aarch64_emit_sve_pred_move (tmp, ptrue, src);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }
  aarch64_emit_sve_pred_move (dest, ptrue, src);
}
6280 /* Called only on big-endian targets. See whether an SVE vector move
6281 from SRC to DEST is effectively a REV[BHW] instruction, because at
6282 least one operand is a subreg of an SVE vector that has wider or
6283 narrower elements. Return true and emit the instruction if so.
6287 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6289 represents a VIEW_CONVERT between the following vectors, viewed
6292 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6293 R1: { [0], [1], [2], [3], ... }
6295 The high part of lane X in R2 should therefore correspond to lane X*2
6296 of R1, but the register representations are:
6299 R2: ...... [1].high [1].low [0].high [0].low
6300 R1: ...... [3] [2] [1] [0]
6302 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6303 We therefore need a reverse operation to swap the high and low values
6306 This is purely an optimization. Without it we would spill the
6307 subreg operand to the stack in one mode and reload it in the
6308 other mode, which has the same effect as the REV. */
6311 aarch64_maybe_expand_sve_subreg_move (rtx dest
, rtx src
)
6313 gcc_assert (BYTES_BIG_ENDIAN
);
6315 /* Do not try to optimize subregs that LRA has created for matched
6316 reloads. These subregs only exist as a temporary measure to make
6317 the RTL well-formed, but they are exempt from the usual
6318 TARGET_CAN_CHANGE_MODE_CLASS rules.
6320 For example, if we have:
6322 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6324 and the constraints require R1 and R2 to be in the same register,
6325 LRA may need to create RTL such as:
6327 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6328 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6329 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6331 which forces both the input and output of the original instruction
6332 to use the same hard register. But for this to work, the normal
6333 rules have to be suppressed on the subreg input, otherwise LRA
6334 would need to reload that input too, meaning that the process
6335 would never terminate. To compensate for this, the normal rules
6336 are also suppressed for the subreg output of the first move.
6337 Ignoring the special case and handling the first move normally
6338 would therefore generate wrong code: we would reverse the elements
6339 for the first subreg but not reverse them back for the second subreg. */
6340 if (SUBREG_P (dest
) && !LRA_SUBREG_P (dest
))
6341 dest
= SUBREG_REG (dest
);
6342 if (SUBREG_P (src
) && !LRA_SUBREG_P (src
))
6343 src
= SUBREG_REG (src
);
6345 /* The optimization handles two single SVE REGs with different element
6349 || aarch64_classify_vector_mode (GET_MODE (dest
)) != VEC_SVE_DATA
6350 || aarch64_classify_vector_mode (GET_MODE (src
)) != VEC_SVE_DATA
6351 || (GET_MODE_UNIT_SIZE (GET_MODE (dest
))
6352 == GET_MODE_UNIT_SIZE (GET_MODE (src
))))
6355 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6356 rtx ptrue
= aarch64_ptrue_reg (VNx16BImode
);
6357 rtx unspec
= gen_rtx_UNSPEC (GET_MODE (dest
), gen_rtvec (2, ptrue
, src
),
6359 emit_insn (gen_rtx_SET (dest
, unspec
));
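/* Illustrative sketch, not part of the GCC sources: on big-endian targets
   the VIEW_CONVERT handled above amounts to a byte reversal within each
   wider element.  For a VNx8HI <-> VNx16QI subreg the split form ends up
   as a single predicated REVB on .H containers, e.g.

       ptrue    p0.b
       revb     z0.h, p0/m, z1.h       // swap the two bytes of each .H lane

   which replaces the spill-in-one-mode/reload-in-the-other sequence the
   comment above describes.  Register numbers are only for illustration.  */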
/* Return a copy of X with mode MODE, without changing its other
   attributes.  Unlike gen_lowpart, this doesn't care whether the
   mode change is valid.  */

static rtx
aarch64_replace_reg_mode (rtx x, machine_mode mode)
{
  if (GET_MODE (x) == mode)
    return x;

  x = shallow_copy_rtx (x);
  set_mode_and_regno (x, mode, REGNO (x));
  return x;
}

/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
   stored in wider integer containers.  */

static unsigned int
aarch64_sve_rev_unspec (machine_mode mode)
{
  switch (GET_MODE_UNIT_SIZE (mode))
    {
    case 1: return UNSPEC_REVB;
    case 2: return UNSPEC_REVH;
    case 4: return UNSPEC_REVW;
    default: gcc_unreachable ();
    }
}
/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
   operands.  */

void
aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
{
  /* Decide which REV operation we need.  The mode with wider elements
     determines the mode of the operands and the mode with the narrower
     elements determines the reverse width.  */
  machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
  machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
    std::swap (mode_with_wider_elts, mode_with_narrower_elts);

  unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);

  /* Get the operands in the appropriate modes and emit the instruction.  */
  ptrue = gen_lowpart (pred_mode, ptrue);
  dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
  src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
  emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
                               dest, ptrue, src));
}
static bool
aarch64_function_ok_for_sibcall (tree, tree exp)
{
  if (crtl->abi->id () != expr_callee_abi (exp).id ())
    return false;

  tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
  if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
    return false;
  for (auto state : { "za", "zt0" })
    if (bool (aarch64_cfun_shared_flags (state))
        != bool (aarch64_fntype_shared_flags (fntype, state)))
      return false;
  return true;
}
6435 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6436 passed in SVE registers. */
6439 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS
*pcum
,
6440 const function_arg_info
&arg
)
6443 machine_mode dummymode
;
6446 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6447 if (arg
.mode
== BLKmode
&& arg
.type
)
6448 size
= int_size_in_bytes (arg
.type
);
6450 /* No frontends can create types with variable-sized modes, so we
6451 shouldn't be asked to pass or return them. */
6452 size
= GET_MODE_SIZE (arg
.mode
).to_constant ();
6454 /* Aggregates are passed by reference based on their size. */
6455 if (arg
.aggregate_type_p ())
6456 size
= int_size_in_bytes (arg
.type
);
6458 /* Variable sized arguments are always returned by reference. */
6462 /* Can this be a candidate to be passed in fp/simd register(s)? */
6463 if (aarch64_vfp_is_call_or_return_candidate (arg
.mode
, arg
.type
,
6464 &dummymode
, &nregs
, NULL
,
6465 !pcum
|| pcum
->silent_p
))
6468 /* Arguments which are variable sized or larger than 2 registers are
6469 passed by reference unless they are a homogenous floating point
6471 return size
> 2 * UNITS_PER_WORD
;
6474 /* Implement TARGET_PASS_BY_REFERENCE. */
6477 aarch64_pass_by_reference (cumulative_args_t pcum_v
,
6478 const function_arg_info
&arg
)
6480 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6483 return aarch64_pass_by_reference_1 (pcum
, arg
);
6485 pure_scalable_type_info pst_info
;
6486 switch (pst_info
.analyze (arg
.type
))
6488 case pure_scalable_type_info::IS_PST
:
6489 if (pcum
&& !pcum
->silent_p
&& !TARGET_SVE
)
6490 /* We can't gracefully recover at this point, so make this a
6492 fatal_error (input_location
, "arguments of type %qT require"
6493 " the SVE ISA extension", arg
.type
);
6495 /* Variadic SVE types are passed by reference. Normal non-variadic
6496 arguments are too if we've run out of registers. */
6498 || pcum
->aapcs_nvrn
+ pst_info
.num_zr () > NUM_FP_ARG_REGS
6499 || pcum
->aapcs_nprn
+ pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6501 case pure_scalable_type_info::DOESNT_MATTER
:
6502 gcc_assert (aarch64_pass_by_reference_1 (pcum
, arg
));
6505 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6506 case pure_scalable_type_info::ISNT_PST
:
6507 return aarch64_pass_by_reference_1 (pcum
, arg
);
/* Return TRUE if VALTYPE is padded to its least significant bits.  */

static bool
aarch64_return_in_msb (const_tree valtype)
{
  machine_mode dummy_mode;
  int dummy_int;

  /* Never happens in little-endian mode.  */
  if (!BYTES_BIG_ENDIAN)
    return false;

  /* Only composite types smaller than or equal to 16 bytes can
     be potentially returned in registers.  */
  if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
      || int_size_in_bytes (valtype) <= 0
      || int_size_in_bytes (valtype) > 16)
    return false;

  /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
     or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
     is always passed/returned in the least significant bits of fp/simd
     register(s).  */
  if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
                                               &dummy_mode, &dummy_int, NULL,
                                               false))
    return false;

  /* Likewise pure scalable types for SVE vector and predicate registers.  */
  pure_scalable_type_info pst_info;
  if (pst_info.analyze_registers (valtype))
    return false;

  return true;
}
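/* Illustrative sketch, not part of the GCC sources: on a big-endian target
   a small composite such as

     struct s { char c[3]; };

   is returned padded towards its least significant bits, so the three
   bytes occupy bits [63:40] of x0 rather than bits [23:0].  The function
   above is what tells the middle end whether that shift is needed; on
   little-endian it always answers false.  */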
6547 /* Implement TARGET_FUNCTION_VALUE.
6548 Define how to find the value returned by a function. */
6551 aarch64_function_value (const_tree type
, const_tree func
,
6552 bool outgoing ATTRIBUTE_UNUSED
)
6557 mode
= TYPE_MODE (type
);
6558 if (INTEGRAL_TYPE_P (type
))
6559 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
6561 pure_scalable_type_info pst_info
;
6562 if (type
&& pst_info
.analyze_registers (type
))
6563 return pst_info
.get_rtx (mode
, V0_REGNUM
, P0_REGNUM
);
6565 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6566 are returned in memory, not by value. */
6567 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6568 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6570 if (aarch64_return_in_msb (type
))
6572 HOST_WIDE_INT size
= int_size_in_bytes (type
);
6574 if (size
% UNITS_PER_WORD
!= 0)
6576 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
6577 mode
= int_mode_for_size (size
* BITS_PER_UNIT
, 0).require ();
6582 machine_mode ag_mode
;
6583 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &count
,
6586 gcc_assert (!sve_p
);
6587 if (!aarch64_composite_type_p (type
, mode
))
6589 gcc_assert (count
== 1 && mode
== ag_mode
);
6590 return gen_rtx_REG (mode
, V0_REGNUM
);
6592 else if (aarch64_advsimd_full_struct_mode_p (mode
)
6593 && known_eq (GET_MODE_SIZE (ag_mode
), 16))
6594 return gen_rtx_REG (mode
, V0_REGNUM
);
6595 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
6596 && known_eq (GET_MODE_SIZE (ag_mode
), 8))
6597 return gen_rtx_REG (mode
, V0_REGNUM
);
6603 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
6604 for (i
= 0; i
< count
; i
++)
6606 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
6607 rtx offset
= gen_int_mode (i
* GET_MODE_SIZE (ag_mode
), Pmode
);
6608 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
6609 XVECEXP (par
, 0, i
) = tmp
;
6618 /* Vector types can acquire a partial SVE mode using things like
6619 __attribute__((vector_size(N))), and this is potentially useful.
6620 However, the choice of mode doesn't affect the type's ABI
6621 identity, so we should treat the types as though they had
6622 the associated integer mode, just like they did before SVE
6625 We know that the vector must be 128 bits or smaller,
6626 otherwise we'd have returned it in memory instead. */
6628 && (aarch64_some_values_include_pst_objects_p (type
)
6629 || (vec_flags
& VEC_PARTIAL
)));
6631 scalar_int_mode int_mode
= int_mode_for_mode (mode
).require ();
6632 rtx reg
= gen_rtx_REG (int_mode
, R0_REGNUM
);
6633 rtx pair
= gen_rtx_EXPR_LIST (VOIDmode
, reg
, const0_rtx
);
6634 return gen_rtx_PARALLEL (mode
, gen_rtvec (1, pair
));
6636 return gen_rtx_REG (mode
, R0_REGNUM
);
/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
   Return true if REGNO is the number of a hard register in which the values
   of called function may come back.  */

static bool
aarch64_function_value_regno_p (const unsigned int regno)
{
  /* Maximum of 16 bytes can be returned in the general registers.  Examples
     of 16-byte return values are: 128-bit integers and 16-byte small
     structures (excluding homogeneous floating-point aggregates).  */
  if (regno == R0_REGNUM || regno == R1_REGNUM)
    return true;

  /* Up to four fp/simd registers can return a function value, e.g. a
     homogeneous floating-point aggregate having four members.  */
  if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_FLOAT;

  if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
    return TARGET_SVE;

  return false;
}
6664 /* Subroutine for aarch64_return_in_memory for types that are not returned
6665 in SVE registers. */
6668 aarch64_return_in_memory_1 (const_tree type
)
6671 machine_mode ag_mode
;
6674 if (!AGGREGATE_TYPE_P (type
)
6675 && TREE_CODE (type
) != BITINT_TYPE
6676 && TREE_CODE (type
) != COMPLEX_TYPE
6677 && TREE_CODE (type
) != VECTOR_TYPE
)
6678 /* Simple scalar types always returned in registers. */
6681 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
6682 &ag_mode
, &count
, NULL
, false))
6685 /* Types larger than 2 registers returned in memory. */
6686 size
= int_size_in_bytes (type
);
6687 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
6690 /* Implement TARGET_RETURN_IN_MEMORY.
6692 If the type T of the result of a function is such that
6694 would require that arg be passed as a value in a register (or set of
6695 registers) according to the parameter passing rules, then the result
6696 is returned in the same registers as would be used for such an
6700 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
6702 pure_scalable_type_info pst_info
;
6703 switch (pst_info
.analyze (type
))
6705 case pure_scalable_type_info::IS_PST
:
6706 return (pst_info
.num_zr () > NUM_FP_ARG_REGS
6707 || pst_info
.num_pr () > NUM_PR_ARG_REGS
);
6709 case pure_scalable_type_info::DOESNT_MATTER
:
6710 gcc_assert (aarch64_return_in_memory_1 (type
));
6713 case pure_scalable_type_info::NO_ABI_IDENTITY
:
6714 case pure_scalable_type_info::ISNT_PST
:
6715 return aarch64_return_in_memory_1 (type
);
static bool
aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
                               const_tree type, int *nregs)
{
  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
  return aarch64_vfp_is_call_or_return_candidate (mode, type,
                                                  &pcum->aapcs_vfp_rmode,
                                                  nregs, NULL,
                                                  pcum->silent_p);
}
6730 /* Given MODE and TYPE of a function argument, return the alignment in
6731 bits. The idea is to suppress any stronger alignment requested by
6732 the user and opt for the natural alignment (specified in AAPCS64 \S
6733 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6734 was incorrectly calculated in versions of GCC prior to GCC 9.
6735 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6736 calculated in versions between GCC 9 and GCC 13. If the alignment
6737 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6738 is the old GCC 13 alignment, otherwise it is zero.
6740 This is a helper function for local use only. */
6743 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
,
6744 unsigned int *abi_break_gcc_9
,
6745 unsigned int *abi_break_gcc_13
,
6746 unsigned int *abi_break_gcc_14
)
6748 *abi_break_gcc_9
= 0;
6749 *abi_break_gcc_13
= 0;
6750 *abi_break_gcc_14
= 0;
6752 return GET_MODE_ALIGNMENT (mode
);
6754 if (integer_zerop (TYPE_SIZE (type
)))
6757 gcc_assert (TYPE_MODE (type
) == mode
);
6759 if (!AGGREGATE_TYPE_P (type
))
6761 /* The ABI alignment is the natural alignment of the type, without
6762 any attributes applied. Normally this is the alignment of the
6763 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6764 For now we just handle the known exceptions explicitly. */
6765 type
= TYPE_MAIN_VARIANT (type
);
6766 if (POINTER_TYPE_P (type
))
6768 gcc_assert (known_eq (POINTER_SIZE
, GET_MODE_BITSIZE (mode
)));
6769 return POINTER_SIZE
;
6771 if (TREE_CODE (type
) == ENUMERAL_TYPE
&& TREE_TYPE (type
))
6773 *abi_break_gcc_14
= TYPE_ALIGN (type
);
6774 type
= TYPE_MAIN_VARIANT (TREE_TYPE (type
));
6776 gcc_assert (!TYPE_USER_ALIGN (type
));
6777 return TYPE_ALIGN (type
);
6780 if (TREE_CODE (type
) == ARRAY_TYPE
)
6781 return TYPE_ALIGN (TREE_TYPE (type
));
6783 unsigned int alignment
= 0;
6784 unsigned int bitfield_alignment_with_packed
= 0;
6785 unsigned int bitfield_alignment
= 0;
6786 for (tree field
= TYPE_FIELDS (type
); field
; field
= DECL_CHAIN (field
))
6787 if (TREE_CODE (field
) == FIELD_DECL
)
6789 /* Note that we explicitly consider zero-sized fields here,
6790 even though they don't map to AAPCS64 machine types.
6793 struct __attribute__((aligned(8))) empty {};
6796 [[no_unique_address]] empty e;
6800 "s" contains only one Fundamental Data Type (the int field)
6801 but gains 8-byte alignment and size thanks to "e". */
6802 alignment
= std::max (alignment
, DECL_ALIGN (field
));
6803 if (DECL_BIT_FIELD_TYPE (field
))
6805 /* Take the bit-field type's alignment into account only
6806 if the user didn't reduce this field's alignment with
6807 the packed attribute. */
6808 if (!DECL_PACKED (field
))
6810 = std::max (bitfield_alignment
,
6811 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6813 /* Compute the alignment even if the bit-field is
6814 packed, so that we can emit a warning in case the
6815 alignment changed between GCC versions. */
6816 bitfield_alignment_with_packed
6817 = std::max (bitfield_alignment_with_packed
,
6818 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field
)));
6822 /* Emit a warning if the alignment is different when taking the
6823 'packed' attribute into account. */
6824 if (bitfield_alignment
!= bitfield_alignment_with_packed
6825 && bitfield_alignment_with_packed
> alignment
)
6826 *abi_break_gcc_13
= bitfield_alignment_with_packed
;
6828 if (bitfield_alignment
> alignment
)
6830 *abi_break_gcc_9
= alignment
;
6831 return bitfield_alignment
;
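/* Illustrative sketch, not part of the GCC sources: the three outputs of
   aarch64_function_arg_alignment typically differ only for bit-field
   members (and, for ABI_BREAK_GCC_14, enum types).  Taking

     struct s { unsigned __int128 x : 1; char c; };

   as an example, compilers before GCC 9.1 ignored the 16-byte underlying
   type of X and computed a smaller alignment (recorded via
   ABI_BREAK_GCC_9), while GCC 9.1 and later report 128 bits.  With
   __attribute__((packed)) applied to X, GCC 9 to 12 still used the
   underlying type whereas GCC 13.1 honours the packed attribute; the
   older, larger value is recorded via ABI_BREAK_GCC_13 so that callers
   can emit the -Wpsabi notes about the GCC 13.1 change.  */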
/* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
   _BitInt(N) type.  These include ARRAY_TYPE's with an element that is a
   _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
   with a field member that is a _BitInt(N) or an aggregate that uses it.
   Return false otherwise.  */

static bool
bitint_or_aggr_of_bitint_p (tree type)
{
  if (!type)
    return false;

  if (TREE_CODE (type) == BITINT_TYPE)
    return true;

  /* If ARRAY_TYPE, check its element type.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));

  /* If RECORD_TYPE or UNION_TYPE, check the fields' types.  */
  if (RECORD_OR_UNION_TYPE_P (type))
    for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
      {
        if (TREE_CODE (field) != FIELD_DECL)
          continue;
        if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
          return true;
      }
  return false;
}
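/* Illustrative sketch, not part of the GCC sources: the recursion above
   matches both direct and nested uses of _BitInt, e.g.

     struct s { _BitInt(129) x; };
     struct t { struct s a[2]; };   // also matches, via the array element

   Since _BitInt was only added in GCC 14, argument layout uses this
   predicate to suppress the GCC 9.1/13.1 -Wpsabi notes for types that
   could not have existed with earlier compilers.  */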
6868 /* Layout a function argument according to the AAPCS64 rules. The rule
6869 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6870 mode that was originally given to us by the target hook, whereas the
6871 mode in ARG might be the result of replacing partial SVE modes with
6872 the equivalent integer mode. */
6875 aarch64_layout_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
6877 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
6878 tree type
= arg
.type
;
6879 machine_mode mode
= arg
.mode
;
6880 int ncrn
, nvrn
, nregs
;
6881 bool allocate_ncrn
, allocate_nvrn
;
6883 unsigned int abi_break_gcc_9
;
6884 unsigned int abi_break_gcc_13
;
6885 unsigned int abi_break_gcc_14
;
6887 /* We need to do this once per argument. */
6888 if (pcum
->aapcs_arg_processed
)
6891 bool warn_pcs_change
6894 && (currently_expanding_function_start
6895 || currently_expanding_gimple_stmt
));
6897 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6899 typedef struct foo {
6900 __Int8x16_t foo[2] __attribute__((aligned(32)));
6903 is still a HVA despite its larger-than-normal alignment.
6904 However, such over-aligned HFAs and HVAs are guaranteed to have
6907 If we exclude HFAs and HVAs from the discussion below, then there
6908 are several things to note:
6910 - Both the C and AAPCS64 interpretations of a type's alignment should
6911 give a value that is no greater than the type's size.
6913 - Types bigger than 16 bytes are passed indirectly.
6915 - If an argument of type T is passed indirectly, TYPE and MODE describe
6916 a pointer to T rather than T iself.
6918 It follows that the AAPCS64 alignment of TYPE must be no greater
6921 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6922 and so could calculate an alignment that was too small. If this
6923 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6925 Although GCC 9.1 fixed that bug, it introduced a different one:
6926 it would consider the alignment of a bitfield's underlying type even
6927 if the field was packed (which should have the effect of overriding
6928 the alignment of the underlying type). This was fixed in GCC 13.1.
6930 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6931 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6932 this older, too-big alignment.
6934 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6935 alignments meant they could calculate type alignments that were
6936 bigger than the type's size, contrary to the assumption above.
6937 The handling of register arguments was nevertheless (and justifiably)
6938 written to follow the assumption that the alignment can never be
6939 greater than the size. The same was not true for stack arguments;
6940 their alignment was instead handled by MIN bounds in
6941 aarch64_function_arg_boundary.
6943 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6944 an alignment of more than 16 bytes for TYPE then:
6946 - If the argument was passed in registers, these GCC versions
6947 would treat the alignment as though it was *less than* 16 bytes.
6949 - If the argument was passed on the stack, these GCC versions
6950 would treat the alignment as though it was *equal to* 16 bytes.
6952 Both behaviors were wrong, but in different cases. */
6954 pcum
->aapcs_arg_processed
= true;
6956 pure_scalable_type_info pst_info
;
6957 if (type
&& pst_info
.analyze_registers (type
))
6959 /* aarch64_function_arg_alignment has never had an effect on
6962 /* The PCS says that it is invalid to pass an SVE value to an
6963 unprototyped function. There is no ABI-defined location we
6964 can return in this case, so we have no real choice but to raise
6965 an error immediately, even though this is only a query function. */
6966 if (arg
.named
&& pcum
->pcs_variant
!= ARM_PCS_SVE
)
6968 gcc_assert (!pcum
->silent_p
);
6969 error ("SVE type %qT cannot be passed to an unprototyped function",
6971 /* Avoid repeating the message, and avoid tripping the assert
6973 pcum
->pcs_variant
= ARM_PCS_SVE
;
6976 /* We would have converted the argument into pass-by-reference
6977 form if it didn't fit in registers. */
6978 pcum
->aapcs_nextnvrn
= pcum
->aapcs_nvrn
+ pst_info
.num_zr ();
6979 pcum
->aapcs_nextnprn
= pcum
->aapcs_nprn
+ pst_info
.num_pr ();
6980 gcc_assert (arg
.named
6981 && pcum
->pcs_variant
== ARM_PCS_SVE
6982 && pcum
->aapcs_nextnvrn
<= NUM_FP_ARG_REGS
6983 && pcum
->aapcs_nextnprn
<= NUM_PR_ARG_REGS
);
6984 pcum
->aapcs_reg
= pst_info
.get_rtx (mode
, V0_REGNUM
+ pcum
->aapcs_nvrn
,
6985 P0_REGNUM
+ pcum
->aapcs_nprn
);
6989 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6990 are passed by reference, not by value. */
6991 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
6992 bool sve_p
= (vec_flags
& VEC_ANY_SVE
);
6994 /* Vector types can acquire a partial SVE mode using things like
6995 __attribute__((vector_size(N))), and this is potentially useful.
6996 However, the choice of mode doesn't affect the type's ABI
6997 identity, so we should treat the types as though they had
6998 the associated integer mode, just like they did before SVE
7001 We know that the vector must be 128 bits or smaller,
7002 otherwise we'd have passed it in memory instead. */
7004 && (aarch64_some_values_include_pst_objects_p (type
)
7005 || (vec_flags
& VEC_PARTIAL
)));
7007 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
7009 size
= int_size_in_bytes (type
);
7011 /* No frontends can create types with variable-sized modes, so we
7012 shouldn't be asked to pass or return them. */
7013 size
= GET_MODE_SIZE (mode
).to_constant ();
7014 size
= ROUND_UP (size
, UNITS_PER_WORD
);
7016 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
7017 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
7021 gcc_assert (!sve_p
|| !allocate_nvrn
);
7023 unsigned int alignment
7024 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
7025 &abi_break_gcc_13
, &abi_break_gcc_14
);
7027 gcc_assert ((allocate_nvrn
|| alignment
<= 16 * BITS_PER_UNIT
)
7028 && (!alignment
|| abi_break_gcc_9
< alignment
)
7029 && (!abi_break_gcc_13
|| alignment
< abi_break_gcc_13
));
7031 /* _BitInt(N) was only added in GCC 14. */
7032 bool warn_pcs_change_le_gcc14
7033 = warn_pcs_change
&& !bitint_or_aggr_of_bitint_p (type
);
7035 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7036 The following code thus handles passing by SIMD/FP registers first. */
7038 nvrn
= pcum
->aapcs_nvrn
;
7040 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
7041 and homogenous short-vector aggregates (HVA). */
7044 /* aarch64_function_arg_alignment has never had an effect on
7046 if (!pcum
->silent_p
&& !TARGET_FLOAT
)
7047 aarch64_err_no_fpadvsimd (mode
);
7049 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
7051 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
7052 if (!aarch64_composite_type_p (type
, mode
))
7054 gcc_assert (nregs
== 1);
7055 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7057 else if (aarch64_advsimd_full_struct_mode_p (mode
)
7058 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 16))
7059 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7060 else if (aarch64_advsimd_partial_struct_mode_p (mode
)
7061 && known_eq (GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), 8))
7062 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
7067 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
7068 for (i
= 0; i
< nregs
; i
++)
7070 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
7071 V0_REGNUM
+ nvrn
+ i
);
7072 rtx offset
= gen_int_mode
7073 (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
), Pmode
);
7074 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
, offset
);
7075 XVECEXP (par
, 0, i
) = tmp
;
7077 pcum
->aapcs_reg
= par
;
7083 /* C.3 NSRN is set to 8. */
7084 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
7089 ncrn
= pcum
->aapcs_ncrn
;
7090 nregs
= size
/ UNITS_PER_WORD
;
7092 /* C6 - C9. though the sign and zero extension semantics are
7093 handled elsewhere. This is the case where the argument fits
7094 entirely general registers. */
7095 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
7097 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
7099 /* C.8 if the argument has an alignment of 16 then the NGRN is
7100 rounded up to the next even number. */
7104 /* Emit a warning if the alignment changed when taking the
7105 'packed' attribute into account. */
7106 if (warn_pcs_change_le_gcc14
7108 && ((abi_break_gcc_13
== 16 * BITS_PER_UNIT
)
7109 != (alignment
== 16 * BITS_PER_UNIT
)))
7110 inform (input_location
, "parameter passing for argument of type "
7111 "%qT changed in GCC 13.1", type
);
7113 if (warn_pcs_change_le_gcc14
7115 && ((abi_break_gcc_14
== 16 * BITS_PER_UNIT
)
7116 != (alignment
== 16 * BITS_PER_UNIT
)))
7117 inform (input_location
, "parameter passing for argument of type "
7118 "%qT changed in GCC 14.1", type
);
7120 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7121 comparison is there because for > 16 * BITS_PER_UNIT
7122 alignment nregs should be > 2 and therefore it should be
7123 passed by reference rather than value. */
7124 if (alignment
== 16 * BITS_PER_UNIT
)
7126 if (warn_pcs_change_le_gcc14
7128 inform (input_location
, "parameter passing for argument of type "
7129 "%qT changed in GCC 9.1", type
);
7131 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
7135 /* If an argument with an SVE mode needs to be shifted up to the
7136 high part of the register, treat it as though it had an integer mode.
7137 Using the normal (parallel [...]) would suppress the shifting. */
7140 && maybe_ne (GET_MODE_SIZE (mode
), nregs
* UNITS_PER_WORD
)
7141 && aarch64_pad_reg_upward (mode
, type
, false))
7143 mode
= int_mode_for_mode (mode
).require ();
7147 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7148 A reg is still generated for it, but the caller should be smart
7149 enough not to use it. */
7151 || (nregs
== 1 && !sve_p
)
7152 || GET_MODE_CLASS (mode
) == MODE_INT
)
7153 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
7159 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
7160 for (i
= 0; i
< nregs
; i
++)
7162 scalar_int_mode reg_mode
= word_mode
;
7164 reg_mode
= int_mode_for_mode (mode
).require ();
7165 rtx tmp
= gen_rtx_REG (reg_mode
, R0_REGNUM
+ ncrn
+ i
);
7166 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
7167 GEN_INT (i
* UNITS_PER_WORD
));
7168 XVECEXP (par
, 0, i
) = tmp
;
7170 pcum
->aapcs_reg
= par
;
7173 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
7178 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
7180 /* The argument is passed on stack; record the needed number of words for
7181 this argument and align the total size if necessary. */
7183 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
7185 if (warn_pcs_change_le_gcc14
7187 && ((abi_break_gcc_13
>= 16 * BITS_PER_UNIT
)
7188 != (alignment
>= 16 * BITS_PER_UNIT
)))
7189 inform (input_location
, "parameter passing for argument of type "
7190 "%qT changed in GCC 13.1", type
);
7192 if (warn_pcs_change_le_gcc14
7194 && ((abi_break_gcc_14
>= 16 * BITS_PER_UNIT
)
7195 != (alignment
>= 16 * BITS_PER_UNIT
)))
7196 inform (input_location
, "parameter passing for argument of type "
7197 "%qT changed in GCC 14.1", type
);
7199 if (alignment
== 16 * BITS_PER_UNIT
)
7201 int new_size
= ROUND_UP (pcum
->aapcs_stack_size
, 16 / UNITS_PER_WORD
);
7202 if (pcum
->aapcs_stack_size
!= new_size
)
7204 if (warn_pcs_change_le_gcc14
7206 inform (input_location
, "parameter passing for argument of type "
7207 "%qT changed in GCC 9.1", type
);
7208 pcum
->aapcs_stack_size
= new_size
;
7214 /* Add the current argument register to the set of those that need
7215 to be saved and restored around a change to PSTATE.SM. */
7218 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7220 subrtx_var_iterator::array_type array
;
7221 FOR_EACH_SUBRTX_VAR (iter
, array
, pcum
->aapcs_reg
, NONCONST
)
7224 if (REG_P (x
) && (FP_REGNUM_P (REGNO (x
)) || PR_REGNUM_P (REGNO (x
))))
7226 unsigned int i
= pcum
->num_sme_mode_switch_args
++;
7227 gcc_assert (i
< ARRAY_SIZE (pcum
->sme_mode_switch_args
));
7228 pcum
->sme_mode_switch_args
[i
] = x
;
7233 /* Return a parallel that contains all the registers that need to be
7234 saved around a change to PSTATE.SM. Return const0_rtx if there is
7235 no such mode switch, or if no registers need to be saved. */
7238 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS
*pcum
)
7240 if (!pcum
->num_sme_mode_switch_args
)
7243 auto argvec
= gen_rtvec_v (pcum
->num_sme_mode_switch_args
,
7244 pcum
->sme_mode_switch_args
);
7245 return gen_rtx_PARALLEL (VOIDmode
, argvec
);
7248 /* Implement TARGET_FUNCTION_ARG. */
7251 aarch64_function_arg (cumulative_args_t pcum_v
, const function_arg_info
&arg
)
7253 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7254 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7255 || pcum
->pcs_variant
== ARM_PCS_SIMD
7256 || pcum
->pcs_variant
== ARM_PCS_SVE
);
7258 if (arg
.end_marker_p ())
7260 rtx abi_cookie
= aarch64_gen_callee_cookie (pcum
->isa_mode
,
7262 rtx sme_mode_switch_args
= aarch64_finish_sme_mode_switch_args (pcum
);
7263 rtx shared_za_flags
= gen_int_mode (pcum
->shared_za_flags
, SImode
);
7264 rtx shared_zt0_flags
= gen_int_mode (pcum
->shared_zt0_flags
, SImode
);
7265 return gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (4, abi_cookie
,
7266 sme_mode_switch_args
,
7271 aarch64_layout_arg (pcum_v
, arg
);
7272 return pcum
->aapcs_reg
;
7276 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
7278 rtx libname ATTRIBUTE_UNUSED
,
7280 unsigned n_named ATTRIBUTE_UNUSED
,
7283 pcum
->aapcs_ncrn
= 0;
7284 pcum
->aapcs_nvrn
= 0;
7285 pcum
->aapcs_nprn
= 0;
7286 pcum
->aapcs_nextncrn
= 0;
7287 pcum
->aapcs_nextnvrn
= 0;
7288 pcum
->aapcs_nextnprn
= 0;
7291 pcum
->pcs_variant
= (arm_pcs
) fntype_abi (fntype
).id ();
7292 pcum
->isa_mode
= aarch64_fntype_isa_mode (fntype
);
7296 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
7297 pcum
->isa_mode
= AARCH64_DEFAULT_ISA_MODE
;
7299 pcum
->aapcs_reg
= NULL_RTX
;
7300 pcum
->aapcs_arg_processed
= false;
7301 pcum
->aapcs_stack_words
= 0;
7302 pcum
->aapcs_stack_size
= 0;
7303 pcum
->silent_p
= silent_p
;
7304 pcum
->shared_za_flags
7305 = (fntype
? aarch64_fntype_shared_flags (fntype
, "za") : 0U);
7306 pcum
->shared_zt0_flags
7307 = (fntype
? aarch64_fntype_shared_flags (fntype
, "zt0") : 0U);
7308 pcum
->num_sme_mode_switch_args
= 0;
7312 && fntype
&& fntype
!= error_mark_node
)
7314 const_tree type
= TREE_TYPE (fntype
);
7315 machine_mode mode ATTRIBUTE_UNUSED
; /* To pass pointer as argument. */
7316 int nregs ATTRIBUTE_UNUSED
; /* Likewise. */
7317 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
), type
,
7318 &mode
, &nregs
, NULL
, false))
7319 aarch64_err_no_fpadvsimd (TYPE_MODE (type
));
7324 && pcum
->pcs_variant
== ARM_PCS_SVE
)
7326 /* We can't gracefully recover at this point, so make this a
7329 fatal_error (input_location
, "%qE requires the SVE ISA extension",
7332 fatal_error (input_location
, "calls to functions of type %qT require"
7333 " the SVE ISA extension", fntype
);
7338 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
7339 const function_arg_info
&arg
)
7341 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
7342 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
7343 || pcum
->pcs_variant
== ARM_PCS_SIMD
7344 || pcum
->pcs_variant
== ARM_PCS_SVE
)
7346 aarch64_layout_arg (pcum_v
, arg
);
7347 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
7348 != (pcum
->aapcs_stack_words
!= 0));
7350 && aarch64_call_switches_pstate_sm (pcum
->isa_mode
))
7351 aarch64_record_sme_mode_switch_args (pcum
);
7353 pcum
->aapcs_arg_processed
= false;
7354 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
7355 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
7356 pcum
->aapcs_nprn
= pcum
->aapcs_nextnprn
;
7357 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
7358 pcum
->aapcs_stack_words
= 0;
7359 pcum
->aapcs_reg
= NULL_RTX
;
bool
aarch64_function_arg_regno_p (unsigned regno)
{
  return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
          || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
          || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
}
/* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
   PARM_BOUNDARY bits of alignment, but will be given anything up
   to STACK_BOUNDARY bits if the type requires it.  This makes sure
   that both before and after the layout of each argument, the Next
   Stacked Argument Address (NSAA) will have a minimum alignment of
   8 bytes.  */

static unsigned int
aarch64_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int abi_break_gcc_9;
  unsigned int abi_break_gcc_13;
  unsigned int abi_break_gcc_14;
  unsigned int alignment = aarch64_function_arg_alignment (mode, type,
                                                           &abi_break_gcc_9,
                                                           &abi_break_gcc_13,
                                                           &abi_break_gcc_14);
  /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
     to emit warnings about ABI incompatibility.  */
  alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
  return alignment;
}
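/* Illustrative sketch, not part of the GCC sources: with the AArch64
   defaults PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128, the clamp above
   maps the raw alignment into [64, 128] bits:

     raw alignment (bits)   8   16   32   64   128   256
     resulting boundary     64  64   64   64   128   128

   so an over-aligned type never forces more than a 16-byte stack slot and
   an under-aligned one never gets less than an 8-byte slot.  */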
/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */

static fixed_size_mode
aarch64_get_reg_raw_mode (int regno)
{
  /* Don't use any non GP registers for __builtin_apply and
     __builtin_return if general registers only mode is requested.  */
  if (TARGET_GENERAL_REGS_ONLY && !GP_REGNUM_P (regno))
    return as_a <fixed_size_mode> (VOIDmode);
  if (TARGET_SVE && FP_REGNUM_P (regno))
    /* Don't use the SVE part of the register for __builtin_apply and
       __builtin_return.  The SVE registers aren't used by the normal PCS,
       so using them there would be a waste of time.  The PCS extensions
       for SVE types are fundamentally incompatible with the
       __builtin_return/__builtin_apply interface.  */
    return as_a <fixed_size_mode> (V16QImode);
  if (PR_REGNUM_P (regno))
    /* For SVE PR regs, indicate that they should be ignored for
       __builtin_apply/__builtin_return.  */
    return as_a <fixed_size_mode> (VOIDmode);
  return default_get_reg_raw_mode (regno);
}
/* Implement TARGET_FUNCTION_ARG_PADDING.

   Small aggregate types are placed in the lowest memory address.

   The related parameter passing rules are B.4, C.3, C.5 and C.14.  */

static pad_direction
aarch64_function_arg_padding (machine_mode mode, const_tree type)
{
  /* On little-endian targets, the least significant byte of every stack
     argument is passed at the lowest byte address of the stack slot.  */
  if (!BYTES_BIG_ENDIAN)
    return PAD_UPWARD;

  /* Otherwise, integral, floating-point and pointer types are padded downward:
     the least significant byte of a stack argument is passed at the highest
     byte address of the stack slot.  */
  if (type
      ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
         || POINTER_TYPE_P (type))
      : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
    return PAD_DOWNWARD;

  /* Everything else padded upward, i.e. data in first byte of stack slot.  */
  return PAD_UPWARD;
}
/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).

   It specifies padding for the last (may also be the only)
   element of a block move between registers and memory.  If
   assuming the block is in the memory, padding upward means that
   the last element is padded after its highest significant byte,
   while in downward padding, the last element is padded at its
   least significant byte side.

   Small aggregates and small complex types are always padded
   upwards.

   We don't need to worry about homogeneous floating-point or
   short-vector aggregates; their move is not affected by the
   padding direction determined here.  Regardless of endianness,
   each element of such an aggregate is put in the least
   significant bits of a fp/simd register.

   Return !BYTES_BIG_ENDIAN if the least significant byte of the
   register has useful data, and return the opposite if the most
   significant byte does.  */

bool
aarch64_pad_reg_upward (machine_mode mode, const_tree type,
                        bool first ATTRIBUTE_UNUSED)
{
  /* Aside from pure scalable types, small composite types are always
     padded upwards.  */
  if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
    {
      HOST_WIDE_INT size;
      if (type)
        size = int_size_in_bytes (type);
      else
        /* No frontends can create types with variable-sized modes, so we
           shouldn't be asked to pass or return them.  */
        size = GET_MODE_SIZE (mode).to_constant ();
      if (size < 2 * UNITS_PER_WORD)
        {
          pure_scalable_type_info pst_info;
          if (pst_info.analyze_registers (type))
            return false;
          return true;
        }
    }

  /* Otherwise, use the default padding.  */
  return !BYTES_BIG_ENDIAN;
}
static scalar_int_mode
aarch64_libgcc_cmp_return_mode (void)
{
  return SImode;
}

#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)

/* We use the 12-bit shifted immediate arithmetic instructions so values
   must be multiple of (1 << 12), i.e. 4096.  */
#define ARITH_FACTOR 4096

#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
#error Cannot use simple address calculation for stack probing
#endif
7511 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7512 inclusive. These are offsets from the current stack pointer. */
7515 aarch64_emit_probe_stack_range (HOST_WIDE_INT first
, poly_int64 poly_size
)
7518 if (!poly_size
.is_constant (&size
))
7520 sorry ("stack probes for SVE frames");
7524 rtx reg1
= gen_rtx_REG (Pmode
, PROBE_STACK_FIRST_REGNUM
);
7526 /* See the same assertion on PROBE_INTERVAL above. */
7527 gcc_assert ((first
% ARITH_FACTOR
) == 0);
7529 /* See if we have a constant small number of probes to generate. If so,
7530 that's the easy case. */
7531 if (size
<= PROBE_INTERVAL
)
7533 const HOST_WIDE_INT base
= ROUND_UP (size
, ARITH_FACTOR
);
7535 emit_set_insn (reg1
,
7536 plus_constant (Pmode
,
7537 stack_pointer_rtx
, -(first
+ base
)));
7538 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- size
));
7541 /* The run-time loop is made up of 8 insns in the generic case while the
7542 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
7543 else if (size
<= 4 * PROBE_INTERVAL
)
7545 HOST_WIDE_INT i
, rem
;
7547 emit_set_insn (reg1
,
7548 plus_constant (Pmode
,
7550 -(first
+ PROBE_INTERVAL
)));
7551 emit_stack_probe (reg1
);
7553 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7554 it exceeds SIZE. If only two probes are needed, this will not
7555 generate any code. Then probe at FIRST + SIZE. */
7556 for (i
= 2 * PROBE_INTERVAL
; i
< size
; i
+= PROBE_INTERVAL
)
7558 emit_set_insn (reg1
,
7559 plus_constant (Pmode
, reg1
, -PROBE_INTERVAL
));
7560 emit_stack_probe (reg1
);
7563 rem
= size
- (i
- PROBE_INTERVAL
);
7566 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7568 emit_set_insn (reg1
, plus_constant (Pmode
, reg1
, -base
));
7569 emit_stack_probe (plus_constant (Pmode
, reg1
, base
- rem
));
7572 emit_stack_probe (plus_constant (Pmode
, reg1
, -rem
));
7575 /* Otherwise, do the same as above, but in a loop. Note that we must be
7576 extra careful with variables wrapping around because we might be at
7577 the very top (or the very bottom) of the address space and we have
7578 to be able to handle this case properly; in particular, we use an
7579 equality test for the loop condition. */
7582 rtx reg2
= gen_rtx_REG (Pmode
, PROBE_STACK_SECOND_REGNUM
);
7584 /* Step 1: round SIZE to the previous multiple of the interval. */
7586 HOST_WIDE_INT rounded_size
= size
& -PROBE_INTERVAL
;
7589 /* Step 2: compute initial and final value of the loop counter. */
7591 /* TEST_ADDR = SP + FIRST. */
7592 emit_set_insn (reg1
,
7593 plus_constant (Pmode
, stack_pointer_rtx
, -first
));
7595 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7596 HOST_WIDE_INT adjustment
= - (first
+ rounded_size
);
7597 if (! aarch64_uimm12_shift (adjustment
))
7599 aarch64_internal_mov_immediate (reg2
, GEN_INT (adjustment
),
7601 emit_set_insn (reg2
, gen_rtx_PLUS (Pmode
, stack_pointer_rtx
, reg2
));
7604 emit_set_insn (reg2
,
7605 plus_constant (Pmode
, stack_pointer_rtx
, adjustment
));
7611 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7614 while (TEST_ADDR != LAST_ADDR)
7616 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7617 until it is equal to ROUNDED_SIZE. */
7619 emit_insn (gen_probe_stack_range (reg1
, reg1
, reg2
));
7622 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7623 that SIZE is equal to ROUNDED_SIZE. */
7625 if (size
!= rounded_size
)
7627 HOST_WIDE_INT rem
= size
- rounded_size
;
7631 const HOST_WIDE_INT base
= ROUND_UP (rem
, ARITH_FACTOR
);
7633 emit_set_insn (reg2
, plus_constant (Pmode
, reg2
, -base
));
7634 emit_stack_probe (plus_constant (Pmode
, reg2
, base
- rem
));
7637 emit_stack_probe (plus_constant (Pmode
, reg2
, -rem
));
7641 /* Make sure nothing is scheduled before we are done. */
7642 emit_insn (gen_blockage ());
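/* Illustrative sketch, not part of the GCC sources: with the default
   PROBE_INTERVAL of 4 KiB, FIRST == 0 and SIZE == 10000 take the
   "size <= 4 * PROBE_INTERVAL" branch above and, assuming the scratch
   register is x9, emit roughly:

       sub      x9, sp, #4096
       str      xzr, [x9]              // probe at sp - 4096
       sub      x9, x9, #4096
       str      xzr, [x9]              // probe at sp - 8192
       sub      x9, x9, #4096
       str      xzr, [x9, #2288]       // residual probe at sp - 10000

   Larger constant sizes fall through to the probe_stack_range loop whose
   body is printed by aarch64_output_probe_stack_range below.  */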
/* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
   absolute addresses.  */

const char *
aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
{
  static int labelno = 0;
  char loop_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);

  /* Loop.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);

  HOST_WIDE_INT stack_clash_probe_interval
    = 1 << param_stack_clash_protection_guard_size;

  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
  xops[0] = reg1;
  HOST_WIDE_INT interval;
  if (flag_stack_clash_protection)
    interval = stack_clash_probe_interval;
  else
    interval = PROBE_INTERVAL;

  gcc_assert (aarch64_uimm12_shift (interval));
  xops[1] = GEN_INT (interval);

  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* If doing stack clash protection then we probe up by the ABI specified
     amount.  We do this because we're dropping full pages at a time in the
     loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
  if (flag_stack_clash_protection)
    xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
  else
    xops[1] = CONST0_RTX (GET_MODE (xops[1]));

  /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
     by this amount for each iteration.  */
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* Test if TEST_ADDR == LAST_ADDR.  */
  xops[1] = reg2;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch.  */
  fputs ("\tb.ne\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_lab);
  fputc ('\n', asm_out_file);

  return "";
}
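/* Illustrative sketch, not part of the GCC sources: for the
   non-stack-clash case the loop printed above looks roughly like

     .LPSRL0:
       sub      x9, x9, #4096
       str      xzr, [x9, 0]
       cmp      x9, x10
       b.ne     .LPSRL0

   i.e. TEST_ADDR walks down one interval at a time, probing at offset 0,
   until it reaches LAST_ADDR; with stack clash protection the store
   instead probes STACK_CLASH_CALLER_GUARD bytes above the new TEST_ADDR.
   Register numbers and the label suffix are only for illustration.  */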
/* Emit the probe loop for doing stack clash probes and stack adjustments for
   SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
   of GUARD_SIZE.  When a probe is emitted it is done at most
   MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
   at most MIN_PROBE_THRESHOLD.  By the end of this function
   BASE = BASE - ADJUSTMENT.  */

const char *
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
				      rtx min_probe_threshold, rtx guard_size)
{
  /* This function is not allowed to use any instruction generation function
     like gen_ and friends.  If you do you'll likely ICE during CFG validation,
     so instead emit the code you want using output_asm_insn.  */
  gcc_assert (flag_stack_clash_protection);
  gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
  gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));

  /* The minimum required allocation before the residual requires probing.  */
  HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);

  /* Clamp the value down to the nearest value that can be used with a cmp.  */
  residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
  rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);

  gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
  gcc_assert (aarch64_uimm12_shift (residual_probe_guard));

  static int labelno = 0;
  char loop_start_lab[32];
  char loop_end_lab[32];
  rtx xops[2];

  ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
  ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);

  /* Emit loop start label.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);

  /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("cmp\t%0, %1", xops);

  /* Branch to end if not enough adjustment to probe.  */
  fputs ("\tb.lt\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_end_lab);
  fputc ('\n', asm_out_file);

  /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
  xops[0] = base;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Probe at BASE.  */
  xops[1] = const0_rtx;
  output_asm_insn ("str\txzr, [%0, %1]", xops);

  /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
  xops[0] = adjustment;
  xops[1] = probe_offset_value_rtx;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  /* Branch to start if still more bytes to allocate.  */
  fputs ("\tb\t", asm_out_file);
  assemble_name_raw (asm_out_file, loop_start_lab);
  fputc ('\n', asm_out_file);

  /* No probe leave.  */
  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);

  /* BASE = BASE - ADJUSTMENT.  */
  xops[0] = base;
  xops[1] = adjustment;
  output_asm_insn ("sub\t%0, %0, %1", xops);

  return "";
}
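/* Roughly, the sequence emitted above behaves as follows (illustrative
   only; here x11 stands for BASE, x12 for ADJUSTMENT, and the residual
   probe guard is assumed to be 1KiB):

	.SVLPSPL0:
	cmp	x12, #1024
	b.lt	.SVLPEND0
	sub	x11, x11, #1024
	str	xzr, [x11, #0]
	sub	x12, x12, #1024
	b	.SVLPSPL0
	.SVLPEND0:
	sub	x11, x11, x12

   so BASE ends up at BASE - ADJUSTMENT and every full
   RESIDUAL_PROBE_GUARD-sized chunk of the adjustment is probed.  */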
/* Determine whether a frame chain needs to be generated.  */

static bool
aarch64_needs_frame_chain (void)
{
  if (frame_pointer_needed)
    return true;

  /* A leaf function cannot have calls or write LR.  */
  bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);

  /* Don't use a frame chain in leaf functions if leaf frame pointers
     are disabled.  */
  if (flag_omit_leaf_frame_pointer && is_leaf)
    return false;

  return aarch64_use_frame_pointer;
}
/* Return true if the current function should save registers above
   the locals area, rather than below it.  */

static bool
aarch64_save_regs_above_locals_p ()
{
  /* When using stack smash protection, make sure that the canary slot
     comes between the locals and the saved registers.  Otherwise,
     it would be possible for a carefully sized smash attack to change
     the saved registers (particularly LR and FP) without reaching the
     canary.  */
  return crtl->stack_protect_guard;
}
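/* Illustrative sketch of the effect: when the canary is in use, the frame
   is arranged so that (from lower to higher addresses) the locals and the
   canary slot sit below the saved registers, e.g. roughly

	[outgoing args] [locals ... canary] [saved FP/LR and friends]

   so a linear buffer overrun in the locals corrupts the canary before it
   can reach the saved FP/LR.  */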
/* Return true if the current function needs to record the incoming
   value of PSTATE.SM.  */

static bool
aarch64_need_old_pstate_sm ()
{
  /* Exit early if the incoming value of PSTATE.SM is known at
     compile time.  */
  if (aarch64_cfun_incoming_pstate_sm () != 0)
    return false;

  if (aarch64_cfun_enables_pstate_sm ())
    return true;

  /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
     but the function needs to return with PSTATE.SM unchanged.  */
  if (nonlocal_goto_handler_labels)
    return true;

  /* Likewise for exception handlers.  */
  eh_landing_pad lp;
  for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
    if (lp && lp->post_landing_pad)
      return true;

  /* Non-local gotos need to set PSTATE.SM to zero.  It's possible to call
     streaming-compatible functions without SME being available, so PSTATE.SM
     should only be changed if it is currently set to one.  */
  if (crtl->has_nonlocal_goto)
    return true;

  if (cfun->machine->call_switches_pstate_sm)
    for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
      if (auto *call = dyn_cast<rtx_call_insn *> (insn))
	if (!SIBLING_CALL_P (call))
	  {
	    /* Return true if there is a call to a non-streaming-compatible
	       function.  */
	    auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
	    if (aarch64_call_switches_pstate_sm (callee_isa_mode))
	      return true;
	  }

  return false;
}
7854 /* Mark the registers that need to be saved by the callee and calculate
7855 the size of the callee-saved registers area and frame record (both FP
7856 and LR may be omitted). */
7858 aarch64_layout_frame (void)
7860 unsigned regno
, last_fp_reg
= INVALID_REGNUM
;
7861 machine_mode vector_save_mode
= aarch64_reg_save_mode (V8_REGNUM
);
7862 poly_int64 vector_save_size
= GET_MODE_SIZE (vector_save_mode
);
7863 bool frame_related_fp_reg_p
= false;
7864 aarch64_frame
&frame
= cfun
->machine
->frame
;
7865 poly_int64 top_of_locals
= -1;
7866 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
7868 vec_safe_truncate (frame
.saved_gprs
, 0);
7869 vec_safe_truncate (frame
.saved_fprs
, 0);
7870 vec_safe_truncate (frame
.saved_prs
, 0);
7872 frame
.emit_frame_chain
= aarch64_needs_frame_chain ();
7874 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7875 the mid-end is doing. */
7876 crtl
->outgoing_args_size
= STACK_DYNAMIC_OFFSET (cfun
);
7878 #define SLOT_NOT_REQUIRED (-2)
7879 #define SLOT_REQUIRED (-1)
7881 frame
.wb_push_candidate1
= INVALID_REGNUM
;
7882 frame
.wb_push_candidate2
= INVALID_REGNUM
;
7883 frame
.spare_pred_reg
= INVALID_REGNUM
;
7885 /* First mark all the registers that really need to be saved... */
7886 for (regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
7887 frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
7888 frame
.old_svcr_offset
= SLOT_NOT_REQUIRED
;
7890 /* ... that includes the eh data registers (if needed)... */
7891 if (crtl
->calls_eh_return
)
7892 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
7893 frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)] = SLOT_REQUIRED
;
7895 /* ... and any callee saved register that dataflow says is live. */
7896 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
7897 if (df_regs_ever_live_p (regno
)
7898 && !fixed_regs
[regno
]
7899 && (regno
== R30_REGNUM
7900 || !crtl
->abi
->clobbers_full_reg_p (regno
)))
7901 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7903 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7904 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7905 && !fixed_regs
[regno
]
7906 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7908 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7909 last_fp_reg
= regno
;
7910 if (aarch64_emit_cfi_for_reg_p (regno
))
7911 frame_related_fp_reg_p
= true;
7914 /* Big-endian SVE frames need a spare predicate register in order
7915 to save Z8-Z15. Decide which register they should use. Prefer
7916 an unused argument register if possible, so that we don't force P4
7917 to be saved unnecessarily. */
7918 if (frame_related_fp_reg_p
7919 && crtl
->abi
->id () == ARM_PCS_SVE
7920 && BYTES_BIG_ENDIAN
)
7922 bitmap live1
= df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun
));
7923 bitmap live2
= df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun
));
7924 for (regno
= P0_REGNUM
; regno
<= P7_REGNUM
; regno
++)
7925 if (!bitmap_bit_p (live1
, regno
) && !bitmap_bit_p (live2
, regno
))
7927 gcc_assert (regno
<= P7_REGNUM
);
7928 frame
.spare_pred_reg
= regno
;
7929 df_set_regs_ever_live (regno
, true);
7932 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7933 if ((enables_pstate_sm
|| df_regs_ever_live_p (regno
))
7934 && !fixed_regs
[regno
]
7935 && !crtl
->abi
->clobbers_full_reg_p (regno
))
7936 frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
7938 bool regs_at_top_p
= aarch64_save_regs_above_locals_p ();
7940 poly_int64 offset
= crtl
->outgoing_args_size
;
7941 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
7944 offset
+= get_frame_size ();
7945 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7946 top_of_locals
= offset
;
7948 frame
.bytes_below_saved_regs
= offset
;
7949 frame
.sve_save_and_probe
= INVALID_REGNUM
;
7951 /* Now assign stack slots for the registers. Start with the predicate
7952 registers, since predicate LDR and STR have a relatively small
7953 offset range. These saves happen below the hard frame pointer. */
7954 for (regno
= P0_REGNUM
; regno
<= P15_REGNUM
; regno
++)
7955 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7957 vec_safe_push (frame
.saved_prs
, regno
);
7958 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7959 frame
.sve_save_and_probe
= regno
;
7960 frame
.reg_offset
[regno
] = offset
;
7961 offset
+= BYTES_PER_SVE_PRED
;
7964 poly_int64 saved_prs_size
= offset
- frame
.bytes_below_saved_regs
;
7965 if (maybe_ne (saved_prs_size
, 0))
7967 /* If we have any vector registers to save above the predicate registers,
7968 the offset of the vector register save slots need to be a multiple
7969 of the vector size. This lets us use the immediate forms of LDR/STR
7970 (or LD1/ST1 for big-endian).
7972 A vector register is 8 times the size of a predicate register,
7973 and we need to save a maximum of 12 predicate registers, so the
7974 first vector register will be at either #1, MUL VL or #2, MUL VL.
7976 If we don't have any vector registers to save, and we know how
7977 big the predicate save area is, we can just round it up to the
7978 next 16-byte boundary. */
7979 if (last_fp_reg
== INVALID_REGNUM
&& offset
.is_constant ())
7980 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
7983 if (known_le (saved_prs_size
, vector_save_size
))
7984 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
;
7985 else if (known_le (saved_prs_size
, vector_save_size
* 2))
7986 offset
= frame
.bytes_below_saved_regs
+ vector_save_size
* 2;
7992 /* If we need to save any SVE vector registers, add them next. */
7993 if (last_fp_reg
!= INVALID_REGNUM
&& crtl
->abi
->id () == ARM_PCS_SVE
)
7994 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
7995 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
7997 vec_safe_push (frame
.saved_fprs
, regno
);
7998 if (frame
.sve_save_and_probe
== INVALID_REGNUM
)
7999 frame
.sve_save_and_probe
= regno
;
8000 frame
.reg_offset
[regno
] = offset
;
8001 offset
+= vector_save_size
;
8004 /* OFFSET is now the offset of the hard frame pointer from the bottom
8005 of the callee save area. */
8006 auto below_hard_fp_saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
8007 bool saves_below_hard_fp_p
= maybe_ne (below_hard_fp_saved_regs_size
, 0);
8008 gcc_assert (!saves_below_hard_fp_p
8009 || (frame
.sve_save_and_probe
!= INVALID_REGNUM
8010 && known_eq (frame
.reg_offset
[frame
.sve_save_and_probe
],
8011 frame
.bytes_below_saved_regs
)));
8013 frame
.bytes_below_hard_fp
= offset
;
8014 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
8016 auto allocate_gpr_slot
= [&](unsigned int regno
)
8018 vec_safe_push (frame
.saved_gprs
, regno
);
8019 frame
.reg_offset
[regno
] = offset
;
8020 offset
+= UNITS_PER_WORD
;
8023 if (frame
.emit_frame_chain
)
8025 /* FP and LR are placed in the linkage record. */
8026 allocate_gpr_slot (R29_REGNUM
);
8027 allocate_gpr_slot (R30_REGNUM
);
8029 else if ((flag_stack_clash_protection
|| !frame
.is_scs_enabled
)
8030 && known_eq (frame
.reg_offset
[R30_REGNUM
], SLOT_REQUIRED
))
8031 /* Put the LR save slot first, since it makes a good choice of probe
8032 for stack clash purposes. The idea is that the link register usually
8033 has to be saved before a call anyway, and so we lose little by
8034 stopping it from being individually shrink-wrapped. */
8035 allocate_gpr_slot (R30_REGNUM
);
8037 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
8038 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
8039 allocate_gpr_slot (regno
);
8041 if (aarch64_need_old_pstate_sm ())
8043 frame
.old_svcr_offset
= offset
;
8044 offset
+= UNITS_PER_WORD
;
8047 /* If the current function changes the SVE vector length, ensure that the
8048 old value of the DWARF VG register is saved and available in the CFI,
8049 so that outer frames with VL-sized offsets can be processed correctly. */
8050 if (cfun
->machine
->call_switches_pstate_sm
8051 || aarch64_cfun_enables_pstate_sm ())
8053 frame
.reg_offset
[VG_REGNUM
] = offset
;
8054 offset
+= UNITS_PER_WORD
;
8057 poly_int64 max_int_offset
= offset
;
8058 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8059 bool has_align_gap
= maybe_ne (offset
, max_int_offset
);
8061 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
8062 if (known_eq (frame
.reg_offset
[regno
], SLOT_REQUIRED
))
8064 vec_safe_push (frame
.saved_fprs
, regno
);
8065 /* If there is an alignment gap between integer and fp callee-saves,
8066 allocate the last fp register to it if possible. */
8067 if (regno
== last_fp_reg
8069 && known_eq (vector_save_size
, 8)
8070 && multiple_p (offset
, 16))
8072 frame
.reg_offset
[regno
] = max_int_offset
;
8076 frame
.reg_offset
[regno
] = offset
;
8077 offset
+= vector_save_size
;
8080 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8081 auto saved_regs_size
= offset
- frame
.bytes_below_saved_regs
;
8083 array_slice
<unsigned int> push_regs
= (!vec_safe_is_empty (frame
.saved_gprs
)
8085 : frame
.saved_fprs
);
8086 if (!push_regs
.empty ()
8087 && known_eq (frame
.reg_offset
[push_regs
[0]], frame
.bytes_below_hard_fp
))
8089 frame
.hard_fp_save_and_probe
= push_regs
[0];
8090 frame
.wb_push_candidate1
= push_regs
[0];
8091 if (push_regs
.size () > 1)
8092 frame
.wb_push_candidate2
= push_regs
[1];
8095 /* With stack-clash, a register must be saved in non-leaf functions.
8096 The saving of the bottommost register counts as an implicit probe,
8097 which allows us to maintain the invariant described in the comment
8098 at expand_prologue. */
8099 gcc_assert (crtl
->is_leaf
|| maybe_ne (saved_regs_size
, 0));
8103 offset
+= get_frame_size ();
8104 offset
= aligned_upper_bound (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
8105 top_of_locals
= offset
;
8107 offset
+= frame
.saved_varargs_size
;
8108 gcc_assert (multiple_p (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
));
8109 frame
.frame_size
= offset
;
8111 frame
.bytes_above_hard_fp
= frame
.frame_size
- frame
.bytes_below_hard_fp
;
8112 gcc_assert (known_ge (top_of_locals
, 0));
8113 frame
.bytes_above_locals
= frame
.frame_size
- top_of_locals
;
8115 frame
.initial_adjust
= 0;
8116 frame
.final_adjust
= 0;
8117 frame
.callee_adjust
= 0;
8118 frame
.sve_callee_adjust
= 0;
8120 frame
.wb_pop_candidate1
= frame
.wb_push_candidate1
;
8121 frame
.wb_pop_candidate2
= frame
.wb_push_candidate2
;
8123 /* Shadow call stack only deals with functions where the LR is pushed
8124 onto the stack and without specifying the "no_sanitize" attribute
8125 with the argument "shadow-call-stack". */
8126 frame
.is_scs_enabled
8127 = (!crtl
->calls_eh_return
8128 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK
)
8129 && known_ge (frame
.reg_offset
[LR_REGNUM
], 0));
8131 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8132 restore x30, and we don't need to pop x30 again in the traditional
8133 way. Pop candidates record the registers that need to be popped
8135 if (frame
.is_scs_enabled
)
8137 if (frame
.wb_pop_candidate2
== R30_REGNUM
)
8138 frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8139 else if (frame
.wb_pop_candidate1
== R30_REGNUM
)
8140 frame
.wb_pop_candidate1
= INVALID_REGNUM
;
8143 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8144 256 to ensure that the offset meets the requirements of emit_move_insn.
8145 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8146 max_push_offset to 0, because no registers are popped at this time,
8147 so callee_adjust cannot be adjusted. */
8148 HOST_WIDE_INT max_push_offset
= 0;
8149 if (frame
.wb_pop_candidate1
!= INVALID_REGNUM
)
8151 if (frame
.wb_pop_candidate2
!= INVALID_REGNUM
)
8152 max_push_offset
= 512;
8154 max_push_offset
= 256;
8157 HOST_WIDE_INT const_size
, const_below_saved_regs
, const_above_fp
;
8158 HOST_WIDE_INT const_saved_regs_size
;
8159 if (known_eq (saved_regs_size
, 0))
8160 frame
.initial_adjust
= frame
.frame_size
;
8161 else if (frame
.frame_size
.is_constant (&const_size
)
8162 && const_size
< max_push_offset
8163 && known_eq (frame
.bytes_above_hard_fp
, const_size
))
8165 /* Simple, small frame with no data below the saved registers.
8167 stp reg1, reg2, [sp, -frame_size]!
8168 stp reg3, reg4, [sp, 16] */
8169 frame
.callee_adjust
= const_size
;
8171 else if (frame
.bytes_below_saved_regs
.is_constant (&const_below_saved_regs
)
8172 && saved_regs_size
.is_constant (&const_saved_regs_size
)
8173 && const_below_saved_regs
+ const_saved_regs_size
< 512
8174 /* We could handle this case even with data below the saved
8175 registers, provided that that data left us with valid offsets
8176 for all predicate and vector save slots. It's such a rare
8177 case that it hardly seems worth the effort though. */
8178 && (!saves_below_hard_fp_p
|| const_below_saved_regs
== 0)
8179 && !(cfun
->calls_alloca
8180 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8181 && const_above_fp
< max_push_offset
))
8183 /* Frame with small area below the saved registers:
8185 sub sp, sp, frame_size
8186 stp reg1, reg2, [sp, bytes_below_saved_regs]
8187 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8188 frame
.initial_adjust
= frame
.frame_size
;
8190 else if (saves_below_hard_fp_p
8191 && known_eq (saved_regs_size
, below_hard_fp_saved_regs_size
))
8193 /* Frame in which all saves are SVE saves:
8195 sub sp, sp, frame_size - bytes_below_saved_regs
8196 save SVE registers relative to SP
8197 sub sp, sp, bytes_below_saved_regs */
8198 frame
.initial_adjust
= frame
.frame_size
- frame
.bytes_below_saved_regs
;
8199 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8201 else if (frame
.wb_push_candidate1
!= INVALID_REGNUM
8202 && frame
.bytes_above_hard_fp
.is_constant (&const_above_fp
)
8203 && const_above_fp
< max_push_offset
)
8205 /* Frame with large area below the saved registers, or with SVE saves,
8206 but with a small area above:
8208 stp reg1, reg2, [sp, -hard_fp_offset]!
8209 stp reg3, reg4, [sp, 16]
8210 [sub sp, sp, below_hard_fp_saved_regs_size]
8211 [save SVE registers relative to SP]
8212 sub sp, sp, bytes_below_saved_regs */
8213 frame
.callee_adjust
= const_above_fp
;
8214 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8215 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8221 sub sp, sp, hard_fp_offset
8222 stp x29, x30, [sp, 0]
8224 stp reg3, reg4, [sp, 16]
8225 [sub sp, sp, below_hard_fp_saved_regs_size]
8226 [save SVE registers relative to SP]
8227 sub sp, sp, bytes_below_saved_regs */
8228 frame
.initial_adjust
= frame
.bytes_above_hard_fp
;
8229 frame
.sve_callee_adjust
= below_hard_fp_saved_regs_size
;
8230 frame
.final_adjust
= frame
.bytes_below_saved_regs
;
8233 /* The frame is allocated in pieces, with each non-final piece
8234 including a register save at offset 0 that acts as a probe for
8235 the following piece. In addition, the save of the bottommost register
8236 acts as a probe for callees and allocas. Roll back any probes that
8239 A probe isn't needed if it is associated with the final allocation
8240 (including callees and allocas) that happens before the epilogue is
8243 && !cfun
->calls_alloca
8244 && known_eq (frame
.final_adjust
, 0))
8246 if (maybe_ne (frame
.sve_callee_adjust
, 0))
8247 frame
.sve_save_and_probe
= INVALID_REGNUM
;
8249 frame
.hard_fp_save_and_probe
= INVALID_REGNUM
;
8252 /* Make sure the individual adjustments add up to the full frame size. */
8253 gcc_assert (known_eq (frame
.initial_adjust
8254 + frame
.callee_adjust
8255 + frame
.sve_callee_adjust
8256 + frame
.final_adjust
, frame
.frame_size
));
8258 if (frame
.callee_adjust
== 0)
8260 /* We've decided not to do a "real" push and pop. However,
8261 setting up the frame chain is treated as being essentially
8262 a multi-instruction push. */
8263 frame
.wb_pop_candidate1
= frame
.wb_pop_candidate2
= INVALID_REGNUM
;
8264 if (!frame
.emit_frame_chain
)
8265 frame
.wb_push_candidate1
= frame
.wb_push_candidate2
= INVALID_REGNUM
;
8268 frame
.laid_out
= true;
/* Return true if the register REGNO is saved on entry to
   the current function.  */

static bool
aarch64_register_saved_on_entry (int regno)
{
  return known_ge (cfun->machine->frame.reg_offset[regno], 0);
}
/* Push the register number REGNO of mode MODE to the stack with write-back
   adjusting the stack by ADJUSTMENT.  */

static void
aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
			   HOST_WIDE_INT adjustment)
{
  rtx base_rtx = stack_pointer_rtx;
  rtx insn, reg, mem;

  reg = gen_rtx_REG (mode, regno);
  mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
			    plus_constant (Pmode, base_rtx, -adjustment));
  mem = gen_frame_mem (mode, mem);

  insn = emit_move_insn (mem, reg);
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Generate and return an instruction to store the pair of registers
   REG and REG2 of mode MODE to location BASE with write-back adjusting
   the stack location BASE by ADJUSTMENT.  */

static rtx
aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			  HOST_WIDE_INT adjustment)
{
  rtx new_base = plus_constant (Pmode, base, -adjustment);
  rtx mem = gen_frame_mem (mode, new_base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (mem, reg),
				      gen_rtx_SET (mem2, reg2)));
}
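/* For example, for DImode registers, ADJUSTMENT == 32 and BASE == sp, the
   PARALLEL built above corresponds to the single instruction

	stp	x19, x20, [sp, -32]!

   (x19/x20 standing in for REG/REG2): one SET updates the base register
   and the other two SETs store the two registers at the pre-decremented
   address and at +8 from it.  */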
/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
   stack pointer by ADJUSTMENT.  */

static void
aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
{
  rtx_insn *insn;
  machine_mode mode = aarch64_reg_save_mode (regno1);

  if (regno2 == INVALID_REGNUM)
    return aarch64_pushwb_single_reg (mode, regno1, adjustment);

  rtx reg1 = gen_rtx_REG (mode, regno1);
  rtx reg2 = gen_rtx_REG (mode, regno2);

  insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
					      reg2, adjustment));
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
  RTX_FRAME_RELATED_P (insn) = 1;
}
/* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
   adjusting it by ADJUSTMENT afterwards.  */

static rtx
aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
			 HOST_WIDE_INT adjustment)
{
  rtx mem = gen_frame_mem (mode, base);
  rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
  rtx new_base = plus_constant (Pmode, base, adjustment);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (3,
				      gen_rtx_SET (base, new_base),
				      gen_rtx_SET (reg, mem),
				      gen_rtx_SET (reg2, mem2)));
}
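/* Symmetrically to the store case, for DImode and ADJUSTMENT == 32 the
   PARALLEL above matches a post-indexed load pair such as

	ldp	x19, x20, [sp], 32

   (register names purely illustrative): the two loads use the old value
   of BASE and the base register is then bumped by ADJUSTMENT.  */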
/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
   afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
   into CFI_OPS.  */

static void
aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
		  rtx *cfi_ops)
{
  machine_mode mode = aarch64_reg_save_mode (regno1);
  rtx reg1 = gen_rtx_REG (mode, regno1);

  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);

  if (regno2 == INVALID_REGNUM)
    {
      rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
      mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
      emit_move_insn (reg1, gen_frame_mem (mode, mem));
    }
  else
    {
      rtx reg2 = gen_rtx_REG (mode, regno2);
      *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
      emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
					  reg2, adjustment));
    }
}
/* Given an ldp/stp register operand mode MODE, return a suitable mode to use
   for a mem rtx representing the entire pair.  */

static machine_mode
aarch64_pair_mode_for_mode (machine_mode mode)
{
  if (known_eq (GET_MODE_SIZE (mode), 4))
    return V2x4QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 8))
    return V2x8QImode;
  else if (known_eq (GET_MODE_SIZE (mode), 16))
    return V2x16QImode;
  else
    gcc_unreachable ();
}

/* Given a base mem MEM with mode and address suitable for a single ldp/stp
   operand, return an rtx like MEM which instead represents the entire pair.  */

static rtx
aarch64_pair_mem_from_base (rtx mem)
{
  auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
  mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
  gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
  return mem;
}
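/* For instance, an 8-byte DImode operand is widened here to a 16-byte pair
   mode covering both halves of the LDP/STP, so that a single mem rtx
   describes the full 16 bytes the instruction accesses (the exact pair mode
   is whatever aarch64_pair_mode_for_mode chose above).  */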
/* Generate and return a store pair instruction to store REG1 and REG2
   into memory starting at BASE_MEM.  All three rtxes should have modes of the
   same size.  */

rtx
aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  return gen_rtx_SET (pair_mem,
		      gen_rtx_UNSPEC (GET_MODE (pair_mem),
				      gen_rtvec (2, reg1, reg2),
				      UNSPEC_STP));
}
/* Generate and return a load pair instruction to load a pair of
   registers starting at BASE_MEM into REG1 and REG2.  If CODE is
   UNKNOWN, all three rtxes should have modes of the same size.
   Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
   and REG{1,2} should be in DImode.  */

rtx
aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
{
  rtx pair_mem = aarch64_pair_mem_from_base (base_mem);

  const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
  if (any_extend_p)
    gcc_checking_assert (GET_MODE (base_mem) == SImode
			 && GET_MODE (reg1) == DImode
			 && GET_MODE (reg2) == DImode);
  else
    gcc_assert (code == UNKNOWN);

  rtx unspecs[2] = {
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
		    gen_rtvec (1, pair_mem),
		    UNSPEC_LDP_FST),
    gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
		    gen_rtvec (1, copy_rtx (pair_mem)),
		    UNSPEC_LDP_SND)
  };

  if (any_extend_p)
    for (int i = 0; i < 2; i++)
      unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);

  return gen_rtx_PARALLEL (VOIDmode,
			   gen_rtvec (2,
				      gen_rtx_SET (reg1, unspecs[0]),
				      gen_rtx_SET (reg2, unspecs[1])));
}
/* Return TRUE if return address signing should be enabled for the current
   function, otherwise return FALSE.  */

bool
aarch64_return_address_signing_enabled (void)
{
  /* This function should only be called after frame laid out.  */
  gcc_assert (cfun->machine->frame.laid_out);

  /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
     if its LR is pushed onto stack.  */
  return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
	  || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
}
/* Only used by the arm backend.  */
void aarch_bti_arch_check (void)
{}

/* Return TRUE if Branch Target Identification Mechanism is enabled.  */
bool
aarch_bti_enabled (void)
{
  return (aarch_enable_bti == 1);
}

/* Check if INSN is a BTI J insn.  */
bool
aarch_bti_j_insn_p (rtx_insn *insn)
{
  if (!insn || !INSN_P (insn))
    return false;

  rtx pat = PATTERN (insn);
  return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
}

/* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction.  */
bool
aarch_pac_insn_p (rtx x)
{
  if (!INSN_P (x))
    return false;

  subrtx_var_iterator::array_type array;
  FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
    {
      rtx sub = *iter;
      if (sub && GET_CODE (sub) == UNSPEC)
	{
	  int unspec_val = XINT (sub, 1);
	  switch (unspec_val)
	    {
	    case UNSPEC_PACIASP:
	    case UNSPEC_PACIBSP:
	      return true;

	    default:
	      return false;
	    }
	  iter.skip_subrtxes ();
	}
    }
  return false;
}

rtx
aarch_gen_bti_c (void)
{
  return gen_bti_c ();
}

rtx
aarch_gen_bti_j (void)
{
  return gen_bti_j ();
}
/* The caller is going to use ST1D or LD1D to save or restore an SVE
   register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
   the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:

     (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
	 or LD1D address

     (2) setting PRED to a valid predicate register for the ST1D or LD1D,
	 if the variable isn't already nonnull

   (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
   Handle this case using a temporary base register that is suitable for
   all offsets in that range.  Use ANCHOR_REG as this base register if it
   is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */

static inline void
aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
				     rtx &anchor_reg, poly_int64 &offset,
				     rtx &ptrue)
{
  if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
    {
      /* This is the maximum valid offset of the anchor from the base.
	 Lower values would be valid too.  */
      poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
      if (!anchor_reg)
	{
	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
				    gen_int_mode (anchor_offset, Pmode)));
	}
      base_rtx = anchor_reg;
      offset -= anchor_offset;
    }
  if (!ptrue)
    {
      int pred_reg = cfun->machine->frame.spare_pred_reg;
      emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
		      CONSTM1_RTX (VNx16BImode));
      ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
    }
}
/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
   is saved at BASE + OFFSET.  */

static void
aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
			    rtx base, poly_int64 offset)
{
  rtx mem = gen_frame_mem (GET_MODE (reg),
			   plus_constant (Pmode, base, offset));
  add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
8599 /* Emit code to save the callee-saved registers in REGS. Skip any
8600 write-back candidates if SKIP_WB is true, otherwise consider only
8601 write-back candidates.
8603 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8604 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8608 aarch64_save_callee_saves (poly_int64 bytes_below_sp
,
8609 array_slice
<unsigned int> regs
, bool skip_wb
,
8610 bool hard_fp_valid_p
)
8612 aarch64_frame
&frame
= cfun
->machine
->frame
;
8614 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
8616 auto skip_save_p
= [&](unsigned int regno
)
8618 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8621 if (skip_wb
== (regno
== frame
.wb_push_candidate1
8622 || regno
== frame
.wb_push_candidate2
))
8628 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8630 unsigned int regno
= regs
[i
];
8632 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8634 if (skip_save_p (regno
))
8637 machine_mode mode
= aarch64_reg_save_mode (regno
);
8638 rtx reg
= gen_rtx_REG (mode
, regno
);
8640 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
8641 if (regno
== VG_REGNUM
)
8643 move_src
= gen_rtx_REG (DImode
, IP0_REGNUM
);
8644 emit_move_insn (move_src
, gen_int_mode (aarch64_sve_vg
, DImode
));
8646 rtx base_rtx
= stack_pointer_rtx
;
8647 poly_int64 sp_offset
= offset
;
8649 HOST_WIDE_INT const_offset
;
8650 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8651 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
8653 else if (GP_REGNUM_P (REGNO (reg
))
8654 && (!offset
.is_constant (&const_offset
) || const_offset
>= 512))
8656 poly_int64 fp_offset
= frame
.bytes_below_hard_fp
- bytes_below_sp
;
8657 if (hard_fp_valid_p
)
8658 base_rtx
= hard_frame_pointer_rtx
;
8663 anchor_reg
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
8664 emit_insn (gen_add3_insn (anchor_reg
, base_rtx
,
8665 gen_int_mode (fp_offset
, Pmode
)));
8667 base_rtx
= anchor_reg
;
8669 offset
-= fp_offset
;
8671 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
8672 rtx cfi_mem
= gen_frame_mem (mode
, plus_constant (Pmode
,
8675 rtx cfi_set
= gen_rtx_SET (cfi_mem
, reg
);
8676 bool need_cfi_note_p
= (base_rtx
!= stack_pointer_rtx
);
8678 unsigned int regno2
;
8679 if (!aarch64_sve_mode_p (mode
)
8681 && i
+ 1 < regs
.size ()
8682 && (regno2
= regs
[i
+ 1], !skip_save_p (regno2
))
8683 && known_eq (GET_MODE_SIZE (mode
),
8684 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8686 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8688 offset
+= GET_MODE_SIZE (mode
);
8689 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
8692 = gen_frame_mem (mode
,
8693 plus_constant (Pmode
,
8695 sp_offset
+ GET_MODE_SIZE (mode
)));
8696 rtx cfi_set2
= gen_rtx_SET (cfi_mem2
, reg2
);
8698 /* The first part of a frame-related parallel insn is always
8699 assumed to be relevant to the frame calculations;
8700 subsequent parts, are only frame-related if
8701 explicitly marked. */
8702 if (aarch64_emit_cfi_for_reg_p (regno2
))
8703 RTX_FRAME_RELATED_P (cfi_set2
) = 1;
8705 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8706 representation of stp cannot be understood directly by
8708 rtx par
= gen_rtx_PARALLEL (VOIDmode
,
8709 gen_rtvec (2, cfi_set
, cfi_set2
));
8710 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, par
);
8717 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8719 insn
= emit_insn (gen_aarch64_pred_mov (mode
, mem
,
8721 need_cfi_note_p
= true;
8723 else if (aarch64_sve_mode_p (mode
))
8724 insn
= emit_insn (gen_rtx_SET (mem
, move_src
));
8726 insn
= emit_move_insn (mem
, move_src
);
8728 if (frame_related_p
&& (need_cfi_note_p
|| move_src
!= reg
))
8729 add_reg_note (insn
, REG_FRAME_RELATED_EXPR
, cfi_set
);
8732 RTX_FRAME_RELATED_P (insn
) = frame_related_p
;
8734 /* Emit a fake instruction to indicate that the VG save slot has
8735 been initialized. */
8736 if (regno
== VG_REGNUM
)
8737 emit_insn (gen_aarch64_old_vg_saved (move_src
, mem
));
8741 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8742 and any other registers that are handled separately. Write the appropriate
8743 REG_CFA_RESTORE notes into CFI_OPS.
8745 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8746 of the static frame. */
8749 aarch64_restore_callee_saves (poly_int64 bytes_below_sp
,
8750 array_slice
<unsigned int> regs
, rtx
*cfi_ops
)
8752 aarch64_frame
&frame
= cfun
->machine
->frame
;
8754 rtx anchor_reg
= NULL_RTX
, ptrue
= NULL_RTX
;
8756 auto skip_restore_p
= [&](unsigned int regno
)
8758 if (cfun
->machine
->reg_is_wrapped_separately
[regno
])
8761 if (regno
== frame
.wb_pop_candidate1
8762 || regno
== frame
.wb_pop_candidate2
)
8765 /* The shadow call stack code restores LR separately. */
8766 if (frame
.is_scs_enabled
&& regno
== LR_REGNUM
)
8772 for (unsigned int i
= 0; i
< regs
.size (); ++i
)
8774 unsigned int regno
= regs
[i
];
8775 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
8776 if (skip_restore_p (regno
))
8779 machine_mode mode
= aarch64_reg_save_mode (regno
);
8780 rtx reg
= gen_rtx_REG (mode
, regno
);
8781 offset
= frame
.reg_offset
[regno
] - bytes_below_sp
;
8782 rtx base_rtx
= stack_pointer_rtx
;
8783 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8784 aarch64_adjust_sve_callee_save_base (mode
, base_rtx
, anchor_reg
,
8786 rtx mem
= gen_frame_mem (mode
, plus_constant (Pmode
, base_rtx
, offset
));
8788 unsigned int regno2
;
8789 if (!aarch64_sve_mode_p (mode
)
8790 && i
+ 1 < regs
.size ()
8791 && (regno2
= regs
[i
+ 1], !skip_restore_p (regno2
))
8792 && known_eq (GET_MODE_SIZE (mode
),
8793 frame
.reg_offset
[regno2
] - frame
.reg_offset
[regno
]))
8795 rtx reg2
= gen_rtx_REG (mode
, regno2
);
8797 offset
+= GET_MODE_SIZE (mode
);
8798 emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
8800 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
8804 else if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8805 emit_insn (gen_aarch64_pred_mov (mode
, reg
, ptrue
, mem
));
8806 else if (aarch64_sve_mode_p (mode
))
8807 emit_insn (gen_rtx_SET (reg
, mem
));
8809 emit_move_insn (reg
, mem
);
8810 if (frame_related_p
)
8811 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
/* Return true if OFFSET is a signed 4-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -8, 7));
}

/* Return true if OFFSET is a signed 6-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -32, 31));
}

/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 63));
}

/* Return true if OFFSET is a signed 7-bit value multiplied by the size
   of mode MODE.  */

bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -64, 63));
}

/* Return true if OFFSET is a signed 9-bit value.  */

bool
aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
				       poly_int64 offset)
{
  HOST_WIDE_INT const_offset;
  return (offset.is_constant (&const_offset)
	  && IN_RANGE (const_offset, -256, 255));
}

/* Return true if OFFSET is a signed 9-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, -256, 255));
}

/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
   of mode MODE.  */

static inline bool
offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
{
  HOST_WIDE_INT multiple;
  return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
	  && IN_RANGE (multiple, 0, 4095));
}
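/* Worked example: for aarch64_offset_7bit_signed_scaled_p with an 8-byte
   mode, the accepted byte offsets are the multiples of 8 in
   [-64 * 8, 63 * 8] == [-512, 504], matching the LDP/STP immediate range;
   for offset_12bit_unsigned_scaled_p with a 4-byte mode they are the
   multiples of 4 in [0, 4095 * 4] == [0, 16380], matching the unsigned
   scaled LDR/STR offset range.  */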
8892 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8895 aarch64_get_separate_components (void)
8897 aarch64_frame
&frame
= cfun
->machine
->frame
;
8898 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8899 bitmap_clear (components
);
8901 /* The registers we need saved to the frame. */
8902 bool enables_pstate_sm
= aarch64_cfun_enables_pstate_sm ();
8903 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
8904 if (aarch64_register_saved_on_entry (regno
))
8906 /* Disallow shrink wrapping for registers that will be clobbered
8907 by an SMSTART SM in the prologue. */
8908 if (enables_pstate_sm
8909 && (FP_REGNUM_P (regno
) || PR_REGNUM_P (regno
)))
8912 /* Punt on saves and restores that use ST1D and LD1D. We could
8913 try to be smarter, but it would involve making sure that the
8914 spare predicate register itself is safe to use at the save
8915 and restore points. Also, when a frame pointer is being used,
8916 the slots are often out of reach of ST1D and LD1D anyway. */
8917 machine_mode mode
= aarch64_reg_save_mode (regno
);
8918 if (mode
== VNx2DImode
&& BYTES_BIG_ENDIAN
)
8921 poly_int64 offset
= frame
.reg_offset
[regno
];
8923 /* Get the offset relative to the register we'll use. */
8924 if (frame_pointer_needed
)
8925 offset
-= frame
.bytes_below_hard_fp
;
8927 /* Check that we can access the stack slot of the register with one
8928 direct load with no adjustments needed. */
8929 if (aarch64_sve_mode_p (mode
)
8930 ? offset_9bit_signed_scaled_p (mode
, offset
)
8931 : offset_12bit_unsigned_scaled_p (mode
, offset
))
8932 bitmap_set_bit (components
, regno
);
8935 /* Don't mess with the hard frame pointer. */
8936 if (frame_pointer_needed
)
8937 bitmap_clear_bit (components
, HARD_FRAME_POINTER_REGNUM
);
8939 /* If the spare predicate register used by big-endian SVE code
8940 is call-preserved, it must be saved in the main prologue
8941 before any saves that use it. */
8942 if (frame
.spare_pred_reg
!= INVALID_REGNUM
)
8943 bitmap_clear_bit (components
, frame
.spare_pred_reg
);
8945 unsigned reg1
= frame
.wb_push_candidate1
;
8946 unsigned reg2
= frame
.wb_push_candidate2
;
8947 /* If registers have been chosen to be stored/restored with
8948 writeback don't interfere with them to avoid having to output explicit
8949 stack adjustment instructions. */
8950 if (reg2
!= INVALID_REGNUM
)
8951 bitmap_clear_bit (components
, reg2
);
8952 if (reg1
!= INVALID_REGNUM
)
8953 bitmap_clear_bit (components
, reg1
);
8955 bitmap_clear_bit (components
, LR_REGNUM
);
8956 bitmap_clear_bit (components
, SP_REGNUM
);
8957 if (flag_stack_clash_protection
)
8959 if (frame
.sve_save_and_probe
!= INVALID_REGNUM
)
8960 bitmap_clear_bit (components
, frame
.sve_save_and_probe
);
8961 if (frame
.hard_fp_save_and_probe
!= INVALID_REGNUM
)
8962 bitmap_clear_bit (components
, frame
.hard_fp_save_and_probe
);
8965 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8967 bitmap_clear_bit (components
, VG_REGNUM
);
8972 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8975 aarch64_components_for_bb (basic_block bb
)
8977 bitmap in
= DF_LIVE_IN (bb
);
8978 bitmap gen
= &DF_LIVE_BB_INFO (bb
)->gen
;
8979 bitmap kill
= &DF_LIVE_BB_INFO (bb
)->kill
;
8981 sbitmap components
= sbitmap_alloc (LAST_SAVED_REGNUM
+ 1);
8982 bitmap_clear (components
);
8984 /* Clobbered registers don't generate values in any meaningful sense,
8985 since nothing after the clobber can rely on their value. And we can't
8986 say that partially-clobbered registers are unconditionally killed,
8987 because whether they're killed or not depends on the mode of the
8988 value they're holding. Thus partially call-clobbered registers
8989 appear in neither the kill set nor the gen set.
8991 Check manually for any calls that clobber more of a register than the
8992 current function can. */
8993 function_abi_aggregator callee_abis
;
8995 FOR_BB_INSNS (bb
, insn
)
8997 callee_abis
.note_callee_abi (insn_callee_abi (insn
));
8998 HARD_REG_SET extra_caller_saves
= callee_abis
.caller_save_regs (*crtl
->abi
);
9000 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9001 for (unsigned regno
= 0; regno
<= LAST_SAVED_REGNUM
; regno
++)
9002 if (!fixed_regs
[regno
]
9003 && !crtl
->abi
->clobbers_full_reg_p (regno
)
9004 && (TEST_HARD_REG_BIT (extra_caller_saves
, regno
)
9005 || bitmap_bit_p (in
, regno
)
9006 || bitmap_bit_p (gen
, regno
)
9007 || bitmap_bit_p (kill
, regno
)))
9009 bitmap_set_bit (components
, regno
);
9011 /* If there is a callee-save at an adjacent offset, add it too
9012 to increase the use of LDP/STP. */
9013 poly_int64 offset
= cfun
->machine
->frame
.reg_offset
[regno
];
9014 unsigned regno2
= multiple_p (offset
, 16) ? regno
+ 1 : regno
- 1;
9016 if (regno2
<= LAST_SAVED_REGNUM
)
9018 poly_int64 offset2
= cfun
->machine
->frame
.reg_offset
[regno2
];
9020 ? known_eq (offset
+ 8, offset2
)
9021 : multiple_p (offset2
, 16) && known_eq (offset2
+ 8, offset
))
9022 bitmap_set_bit (components
, regno2
);
/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
   Nothing to do for aarch64.  */

static void
aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
{
}

/* Return the next set bit in BMP from START onwards.  Return the total number
   of bits in BMP if no set bit is found at or after START.  */

static unsigned int
aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
{
  unsigned int nbits = SBITMAP_SIZE (bmp);
  if (start == nbits)
    return start;

  gcc_assert (start < nbits);
  for (unsigned int i = start; i < nbits; i++)
    if (bitmap_bit_p (bmp, i))
      return i;

  return nbits;
}
9055 /* Do the work for aarch64_emit_prologue_components and
9056 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9057 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9058 for these components or the epilogue sequence. That is, it determines
9059 whether we should emit stores or loads and what kind of CFA notes to attach
9060 to the insns. Otherwise the logic for the two sequences is very
9064 aarch64_process_components (sbitmap components
, bool prologue_p
)
9066 aarch64_frame
&frame
= cfun
->machine
->frame
;
9067 rtx ptr_reg
= gen_rtx_REG (Pmode
, frame_pointer_needed
9068 ? HARD_FRAME_POINTER_REGNUM
9069 : STACK_POINTER_REGNUM
);
9071 unsigned last_regno
= SBITMAP_SIZE (components
);
9072 unsigned regno
= aarch64_get_next_set_bit (components
, R0_REGNUM
);
9073 rtx_insn
*insn
= NULL
;
9075 while (regno
!= last_regno
)
9077 bool frame_related_p
= aarch64_emit_cfi_for_reg_p (regno
);
9078 machine_mode mode
= aarch64_reg_save_mode (regno
);
9080 rtx reg
= gen_rtx_REG (mode
, regno
);
9081 poly_int64 offset
= frame
.reg_offset
[regno
];
9082 if (frame_pointer_needed
)
9083 offset
-= frame
.bytes_below_hard_fp
;
9085 rtx addr
= plus_constant (Pmode
, ptr_reg
, offset
);
9086 rtx mem
= gen_frame_mem (mode
, addr
);
9088 rtx set
= prologue_p
? gen_rtx_SET (mem
, reg
) : gen_rtx_SET (reg
, mem
);
9089 unsigned regno2
= aarch64_get_next_set_bit (components
, regno
+ 1);
9090 /* No more registers to handle after REGNO.
9091 Emit a single save/restore and exit. */
9092 if (regno2
== last_regno
)
9094 insn
= emit_insn (set
);
9095 if (frame_related_p
)
9097 RTX_FRAME_RELATED_P (insn
) = 1;
9099 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9101 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9106 poly_int64 offset2
= frame
.reg_offset
[regno2
];
9107 /* The next register is not of the same class or its offset is not
9108 mergeable with the current one into a pair. */
9109 if (aarch64_sve_mode_p (mode
)
9110 || !satisfies_constraint_Ump (mem
)
9111 || GP_REGNUM_P (regno
) != GP_REGNUM_P (regno2
)
9112 || (crtl
->abi
->id () == ARM_PCS_SIMD
&& FP_REGNUM_P (regno
))
9113 || maybe_ne ((offset2
- frame
.reg_offset
[regno
]),
9114 GET_MODE_SIZE (mode
)))
9116 insn
= emit_insn (set
);
9117 if (frame_related_p
)
9119 RTX_FRAME_RELATED_P (insn
) = 1;
9121 add_reg_note (insn
, REG_CFA_OFFSET
, copy_rtx (set
));
9123 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9130 bool frame_related2_p
= aarch64_emit_cfi_for_reg_p (regno2
);
9132 /* REGNO2 can be saved/restored in a pair with REGNO. */
9133 rtx reg2
= gen_rtx_REG (mode
, regno2
);
9134 if (frame_pointer_needed
)
9135 offset2
-= frame
.bytes_below_hard_fp
;
9136 rtx addr2
= plus_constant (Pmode
, ptr_reg
, offset2
);
9137 rtx mem2
= gen_frame_mem (mode
, addr2
);
9138 rtx set2
= prologue_p
? gen_rtx_SET (mem2
, reg2
)
9139 : gen_rtx_SET (reg2
, mem2
);
9142 insn
= emit_insn (aarch64_gen_store_pair (mem
, reg
, reg2
));
9144 insn
= emit_insn (aarch64_gen_load_pair (reg
, reg2
, mem
));
9146 if (frame_related_p
|| frame_related2_p
)
9148 RTX_FRAME_RELATED_P (insn
) = 1;
9151 if (frame_related_p
)
9152 add_reg_note (insn
, REG_CFA_OFFSET
, set
);
9153 if (frame_related2_p
)
9154 add_reg_note (insn
, REG_CFA_OFFSET
, set2
);
9158 if (frame_related_p
)
9159 add_reg_note (insn
, REG_CFA_RESTORE
, reg
);
9160 if (frame_related2_p
)
9161 add_reg_note (insn
, REG_CFA_RESTORE
, reg2
);
9165 regno
= aarch64_get_next_set_bit (components
, regno2
+ 1);
/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */

static void
aarch64_emit_prologue_components (sbitmap components)
{
  aarch64_process_components (components, true);
}

/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */

static void
aarch64_emit_epilogue_components (sbitmap components)
{
  aarch64_process_components (components, false);
}

/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */

static void
aarch64_set_handled_components (sbitmap components)
{
  for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
    if (bitmap_bit_p (components, regno))
      cfun->machine->reg_is_wrapped_separately[regno] = true;
}
/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
   determine the probe offset for alloca.  */

static HOST_WIDE_INT
aarch64_stack_clash_protection_alloca_probe_range (void)
{
  return STACK_CLASH_CALLER_GUARD;
}

/* Emit a stack tie that acts as a scheduling barrier for all previous and
   subsequent memory accesses and that requires the stack pointer and REG
   to have their current values.  REG can be stack_pointer_rtx if no
   other register's value needs to be fixed.  */

static void
aarch64_emit_stack_tie (rtx reg)
{
  emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
}
9215 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9216 registers. If POLY_SIZE is not large enough to require a probe this function
9217 will only adjust the stack. When allocating the stack space
9218 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9219 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9220 the saved registers. If we are then we ensure that any allocation
9221 larger than the ABI defined buffer needs a probe so that the
9222 invariant of having a 1KB buffer is maintained.
9224 We emit barriers after each stack adjustment to prevent optimizations from
9225 breaking the invariant that we never drop the stack more than a page. This
9226 invariant is needed to make it easier to correctly handle asynchronous
9227 events, e.g. if we were to allow the stack to be dropped by more than a page
9228 and then have multiple probes up and we take a signal somewhere in between
9229 then the signal handler doesn't know the state of the stack and can make no
9230 assumptions about which pages have been probed.
9232 FORCE_ISA_MODE is AARCH64_ISA_MODE_SM_ON if any variable component of
9233 POLY_SIZE is measured relative to the SME vector length instead of the
9234 current prevailing vector length. It is 0 otherwise. */
9237 aarch64_allocate_and_probe_stack_space (rtx temp1
, rtx temp2
,
9238 poly_int64 poly_size
,
9239 aarch64_isa_mode force_isa_mode
,
9240 bool frame_related_p
,
9241 bool final_adjustment_p
)
9243 aarch64_frame
&frame
= cfun
->machine
->frame
;
9244 HOST_WIDE_INT guard_size
9245 = 1 << param_stack_clash_protection_guard_size
;
9246 HOST_WIDE_INT guard_used_by_caller
= STACK_CLASH_CALLER_GUARD
;
9247 HOST_WIDE_INT byte_sp_alignment
= STACK_BOUNDARY
/ BITS_PER_UNIT
;
9248 gcc_assert (multiple_p (poly_size
, byte_sp_alignment
));
9249 HOST_WIDE_INT min_probe_threshold
9250 = (final_adjustment_p
9251 ? guard_used_by_caller
+ byte_sp_alignment
9252 : guard_size
- guard_used_by_caller
);
9253 poly_int64 frame_size
= frame
.frame_size
;
9255 /* We should always have a positive probe threshold. */
9256 gcc_assert (min_probe_threshold
> 0);
9258 if (flag_stack_clash_protection
&& !final_adjustment_p
)
9260 poly_int64 initial_adjust
= frame
.initial_adjust
;
9261 poly_int64 sve_callee_adjust
= frame
.sve_callee_adjust
;
9262 poly_int64 final_adjust
= frame
.final_adjust
;
9264 if (known_eq (frame_size
, 0))
9266 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME
, false);
9268 else if (known_lt (initial_adjust
+ sve_callee_adjust
,
9269 guard_size
- guard_used_by_caller
)
9270 && known_lt (final_adjust
, guard_used_by_caller
))
9272 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME
, true);
9276 /* If SIZE is not large enough to require probing, just adjust the stack and
9278 if (known_lt (poly_size
, min_probe_threshold
)
9279 || !flag_stack_clash_protection
)
9281 aarch64_sub_sp (temp1
, temp2
, poly_size
, force_isa_mode
,
9287 /* Handle the SVE non-constant case first. */
9288 if (!poly_size
.is_constant (&size
))
9292 fprintf (dump_file
, "Stack clash SVE prologue: ");
9293 print_dec (poly_size
, dump_file
);
9294 fprintf (dump_file
, " bytes, dynamic probing will be required.\n");
9297 /* First calculate the amount of bytes we're actually spilling. */
9298 aarch64_add_offset (Pmode
, temp1
, CONST0_RTX (Pmode
),
9299 poly_size
, temp1
, temp2
, force_isa_mode
,
9302 rtx_insn
*insn
= get_last_insn ();
9304 if (frame_related_p
)
9306 /* This is done to provide unwinding information for the stack
9307 adjustments we're about to do, however to prevent the optimizers
9308 from removing the R11 move and leaving the CFA note (which would be
9309 very wrong) we tie the old and new stack pointer together.
9310 The tie will expand to nothing but the optimizers will not touch
9312 rtx stack_ptr_copy
= gen_rtx_REG (Pmode
, STACK_CLASH_SVE_CFA_REGNUM
);
9313 emit_move_insn (stack_ptr_copy
, stack_pointer_rtx
);
9314 aarch64_emit_stack_tie (stack_ptr_copy
);
9316 /* We want the CFA independent of the stack pointer for the
9317 duration of the loop. */
9318 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_ptr_copy
);
9319 RTX_FRAME_RELATED_P (insn
) = 1;
9322 rtx probe_const
= gen_int_mode (min_probe_threshold
, Pmode
);
9323 rtx guard_const
= gen_int_mode (guard_size
, Pmode
);
9325 insn
= emit_insn (gen_probe_sve_stack_clash (Pmode
, stack_pointer_rtx
,
9326 stack_pointer_rtx
, temp1
,
9327 probe_const
, guard_const
));
9329 /* Now reset the CFA register if needed. */
9330 if (frame_related_p
)
9332 add_reg_note (insn
, REG_CFA_DEF_CFA
,
9333 gen_rtx_PLUS (Pmode
, stack_pointer_rtx
,
9334 gen_int_mode (poly_size
, Pmode
)));
9335 RTX_FRAME_RELATED_P (insn
) = 1;
9343 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9344 " bytes, probing will be required.\n", size
);
9346 /* Round size to the nearest multiple of guard_size, and calculate the
9347 residual as the difference between the original size and the rounded
9349 HOST_WIDE_INT rounded_size
= ROUND_DOWN (size
					  , guard_size);
  HOST_WIDE_INT residual = size - rounded_size;

  /* We can handle a small number of allocations/probes inline.  Otherwise
     punt to a loop.  */
  if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
    {
      for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
	{
	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
      dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
    }
  else
    {
      /* Compute the ending address.  */
      aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
			  temp1, NULL, force_isa_mode, false, true);
      rtx_insn *insn = get_last_insn ();

      /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
	 final allocation, then we may have a frame pointer, in which
	 case it is the CFA, otherwise we need CFI notes.

	 We can determine which allocation we are doing by looking at
	 the value of FRAME_RELATED_P since the final allocations are not
	 frame related.  */
      if (frame_related_p)
	{
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, temp1, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      /* This allocates and probes the stack.  Note that this re-uses some of
	 the existing Ada stack protection code.  However we are guaranteed not
	 to enter the non loop or residual branches of that code.

	 The non-loop part won't be entered because if our allocation amount
	 doesn't require a loop, the case above would handle it.

	 The residual amount won't be entered because TEMP1 is a multiple of
	 the allocation size.  The residual will always be 0.  As such, the only
	 part we are actually using from that code is the loop setup.  The
	 actual probing is done in aarch64_output_probe_stack_range.  */
      insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
					       stack_pointer_rtx, temp1));

      /* Now reset the CFA register if needed.  */
      if (frame_related_p)
	{
	  add_reg_note (insn, REG_CFA_DEF_CFA,
			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}

      emit_insn (gen_blockage ());
      dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
    }

  /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
     be probed.  This maintains the requirement that each page is probed at
     least once.  For initial probing we probe only if the allocation is
     more than GUARD_SIZE - buffer, and below the saved registers we probe
     if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
     GUARD_SIZE.  This ensures that, for any allocation large enough to
     trigger a probe here, we emit at least one, and for allocations that are
     too small for this code to emit anything, the page would already have
     been probed by the saving of FP/LR either by this function or any
     callees.  If we don't have any callees then we won't have more stack
     adjustments and so are still safe.  */
  if (residual)
    {
      gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);

      /* If we're doing final adjustments, and we've done any full page
	 allocations then any residual needs to be probed.  */
      if (final_adjustment_p && rounded_size != 0)
	min_probe_threshold = 0;

      aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
      if (residual >= min_probe_threshold)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     "Stack clash AArch64 prologue residuals: "
		     HOST_WIDE_INT_PRINT_DEC
		     " bytes, probing will be required.\n", residual);

	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
					   guard_used_by_caller));
	  emit_insn (gen_blockage ());
	}
    }
}
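/* For illustration only, a standalone sketch (not part of the build) of how
   an allocation SIZE is split into the probed loop and the residual handled
   above.  The guard size and allocation below are hypothetical example
   values; the real ones come from this function's parameters.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long long guard_size = 64 * 1024;	/* Assumed probing interval.  */
  long long size = 200 * 1024 + 512;	/* Hypothetical allocation.  */
  long long rounded_size = size & -guard_size;
  long long residual = size - rounded_size;

  /* One probe per full guard-sized page, plus possibly one more for the
     residual, mirroring the structure of the code above.  */
  printf ("pages probed in loop: %lld\n", rounded_size / guard_size);
  printf ("residual bytes: %lld\n", residual);
  return 0;
}
#endif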
/* Implement TARGET_EXTRA_LIVE_ON_ENTRY.  */

static void
aarch64_extra_live_on_entry (bitmap regs)
{
  if (TARGET_ZA)
    {
      bitmap_set_bit (regs, LOWERING_REGNUM);
      bitmap_set_bit (regs, SME_STATE_REGNUM);
      bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
      bitmap_set_bit (regs, ZA_FREE_REGNUM);
      bitmap_set_bit (regs, ZA_SAVED_REGNUM);

      /* The only time ZA can't have live contents on entry is when
	 the function explicitly treats it as a pure output.  */
      auto za_flags = aarch64_cfun_shared_flags ("za");
      if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
	bitmap_set_bit (regs, ZA_REGNUM);

      /* Since ZT0 is call-clobbered, it is only live on input if
	 it is explicitly shared, and is not a pure output.  */
      auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
      if (zt0_flags
	  && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
	bitmap_set_bit (regs, ZT0_REGNUM);
    }
}
/* Return 1 if the register is used by the epilogue.  We need to say the
   return register is used, but only after epilogue generation is complete.
   Note that in the case of sibcalls, the values "used by the epilogue" are
   considered live at the start of the called function.  */

int
aarch64_epilogue_uses (int regno)
{
  if (epilogue_completed)
    {
      if (regno == LR_REGNUM)
	return 1;
    }
  if (regno == LOWERING_REGNUM && TARGET_ZA)
    return 1;
  if (regno == SME_STATE_REGNUM && TARGET_ZA)
    return 1;
  if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
    return 1;
  /* If the function shares SME state with its caller, ensure that that
     data is not in the lazy save buffer on exit.  */
  if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
    return 1;
  if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
    return 1;
  if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
    return 1;
  return 0;
}

/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */

static bool
aarch64_use_late_prologue_epilogue ()
{
  return aarch64_cfun_enables_pstate_sm ();
}
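/* For illustration only, a standalone sketch (not part of the build) of the
   "pure output" test used by aarch64_extra_live_on_entry above: the incoming
   contents of ZA only become irrelevant when the state is shared and marked
   as output-only.  The local flag values mirror the AARCH64_STATE_* bits but
   are redefined here so the sketch is self-contained.  */
#if 0
#include <stdbool.h>
#include <stdio.h>

enum { STATE_SHARED = 1u << 0, STATE_IN = 1u << 1, STATE_OUT = 1u << 2 };

static bool
za_live_on_entry (unsigned flags)
{
  /* Everything except a pure "out" sharing agreement keeps ZA live.  */
  return flags != (STATE_SHARED | STATE_OUT);
}

int
main (void)
{
  printf ("shared out-only: %d\n", za_live_on_entry (STATE_SHARED | STATE_OUT));
  printf ("shared in/out:   %d\n",
	  za_live_on_entry (STATE_SHARED | STATE_IN | STATE_OUT));
  return 0;
}
#endif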
/* The current function's frame has a save slot for the incoming state
   of SVCR.  Return a legitimate memory for the slot, based on the hard
   frame pointer.  */

static rtx
aarch64_old_svcr_mem ()
{
  gcc_assert (frame_pointer_needed
	      && known_ge (cfun->machine->frame.old_svcr_offset, 0));
  rtx base = hard_frame_pointer_rtx;
  poly_int64 offset = (0
		       /* hard fp -> bottom of frame.  */
		       - cfun->machine->frame.bytes_below_hard_fp
		       /* bottom of frame -> save slot.  */
		       + cfun->machine->frame.old_svcr_offset);
  return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
}

/* The current function's frame has a save slot for the incoming state
   of SVCR.  Load the slot into register REGNO and return the register.  */

static rtx
aarch64_read_old_svcr (unsigned int regno)
{
  rtx svcr = gen_rtx_REG (DImode, regno);
  emit_move_insn (svcr, aarch64_old_svcr_mem ());
  return svcr;
}

/* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
   load the incoming value of SVCR from its save slot into temporary
   register REGNO.  */

static rtx_insn *
aarch64_guard_switch_pstate_sm (unsigned int regno,
				aarch64_isa_mode local_mode)
{
  rtx old_svcr = aarch64_read_old_svcr (regno);
  return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
}
/* AArch64 stack frames generated by this compiler look like:

	+-------------------------------+
	|                               |
	|  incoming stack arguments     |
	|                               |
	+-------------------------------+
	|                               | <-- incoming stack pointer (aligned)
	|  callee-allocated save area   |
	|  for register varargs         |
	|                               |
	+-------------------------------+
	|  local variables (1)          | <-- frame_pointer_rtx
	|                               |
	+-------------------------------+
	|  padding (1)                  |
	+-------------------------------+
	|  callee-saved registers       |
	+-------------------------------+
	|  LR'                          |
	+-------------------------------+
	|  FP'                          |
	+-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
	|  SVE vector registers         |
	+-------------------------------+
	|  SVE predicate registers      |
	+-------------------------------+
	|  local variables (2)          |
	+-------------------------------+
	|  padding (2)                  |
	+-------------------------------+
	|  dynamic allocation           |
	+-------------------------------+
	|  padding                      |
	+-------------------------------+
	|  outgoing stack arguments     | <-- arg_pointer
	|                               |
	+-------------------------------+
	|                               | <-- stack_pointer_rtx (aligned)

   The regions marked (1) and (2) are mutually exclusive.  (2) is used
   when aarch64_save_regs_above_locals_p is true.

   Dynamic stack allocations via alloca() decrease stack_pointer_rtx
   but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged.

   By default for stack-clash we assume the guard is at least 64KB, but this
   value is configurable to either 4KB or 64KB.  We also force the guard size
   to be the same as the probing interval and both values are kept in sync.

   With those assumptions the callee can allocate up to 63KB (or 3KB depending
   on the guard size) of stack space without probing.

   When probing is needed, we emit a probe at the start of the prologue
   and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.

   We can also use register saves as probes.  These are stored in
   sve_save_and_probe and hard_fp_save_and_probe.

   For outgoing arguments we probe if the size is larger than 1KB, such that
   the ABI specified buffer is maintained for the next callee.

   The following registers are reserved during frame layout and should not be
   used for any other purpose:

   - r11: Used by stack clash protection when SVE is enabled, and also
	  as an anchor register when saving and restoring registers
   - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
   - r14 and r15: Used for speculation tracking.
   - r16(IP0), r17(IP1): Used by indirect tailcalls.
   - r30(LR), r29(FP): Used by standard frame layout.

   These registers must be avoided in frame layout related code unless the
   explicit intention is to interact with one of the features listed above.  */
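/* For illustration only, a standalone sketch (not part of the build) of the
   adjustment bookkeeping used by the prologue below: the total frame size is
   consumed by the four stack adjustments in order, and the bytes still below
   the stack pointer after each step must end up equal to the final
   adjustment, mirroring the assertion in aarch64_expand_prologue.  The
   numbers are hypothetical.  */
#if 0
#include <assert.h>

int
main (void)
{
  long long frame_size = 4096;
  long long initial_adjust = 2048;
  long long callee_adjust = 96;		/* Pushed with the frame record.  */
  long long sve_callee_adjust = 0;
  long long final_adjust = frame_size - initial_adjust - callee_adjust
			   - sve_callee_adjust;

  long long bytes_below_sp = frame_size - initial_adjust - callee_adjust;
  bytes_below_sp -= sve_callee_adjust;
  assert (bytes_below_sp == final_adjust);
  return 0;
}
#endif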
/* Generate the prologue instructions for entry into a function.
   Establish the stack frame by decreasing the stack pointer with a
   properly calculated size and, if necessary, create a frame record
   filled with the values of LR and previous frame pointer.  The
   current FP is also set up if it is in use.  */
void
aarch64_expand_prologue (void)
{
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 frame_size = frame.frame_size;
  poly_int64 initial_adjust = frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
  poly_int64 final_adjust = frame.final_adjust;
  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
  unsigned reg1 = frame.wb_push_candidate1;
  unsigned reg2 = frame.wb_push_candidate2;
  bool emit_frame_chain = frame.emit_frame_chain;
  rtx_insn *insn;
  aarch64_isa_mode force_isa_mode = 0;
  if (aarch64_cfun_enables_pstate_sm ())
    force_isa_mode = AARCH64_ISA_MODE_SM_ON;

  if (flag_stack_clash_protection
      && known_eq (callee_adjust, 0)
      && known_lt (frame.reg_offset[VG_REGNUM], 0))
    {
      /* Fold the SVE allocation into the initial allocation.
	 We don't do this in aarch64_layout_arg to avoid pessimizing
	 the epilogue code.  */
      initial_adjust += sve_callee_adjust;
      sve_callee_adjust = 0;
    }

  /* Sign return address for functions.  */
  if (aarch64_return_address_signing_enabled ())
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_paciasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_pacibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Push return address to shadow call stack.  */
  if (frame.is_scs_enabled)
    emit_insn (gen_scs_push ());

  if (flag_stack_usage_info)
    current_function_static_stack_size = constant_lower_bound (frame_size);

  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    {
      if (crtl->is_leaf && !cfun->calls_alloca)
	{
	  if (maybe_gt (frame_size, PROBE_INTERVAL)
	      && maybe_gt (frame_size, get_stack_check_protect ()))
	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
					    (frame_size
					     - get_stack_check_protect ()));
	}
      else if (maybe_gt (frame_size, 0))
	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
    }

  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);

  /* In theory we should never have both an initial adjustment
     and a callee save adjustment.  Verify that is the case since the
     code below does not handle it for -fstack-clash-protection.  */
  gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);

  /* Will only probe if the initial adjustment is larger than the guard
     less the amount of the guard reserved for use by the caller's
     outgoing args.  */
  aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
					  force_isa_mode, true, false);

  if (callee_adjust != 0)
    aarch64_push_regs (reg1, reg2, callee_adjust);

  /* The offset of the current SP from the bottom of the static frame.  */
  poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;

  if (emit_frame_chain)
    {
      /* The offset of the frame chain record (if any) from the current SP.  */
      poly_int64 chain_offset = (initial_adjust + callee_adjust
				 - frame.bytes_above_hard_fp);
      gcc_assert (known_ge (chain_offset, 0));

      gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
      if (callee_adjust == 0)
	aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
				   false, false);
      else
	gcc_assert (known_eq (chain_offset, 0));
      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
			  stack_pointer_rtx, chain_offset,
			  tmp1_rtx, tmp0_rtx, force_isa_mode,
			  frame_pointer_needed);
      if (frame_pointer_needed && !frame_size.is_constant ())
	{
	  /* Variable-sized frames need to describe the save slot
	     address using DW_CFA_expression rather than DW_CFA_offset.
	     This means that, without taking further action, the
	     locations of the registers that we've already saved would
	     remain based on the stack pointer even after we redefine
	     the CFA based on the frame pointer.  We therefore need new
	     DW_CFA_expressions to re-express the save slots with addresses
	     based on the frame pointer.  */
	  rtx_insn *insn = get_last_insn ();
	  gcc_assert (RTX_FRAME_RELATED_P (insn));

	  /* Add an explicit CFA definition if this was previously
	     implicit.  */
	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
	    {
	      rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
			    gen_rtx_SET (hard_frame_pointer_rtx, src));
	    }

	  /* Change the save slot expressions for the registers that
	     we've already saved.  */
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
				      hard_frame_pointer_rtx, UNITS_PER_WORD);
	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
				      hard_frame_pointer_rtx, 0);
	}
      aarch64_emit_stack_tie (hard_frame_pointer_rtx);
    }

  aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
			     emit_frame_chain);
  if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
    {
      unsigned int saved_regs[] = { VG_REGNUM };
      aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
				 emit_frame_chain);
    }
  if (maybe_ne (sve_callee_adjust, 0))
    {
      gcc_assert (!flag_stack_clash_protection
		  || known_eq (initial_adjust, 0)
		  /* The VG save isn't shrink-wrapped and so serves as
		     a probe of the initial allocation.  */
		  || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp));
      aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
					      sve_callee_adjust,
					      force_isa_mode,
					      !frame_pointer_needed, false);
      bytes_below_sp -= sve_callee_adjust;
    }
  aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
			     emit_frame_chain);
  aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
			     emit_frame_chain);

  /* We may need to probe the final adjustment if it is larger than the guard
     that is assumed by the callee.  */
  gcc_assert (known_eq (bytes_below_sp, final_adjust));
  aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
					  force_isa_mode,
					  !frame_pointer_needed, true);
  if (emit_frame_chain && maybe_ne (final_adjust, 0))
    aarch64_emit_stack_tie (hard_frame_pointer_rtx);

  /* Save the incoming value of PSTATE.SM, if required.  Code further
     down does this for locally-streaming functions.  */
  if (known_ge (frame.old_svcr_offset, 0)
      && !aarch64_cfun_enables_pstate_sm ())
    {
      rtx mem = aarch64_old_svcr_mem ();
      MEM_VOLATILE_P (mem) = 1;
      if (TARGET_SME)
	{
	  rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
	  emit_insn (gen_aarch64_read_svcr (reg));
	  emit_move_insn (mem, reg);
	}
      else
	{
	  rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
	  auto &args = crtl->args.info;
	  if (args.aapcs_ncrn > 0)
	    {
	      old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
	      emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
	    }
	  if (args.aapcs_ncrn > 1)
	    {
	      old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
	      emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
	    }
	  emit_insn (gen_aarch64_get_sme_state ());
	  emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
	  if (old_r0)
	    emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
	  if (old_r1)
	    emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
	}
    }

  /* Enable PSTATE.SM, if required.  */
  if (aarch64_cfun_enables_pstate_sm ())
    {
      rtx_insn *guard_label = nullptr;
      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
	{
	  /* The current function is streaming-compatible.  Save the
	     original state of PSTATE.SM.  */
	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
	  emit_insn (gen_aarch64_read_svcr (svcr));
	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
							AARCH64_ISA_MODE);
	}
      aarch64_sme_mode_switch_regs args_switch;
      auto &args = crtl->args.info;
      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
	{
	  rtx x = args.sme_mode_switch_args[i];
	  args_switch.add_reg (GET_MODE (x), REGNO (x));
	}
      args_switch.emit_prologue ();
      emit_insn (gen_aarch64_smstart_sm ());
      args_switch.emit_epilogue ();
      if (guard_label)
	emit_label (guard_label);
    }
}
/* Return TRUE if we can use a simple_return insn.

   This function checks whether the callee-saved stack is empty, which
   means no restore actions are needed.  The pro_and_epilogue pass will use
   this to check whether shrink-wrapping is feasible.  */

bool
aarch64_use_return_insn_p (void)
{
  if (!reload_completed)
    return false;

  if (crtl->profile)
    return false;

  return known_eq (cfun->machine->frame.frame_size, 0);
}
/* Generate the epilogue instructions for returning from a function.
   This is almost exactly the reverse of the prologue sequence, except
   that we need to insert barriers to avoid scheduling loads that read
   from a deallocated stack, and we optimize the unwind records by
   emitting them all together if possible.  */

void
aarch64_expand_epilogue (rtx_call_insn *sibcall)
{
  aarch64_frame &frame = cfun->machine->frame;
  poly_int64 initial_adjust = frame.initial_adjust;
  HOST_WIDE_INT callee_adjust = frame.callee_adjust;
  poly_int64 final_adjust = frame.final_adjust;
  poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
  poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
  unsigned reg1 = frame.wb_pop_candidate1;
  unsigned reg2 = frame.wb_pop_candidate2;
  rtx cfi_ops = NULL;
  rtx_insn *insn;
  /* A stack clash protection prologue may not have left EP0_REGNUM or
     EP1_REGNUM in a usable state.  The same is true for allocations
     with an SVE component, since we then need both temporary registers
     for each allocation.  For stack clash we are in a usable state if
     the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
  HOST_WIDE_INT guard_size
    = 1 << param_stack_clash_protection_guard_size;
  HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
  aarch64_isa_mode force_isa_mode = 0;
  if (aarch64_cfun_enables_pstate_sm ())
    force_isa_mode = AARCH64_ISA_MODE_SM_ON;

  /* We can re-use the registers when:

     (a) the deallocation amount is the same as the corresponding
	 allocation amount (which is false if we combine the initial
	 and SVE callee save allocations in the prologue); and

     (b) the allocation amount doesn't need a probe (which is false
	 if the amount is guard_size - guard_used_by_caller or greater).

     In such situations the register should remain live with the correct
     value.  */
  bool can_inherit_p = (initial_adjust.is_constant ()
			&& final_adjust.is_constant ()
			&& (!flag_stack_clash_protection
			    || (known_lt (initial_adjust,
					  guard_size - guard_used_by_caller)
				&& known_eq (sve_callee_adjust, 0))));

  /* We need to add a memory barrier to prevent reads from the deallocated
     stack.  */
  bool need_barrier_p
    = maybe_ne (get_frame_size ()
		+ frame.saved_varargs_size, 0);

  /* Reset PSTATE.SM, if required.  */
  if (aarch64_cfun_enables_pstate_sm ())
    {
      rtx_insn *guard_label = nullptr;
      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
						      AARCH64_ISA_MODE);
      aarch64_sme_mode_switch_regs return_switch;
      if (sibcall)
	return_switch.add_call_args (sibcall);
      else if (crtl->return_rtx && REG_P (crtl->return_rtx))
	return_switch.add_reg (GET_MODE (crtl->return_rtx),
			       REGNO (crtl->return_rtx));
      return_switch.emit_prologue ();
      emit_insn (gen_aarch64_smstop_sm ());
      return_switch.emit_epilogue ();
      if (guard_label)
	emit_label (guard_label);
    }

  /* Emit a barrier to prevent loads from a deallocated stack.  */
  if (maybe_gt (final_adjust, crtl->outgoing_args_size)
      || cfun->calls_alloca
      || crtl->calls_eh_return)
    {
      aarch64_emit_stack_tie (stack_pointer_rtx);
      need_barrier_p = false;
    }

  /* Restore the stack pointer from the frame pointer if it may not
     be the same as the stack pointer.  */
  rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
  rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
  if (frame_pointer_needed
      && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
    /* If writeback is used when restoring callee-saves, the CFA
       is restored on the instruction doing the writeback.  */
    aarch64_add_offset (Pmode, stack_pointer_rtx,
			hard_frame_pointer_rtx,
			-bytes_below_hard_fp + final_adjust,
			tmp1_rtx, tmp0_rtx, force_isa_mode,
			callee_adjust == 0);
  else
    /* The case where we need to re-use the register here is very rare, so
       avoid the complicated condition and just always emit a move if the
       immediate doesn't fit.  */
    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);

  /* Restore the vector registers before the predicate registers,
     so that we can use P4 as a temporary for big-endian SVE frames.  */
  aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
  aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
  if (maybe_ne (sve_callee_adjust, 0))
    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
		    force_isa_mode, true);

  /* When the shadow call stack is enabled, the scs_pop in the epilogue will
     restore x30, so we don't need to restore x30 again in the traditional
     way.  */
  aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
				frame.saved_gprs, &cfi_ops);

  if (need_barrier_p)
    aarch64_emit_stack_tie (stack_pointer_rtx);

  if (callee_adjust != 0)
    aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);

  /* If we have no register restore information, the CFA must have been
     defined in terms of the stack pointer since the end of the prologue.  */
  gcc_assert (cfi_ops || !frame_pointer_needed);

  if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
    {
      /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
      insn = get_last_insn ();
      rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
      REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
      RTX_FRAME_RELATED_P (insn) = 1;
      cfi_ops = NULL;
    }

  /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
     restrict the emit_move optimization to leaf functions.  */
  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
		  (!can_inherit_p || !crtl->is_leaf
		   || df_regs_ever_live_p (EP0_REGNUM)));

  if (cfi_ops)
    {
      /* Emit delayed restores and reset the CFA to be SP.  */
      insn = get_last_insn ();
      cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
      REG_NOTES (insn) = cfi_ops;
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Pop return address from shadow call stack.  */
  if (frame.is_scs_enabled)
    {
      machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
      rtx reg = gen_rtx_REG (mode, R30_REGNUM);

      insn = emit_insn (gen_scs_pop ());
      add_reg_note (insn, REG_CFA_RESTORE, reg);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return && !sibcall)
    {
      /* If the EH_RETURN_TAKEN_RTX flag is set then we need
	 to unwind the stack and jump to the handler, otherwise
	 skip this eh_return logic and continue with normal
	 return after the label.  We have already reset the CFA
	 to be SP; letting the CFA move during this adjustment
	 is just as correct as retaining the CFA from the body
	 of the function.  Therefore, do nothing special.  */
      rtx_code_label *label = gen_label_rtx ();
      rtx x = aarch64_gen_compare_zero_and_branch (EQ, EH_RETURN_TAKEN_RTX,
						   label);
      rtx jump = emit_jump_insn (x);
      JUMP_LABEL (jump) = label;
      LABEL_NUSES (label)++;
      emit_insn (gen_add2_insn (stack_pointer_rtx,
				EH_RETURN_STACKADJ_RTX));
      emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
      emit_barrier ();
      emit_label (label);
    }

  /* We prefer to emit the combined return/authenticate instruction RETAA,
     however there are two cases in which we must instead emit an explicit
     authentication instruction.

	1) Sibcalls don't return in a normal way, so if we're about to call one
	   we must authenticate.

	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
	   generating code for !TARGET_ARMV8_3 we can't use it and must
	   explicitly authenticate.
    */
  if (aarch64_return_address_signing_enabled ()
      && (sibcall || !TARGET_ARMV8_3))
    {
      switch (aarch64_ra_sign_key)
	{
	case AARCH64_KEY_A:
	  insn = emit_insn (gen_autiasp ());
	  break;
	case AARCH64_KEY_B:
	  insn = emit_insn (gen_autibsp ());
	  break;
	default:
	  gcc_unreachable ();
	}
      add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx);
      RTX_FRAME_RELATED_P (insn) = 1;
    }

  emit_use (gen_rtx_REG (DImode, LR_REGNUM));
  if (!sibcall)
    emit_jump_insn (ret_rtx);
}
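/* For illustration only, a standalone sketch (not part of the build) of the
   decision made at the end of the epilogue above: a combined RETAA/RETAB can
   only be used for a normal return on targets that provide it; otherwise an
   explicit AUTIASP/AUTIBSP must be emitted before the return or sibcall.  */
#if 0
#include <stdbool.h>

static bool
need_explicit_authentication (bool signing_enabled, bool sibcall,
			      bool have_armv8_3)
{
  return signing_enabled && (sibcall || !have_armv8_3);
}
#endif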
/* Output code to add DELTA to the first argument, and then jump
   to FUNCTION.  Used for C++ multiple inheritance.  */

static void
aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta,
			 HOST_WIDE_INT vcall_offset,
			 tree function)
{
  /* The this pointer is always in x0.  Note that this differs from
     Arm where the this pointer may be bumped to r1 if r0 is required
     to return a pointer to an aggregate.  On AArch64 a result value
     pointer will be in x8.  */
  int this_regno = R0_REGNUM;
  rtx this_rtx, temp0, temp1, addr, funexp;
  rtx_insn *insn;
  const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));

  if (aarch_bti_enabled ())
    emit_insn (gen_bti_c ());

  reload_completed = 1;
  emit_note (NOTE_INSN_PROLOGUE_END);

  this_rtx = gen_rtx_REG (Pmode, this_regno);
  temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
  temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);

  if (vcall_offset == 0)
    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
			0, false);
  else
    {
      gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);

      addr = this_rtx;
      if (delta != 0)
	{
	  if (delta >= -256 && delta < 256)
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
				temp1, temp0, 0, false);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp0,
			   gen_rtx_ZERO_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
	addr = plus_constant (Pmode, temp0, vcall_offset);
      else
	{
	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
					  Pmode);
	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
	}

      if (Pmode == ptr_mode)
	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
      else
	aarch64_emit_move (temp1,
			   gen_rtx_SIGN_EXTEND (Pmode,
						gen_rtx_MEM (ptr_mode, addr)));

      emit_insn (gen_add2_insn (this_rtx, temp1));
    }

  /* Generate a tail call to the target function.  */
  if (!TREE_USED (function))
    {
      assemble_external (function);
      TREE_USED (function) = 1;
    }
  funexp = XEXP (DECL_RTL (function), 0);
  funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
  auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
  auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
  rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
  insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
  SIBLING_CALL_P (insn) = 1;

  insn = get_insns ();
  shorten_branches (insn);

  assemble_start_function (thunk, fnname);
  final_start_function (insn, file, 1);
  final (insn, file, 1);
  final_end_function ();
  assemble_end_function (thunk, fnname);

  /* Stop pretending to be a post-reload pass.  */
  reload_completed = 0;
}
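/* For illustration only, a standalone sketch (not part of the build) of the
   pointer adjustment that the thunk generated above performs on its first
   argument: add DELTA, and if VCALL_OFFSET is nonzero also add the value
   loaded from that offset in the adjusted object's vtable.  */
#if 0
static void *
thunk_adjust (void *this_ptr, long delta, long vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* The first word of the adjusted object points to its vtable.  */
      char *vtable = *(char **) p;
      p += *(long *) (vtable + vcall_offset);
    }
  return p;
}
#endif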
/* Return true if X contains a reference to a thread-local symbol.  */

static bool
aarch64_tls_referenced_p (rtx x)
{
  if (!TARGET_HAVE_TLS)
    return false;
  subrtx_iterator::array_type array;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    {
      const_rtx x = *iter;
      if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
	return true;
      /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
	 TLS offsets, not real symbol references.  */
      if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
	iter.skip_subrtxes ();
    }
  return false;
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  */

static bool
aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
{
  if (GET_CODE (x) == HIGH)
    return true;

  /* There's no way to calculate VL-based values using relocations.  */
  subrtx_iterator::array_type array;
  HOST_WIDE_INT factor;
  FOR_EACH_SUBRTX (iter, array, x, ALL)
    if (GET_CODE (*iter) == CONST_POLY_INT
	|| aarch64_sme_vq_unspec_p (x, &factor))
      return true;

  poly_int64 offset;
  rtx base = strip_offset_and_salt (x, &offset);
  if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
    {
      /* We checked for POLY_INT_CST offsets above.  */
      if (aarch64_classify_symbol (base, offset.to_constant ())
	  != SYMBOL_FORCE_TO_MEM)
	return true;
      else
	/* Avoid generating a 64-bit relocation in ILP32; leave
	   to aarch64_expand_mov_immediate to handle it properly.  */
	return mode != ptr_mode;
    }

  return aarch64_tls_referenced_p (x);
}
/* Implement TARGET_CASE_VALUES_THRESHOLD.
   The expansion for a table switch is quite expensive due to the number
   of instructions, the table lookup and the hard-to-predict indirect jump.
   When optimizing for speed, and -O3 enabled, use the per-core tuning if
   set, otherwise use tables for >= 11 cases as a tradeoff between size and
   performance.  When optimizing for size, use 8 for smallest codesize.  */

static unsigned int
aarch64_case_values_threshold (void)
{
  /* Use the specified limit for the number of cases before using jump
     tables at higher optimization levels.  */
  if (optimize > 2
      && aarch64_tune_params.max_case_values != 0)
    return aarch64_tune_params.max_case_values;
  else
    return optimize_size ? 8 : 11;
}
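/* For illustration only, a standalone sketch (not part of the build) of the
   threshold selection above: with the defaults, a switch needs at least 11
   cases before a jump table is considered when optimizing for speed, and at
   least 8 when optimizing for size, unless the per-core tuning overrides it
   at -O3 and above.  */
#if 0
static unsigned int
example_case_values_threshold (int optimize_level, int optimize_size_p,
			       unsigned int tuned_max)
{
  if (optimize_level > 2 && tuned_max != 0)
    return tuned_max;
  return optimize_size_p ? 8 : 11;
}
#endif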
/* Return true if register REGNO is a valid index register.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_index_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }
  return GP_REGNUM_P (regno);
}

/* Return true if register REGNO is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

bool
aarch64_regno_ok_for_base_p (int regno, bool strict_p)
{
  if (!HARD_REGISTER_NUM_P (regno))
    {
      if (!strict_p)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  /* The fake registers will be eliminated to either the stack or
     hard frame pointer, both of which are usually valid base registers.
     Reload deals with the cases where the eliminated form isn't valid.  */
  return (GP_REGNUM_P (regno)
	  || regno == SP_REGNUM
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}

/* Return true if X is a valid base register for mode MODE.
   STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_base_register_rtx_p (rtx x, bool strict_p)
{
  if (!strict_p
      && SUBREG_P (x)
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
    x = SUBREG_REG (x);

  return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
}
/* Return true if address offset is a valid index.  If it is, fill in INFO
   appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */

static bool
aarch64_classify_index (struct aarch64_address_info *info, rtx x,
			machine_mode mode, bool strict_p)
{
  enum aarch64_address_type type;
  rtx index;
  int shift;

  /* (reg:P) */
  if ((REG_P (x) || SUBREG_P (x))
      && GET_MODE (x) == Pmode)
    {
      type = ADDRESS_REG_REG;
      index = x;
      shift = 0;
    }
  /* (sign_extend:DI (reg:SI)) */
  else if ((GET_CODE (x) == SIGN_EXTEND
	    || GET_CODE (x) == ZERO_EXTEND)
	   && GET_MODE (x) == DImode
	   && GET_MODE (XEXP (x, 0)) == SImode)
    {
      type = (GET_CODE (x) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (x, 0);
      shift = 0;
    }
  /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
	   && GET_MODE (XEXP (x, 0)) == DImode
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (x, 1));
    }
  /* (and:DI (mult:DI (reg:DI) (const_int scale))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == MULT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
      /* Avoid undefined code dealing with shift being -1.  */
      if (shift != -1
	  && INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (and:DI (ashift:DI (reg:DI) (const_int shift))
     (const_int 0xffffffff<<shift)) */
  else if (GET_CODE (x) == AND
	   && GET_MODE (x) == DImode
	   && GET_CODE (XEXP (x, 0)) == ASHIFT
	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_UXTW;
      index = XEXP (XEXP (x, 0), 0);
      shift = INTVAL (XEXP (XEXP (x, 0), 1));
      if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
	shift = -1;
    }
  /* (mult:P (reg:P) (const_int scale)) */
  else if (GET_CODE (x) == MULT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = exact_log2 (INTVAL (XEXP (x, 1)));
    }
  /* (ashift:P (reg:P) (const_int shift)) */
  else if (GET_CODE (x) == ASHIFT
	   && GET_MODE (x) == Pmode
	   && GET_MODE (XEXP (x, 0)) == Pmode
	   && CONST_INT_P (XEXP (x, 1)))
    {
      type = ADDRESS_REG_REG;
      index = XEXP (x, 0);
      shift = INTVAL (XEXP (x, 1));
    }
  else
    return false;

  if (!strict_p
      && SUBREG_P (index)
      && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
    index = SUBREG_REG (index);

  if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
    {
      if (type != ADDRESS_REG_REG
	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
	return false;
    }
  else
    {
      if (shift != 0
	  && !(IN_RANGE (shift, 1, 3)
	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
	return false;
    }

  if (REG_P (index)
      && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
    {
      info->type = type;
      info->offset = index;
      info->shift = shift;
      return true;
    }

  return false;
}
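/* For illustration only, a standalone sketch (not part of the build) of the
   shift check performed above for non-SVE modes: a scaled index is only
   accepted when the shift is between 1 and 3 and the implied scale matches
   the access size.  */
#if 0
#include <stdbool.h>

static bool
valid_scaled_index_shift (int shift, long long mode_size)
{
  if (shift == 0)
    return true;			/* Unscaled register index.  */
  return shift >= 1 && shift <= 3 && (1LL << shift) == mode_size;
}
#endif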
/* Return true if MODE is one of the modes for which we
   support LDP/STP operations.  */

static bool
aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
{
  return mode == SImode || mode == DImode
	 || mode == SFmode || mode == DFmode
	 || mode == SDmode || mode == DDmode
	 || (aarch64_vector_mode_supported_p (mode)
	     && (known_eq (GET_MODE_SIZE (mode), 8)
		 || known_eq (GET_MODE_SIZE (mode), 16)));
}

/* Return true if REGNO is a virtual pointer register, or an eliminable
   "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
   include stack_pointer or hard_frame_pointer.  */
static bool
virt_or_elim_regno_p (unsigned regno)
{
  return ((regno >= FIRST_VIRTUAL_REGISTER
	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
	  || regno == FRAME_POINTER_REGNUM
	  || regno == ARG_POINTER_REGNUM);
}
/* Return true if X is a valid address of type TYPE for machine mode MODE.
   If it is, fill in INFO appropriately.  STRICT_P is true if
   REG_OK_STRICT is in effect.  */

bool
aarch64_classify_address (struct aarch64_address_info *info,
			  rtx x, machine_mode mode, bool strict_p,
			  aarch64_addr_query_type type)
{
  enum rtx_code code = GET_CODE (x);
  rtx op0, op1;
  poly_int64 offset;

  HOST_WIDE_INT const_size;

  /* Whether a vector mode is partial doesn't affect address legitimacy.
     Partial vectors like VNx8QImode allow the same indexed addressing
     mode and MUL VL addressing mode as full vectors like VNx16QImode;
     in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  vec_flags &= ~VEC_PARTIAL;

  /* On BE, we use load/store pair for all large int mode load/stores.
     TI/TF/TDmode may also use a load/store pair.  */
  bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
  bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
			    || type == ADDR_QUERY_LDP_STP_N
			    || mode == TImode
			    || mode == TFmode
			    || mode == TDmode
			    || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
				&& advsimd_struct_p));
  /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
     corresponds to the actual size of the memory being loaded/stored and the
     mode of the corresponding addressing mode is half of that.  */
  if (type == ADDR_QUERY_LDP_STP_N)
    {
      if (known_eq (GET_MODE_SIZE (mode), 32))
	mode = V16QImode;
      else if (known_eq (GET_MODE_SIZE (mode), 16))
	mode = DFmode;
      else if (known_eq (GET_MODE_SIZE (mode), 8))
	mode = SFmode;
      else
	return false;

      /* This isn't really an Advanced SIMD struct mode, but a mode
	 used to represent the complete mem in a load/store pair.  */
      advsimd_struct_p = false;
    }

  bool allow_reg_index_p = (!load_store_pair_p
			    && ((vec_flags == 0
				 && known_lt (GET_MODE_SIZE (mode), 16))
				|| vec_flags == VEC_ADVSIMD
				|| vec_flags & VEC_SVE_DATA
				|| mode == VNx1TImode));

  /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
     The latter is not valid for SVE predicates, and that's rejected through
     allow_reg_index_p above.  */
  if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
      && (code != REG && code != PLUS))
    return false;

  /* On LE, for AdvSIMD, don't support anything other than POST_INC or
     REG addressing.  */
  if (advsimd_struct_p
      && TARGET_SIMD
      && !BYTES_BIG_ENDIAN
      && (code != POST_INC && code != REG))
    return false;

  gcc_checking_assert (GET_MODE (x) == VOIDmode
		       || SCALAR_INT_MODE_P (GET_MODE (x)));

  switch (code)
    {
    case REG:
    case SUBREG:
      info->type = ADDRESS_REG_IMM;
      info->base = x;
      info->offset = const0_rtx;
      info->const_offset = 0;
      return aarch64_base_register_rtx_p (x, strict_p);

    case PLUS:
      op0 = XEXP (x, 0);
      op1 = XEXP (x, 1);

      if (! strict_p
	  && REG_P (op0)
	  && virt_or_elim_regno_p (REGNO (op0))
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  return true;
	}

      if (maybe_ne (GET_MODE_SIZE (mode), 0)
	  && aarch64_base_register_rtx_p (op0, strict_p)
	  && poly_int_rtx_p (op1, &offset))
	{
	  info->type = ADDRESS_REG_IMM;
	  info->base = op0;
	  info->offset = op1;
	  info->const_offset = offset;

	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:   9-bit signed offset
	     We conservatively require an offset representable in either mode.
	     When performing the check for pairs of X registers i.e.  LDP/STP
	     pass down DImode since that is the natural size of the LDP/STP
	     instruction memory accesses.  */
	  if (mode == TImode || mode == TFmode || mode == TDmode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
			|| offset_12bit_unsigned_scaled_p (mode, offset)));

	  if (mode == V8DImode)
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (DImode,
							    offset + 48));

	  /* A 7bit offset check because OImode will emit a ldp/stp
	     instruction (only !TARGET_SIMD or big endian will get here).
	     For ldp/stp instructions, the offset is scaled for the size of a
	     single element of the pair.  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 16))
	    return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 32))
	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);

	  /* Three 9/12 bit offsets checks because CImode will emit three
	     ldr/str instructions (only !TARGET_SIMD or big endian will
	     get here).  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 24))
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (DImode,
							       offset + 16)
			|| offset_12bit_unsigned_scaled_p (DImode,
							   offset + 16)));
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 48))
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && (aarch64_offset_9bit_signed_unscaled_p (TImode,
							       offset + 32)
			|| offset_12bit_unsigned_scaled_p (TImode,
							   offset + 32)));

	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
	     instructions (only big endian will get here).  */
	  if (aarch64_advsimd_partial_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 32))
	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (DImode,
							    offset + 16));
	  if (aarch64_advsimd_full_struct_mode_p (mode)
	      && known_eq (GET_MODE_SIZE (mode), 64))
	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
		    && aarch64_offset_7bit_signed_scaled_p (TImode,
							    offset + 32));

	  /* Make "m" use the LD1 offset range for SVE data modes, so
	     that pre-RTL optimizers like ivopts will work to that
	     instead of the wider LDR/STR range.  */
	  if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
	    return (type == ADDR_QUERY_M
		    ? offset_4bit_signed_scaled_p (mode, offset)
		    : offset_9bit_signed_scaled_p (mode, offset));

	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
	    {
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_VECTOR);
	      return (type == ADDR_QUERY_M
		      ? offset_4bit_signed_scaled_p (mode, offset)
		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
							 end_offset)));
	    }

	  if (vec_flags == VEC_SVE_PRED)
	    return offset_9bit_signed_scaled_p (mode, offset);

	  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
	    {
	      poly_int64 end_offset = (offset
				       + GET_MODE_SIZE (mode)
				       - BYTES_PER_SVE_PRED);
	      return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
		      && offset_9bit_signed_scaled_p (VNx16BImode, offset));
	    }

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
		    || offset_12bit_unsigned_scaled_p (mode, offset));
	}

      if (allow_reg_index_p)
	{
	  /* Look for base + (scaled/extended) index register.  */
	  if (aarch64_base_register_rtx_p (op0, strict_p)
	      && aarch64_classify_index (info, op1, mode, strict_p))
	    {
	      info->base = op0;
	      return true;
	    }
	  if (aarch64_base_register_rtx_p (op1, strict_p)
	      && aarch64_classify_index (info, op0, mode, strict_p))
	    {
	      info->base = op1;
	      return true;
	    }
	}

      return false;

    case POST_INC:
    case POST_DEC:
    case PRE_INC:
    case PRE_DEC:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      info->offset = NULL_RTX;
      return aarch64_base_register_rtx_p (info->base, strict_p);

    case POST_MODIFY:
    case PRE_MODIFY:
      info->type = ADDRESS_REG_WB;
      info->base = XEXP (x, 0);
      if (GET_CODE (XEXP (x, 1)) == PLUS
	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  info->offset = XEXP (XEXP (x, 1), 1);
	  info->const_offset = offset;

	  /* TImode, TFmode and TDmode values are allowed in both pairs of X
	     registers and individual Q registers.  The available
	     address modes are:
	     X,X: 7-bit signed scaled offset
	     Q:   9-bit signed offset
	     We conservatively require an offset representable in either mode.
	   */
	  if (mode == TImode || mode == TFmode || mode == TDmode)
	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));

	  if (load_store_pair_p)
	    return ((known_eq (GET_MODE_SIZE (mode), 4)
		     || known_eq (GET_MODE_SIZE (mode), 8)
		     || known_eq (GET_MODE_SIZE (mode), 16))
		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
	  else
	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
	}
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      /* load literal: pc-relative constant pool entry.  Only supported
	 for SI mode or larger.  */
      info->type = ADDRESS_SYMBOLIC;

      if (!load_store_pair_p
	  && GET_MODE_SIZE (mode).is_constant (&const_size)
	  && const_size >= 4)
	{
	  poly_int64 offset;
	  rtx sym = strip_offset_and_salt (x, &offset);
	  return ((LABEL_REF_P (sym)
		   || (SYMBOL_REF_P (sym)
		       && CONSTANT_POOL_ADDRESS_P (sym)
		       && aarch64_pcrelative_literal_loads)));
	}
      return false;

    case LO_SUM:
      info->type = ADDRESS_LO_SUM;
      info->base = XEXP (x, 0);
      info->offset = XEXP (x, 1);
      if (allow_reg_index_p
	  && aarch64_base_register_rtx_p (info->base, strict_p))
	{
	  poly_int64 offset;
	  HOST_WIDE_INT const_offset;
	  rtx sym = strip_offset_and_salt (info->offset, &offset);
	  if (SYMBOL_REF_P (sym)
	      && offset.is_constant (&const_offset)
	      && (aarch64_classify_symbol (sym, const_offset)
		  == SYMBOL_SMALL_ABSOLUTE))
	    {
	      /* The symbol and offset must be aligned to the access size.  */
	      unsigned int align;

	      if (CONSTANT_POOL_ADDRESS_P (sym))
		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
		{
		  tree exp = SYMBOL_REF_DECL (sym);
		  align = TYPE_ALIGN (TREE_TYPE (exp));
		  align = aarch64_constant_alignment (exp, align);
		}
	      else if (SYMBOL_REF_DECL (sym))
		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
		       && SYMBOL_REF_BLOCK (sym) != NULL)
		align = SYMBOL_REF_BLOCK (sym)->alignment;
	      else
		align = BITS_PER_UNIT;

	      poly_int64 ref_size = GET_MODE_SIZE (mode);
	      if (known_eq (ref_size, 0))
		ref_size = GET_MODE_SIZE (DImode);

	      return (multiple_p (const_offset, ref_size)
		      && multiple_p (align / BITS_PER_UNIT, ref_size));
	    }
	}
      return false;

    default:
      return false;
    }
}
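/* For illustration only, a standalone sketch (not part of the build) of the
   TImode/TFmode/TDmode offset rule used above: the offset must fit the
   LDP/STP 7-bit signed scaled form and also either the 9-bit signed unscaled
   or the 12-bit unsigned scaled form, so that either a pair of X registers
   or a single Q register can be used.  The ranges below assume an 8-byte
   LDP/STP element and a 16-byte scaled LDR/STR element.  */
#if 0
#include <stdbool.h>

static bool
timode_offset_ok (long long offset)
{
  bool pair_ok = offset % 8 == 0 && offset >= -64 * 8 && offset <= 63 * 8;
  bool unscaled_ok = offset >= -256 && offset <= 255;
  bool scaled_ok = offset % 16 == 0 && offset >= 0 && offset <= 4095 * 16;
  return pair_ok && (unscaled_ok || scaled_ok);
}
#endif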
/* Return true if the address X is valid for a PRFM instruction.
   STRICT_P is true if we should do strict checking with
   aarch64_classify_address.  */

bool
aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
{
  struct aarch64_address_info addr;

  /* PRFM accepts the same addresses as DImode...  */
  bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
  if (!res)
    return false;

  /* ... except writeback forms.  */
  return addr.type != ADDRESS_REG_WB;
}

bool
aarch64_symbolic_address_p (rtx x)
{
  poly_int64 offset;

  x = strip_offset_and_salt (x, &offset);
  return SYMBOL_REF_P (x) || LABEL_REF_P (x);
}

/* Classify the base of symbolic expression X.  */

enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx x)
{
  rtx offset;

  split_const (x, &x, &offset);
  return aarch64_classify_symbol (x, INTVAL (offset));
}
/* Return TRUE if X is a legitimate address for accessing memory in
   mode MODE.  */
static bool
aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
				   code_helper = ERROR_MARK)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p);
}

/* Return TRUE if X is a legitimate address of type TYPE for accessing
   memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
bool
aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
			      aarch64_addr_query_type type)
{
  struct aarch64_address_info addr;

  return aarch64_classify_address (&addr, x, mode, strict_p, type);
}
/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */

static bool
aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
					 poly_int64 orig_offset,
					 machine_mode mode)
{
  HOST_WIDE_INT size;
  if (GET_MODE_SIZE (mode).is_constant (&size))
    {
      HOST_WIDE_INT const_offset, second_offset;

      /* A general SVE offset is A * VQ + B.  Remove the A component from
	 coefficient 0 in order to get the constant B.  */
      const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];

      /* Split an out-of-range address displacement into a base and
	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
	 range otherwise to increase opportunities for sharing the base
	 address of different sizes.  Unaligned accesses use the signed
	 9-bit range, TImode/TFmode/TDmode use the intersection of signed
	 scaled 7-bit and signed 9-bit offset.  */
      if (mode == TImode || mode == TFmode || mode == TDmode)
	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
      else if ((const_offset & (size - 1)) != 0)
	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
      else
	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

      if (second_offset == 0 || known_eq (orig_offset, second_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
  else
    {
      /* Get the mode we should use as the basis of the range.  For structure
	 modes this is the mode of one vector.  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      machine_mode step_mode
	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;

      /* Get the "mul vl" multiplier we'd like to use.  */
      HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
      HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
      if (vec_flags & VEC_SVE_DATA)
	/* LDR supports a 9-bit range, but the move patterns for
	   structure modes require all vectors to be in range of the
	   same base.  The simplest way of accommodating that while still
	   promoting reuse of anchor points between different modes is
	   to use an 8-bit range unconditionally.  */
	vnum = ((vnum + 128) & 255) - 128;
      else
	/* Predicates are only handled singly, so we might as well use
	   the full range.  */
	vnum = ((vnum + 256) & 511) - 256;
      if (vnum == 0)
	return false;

      /* Convert the "mul vl" multiplier into a byte offset.  */
      poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
      if (known_eq (second_offset, orig_offset))
	return false;

      /* Split the offset into second_offset and the rest.  */
      *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
      *offset2 = gen_int_mode (second_offset, Pmode);
      return true;
    }
}
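/* For illustration only, a worked example (not part of the build) of the
   constant splitting above for an aligned 4-byte access: the anchor keeps
   the low bits that the load/store can encode directly and the remainder is
   added to the base first.  The mask matches the one used above; the input
   displacement is made up.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long long size = 4;			/* Access size in bytes.  */
  long long const_offset = 0x12340;	/* Hypothetical displacement.  */
  long long second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);

  printf ("base += %#llx, mem offset %#llx\n",
	  const_offset - second_offset, second_offset);
  return 0;
}
#endif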
/* Return the binary representation of floating point constant VALUE in INTVAL.
   If the value cannot be converted, return false without setting INTVAL.
   The conversion is done in the given MODE.  */
bool
aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
{
  /* We make a general exception for 0.  */
  if (aarch64_float_const_zero_rtx_p (value))
    {
      *intval = 0;
      return true;
    }

  scalar_float_mode mode;
  if (!CONST_DOUBLE_P (value)
      || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
      || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
      /* Only support up to DF mode.  */
      || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
    return false;

  unsigned HOST_WIDE_INT ival = 0;

  long res[2];
  real_to_target (res,
		  CONST_DOUBLE_REAL_VALUE (value),
		  REAL_MODE_FORMAT (mode));

  if (mode == DFmode || mode == DDmode)
    {
      int order = BYTES_BIG_ENDIAN ? 1 : 0;
      ival = zext_hwi (res[order], 32);
      ival |= (zext_hwi (res[1 - order], 32) << 32);
    }
  else
    ival = zext_hwi (res[0], 32);

  *intval = ival;
  return true;
}
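/* For illustration only, the kind of reinterpretation performed above, done
   directly with standard C (not part of the build): the IEEE double 1.0 has
   the 64-bit pattern 0x3ff0000000000000, which is what the mov/movk cost
   check below operates on.  */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double d = 1.0;
  uint64_t bits;
  memcpy (&bits, &d, sizeof bits);
  printf ("%#llx\n", (unsigned long long) bits);  /* 0x3ff0000000000000 */
  return 0;
}
#endif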
/* Return TRUE if rtx X is an immediate constant that can be moved using a
   single MOV(+MOVK) followed by an FMOV.  */
bool
aarch64_float_const_rtx_p (rtx x)
{
  machine_mode mode = GET_MODE (x);
  if (mode == VOIDmode)
    return false;

  /* Determine whether it's cheaper to write float constants as
     mov/movk pairs over ldr/adrp pairs.  */
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode)
      && aarch64_reinterpret_float_as_int (x, &ival))
    {
      machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
      int num_instr = aarch64_internal_mov_immediate
			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
      return num_instr < 3;
    }

  return false;
}
/* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
   Floating Point).  */
bool
aarch64_float_const_zero_rtx_p (rtx x)
{
  /* 0.0 in Decimal Floating Point cannot be represented by #0 or
     zr as our callers expect, so no need to check the actual
     value if X is of Decimal Floating Point type.  */
  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
    return false;

  if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
    return !HONOR_SIGNED_ZEROS (GET_MODE (x));
  return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
}

/* Return true if X is any kind of constant zero rtx.  */

static bool
aarch64_const_zero_rtx_p (rtx x)
{
  return (x == CONST0_RTX (GET_MODE (x))
	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
}
/* Return TRUE if rtx X is an immediate constant that fits in a single
   MOVI immediate operation.  */
bool
aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
{
  if (!TARGET_SIMD)
    return false;

  machine_mode vmode;
  scalar_int_mode imode;
  unsigned HOST_WIDE_INT ival;

  if (CONST_DOUBLE_P (x)
      && SCALAR_FLOAT_MODE_P (mode))
    {
      if (!aarch64_reinterpret_float_as_int (x, &ival))
	return false;

      /* We make a general exception for 0.  */
      if (aarch64_float_const_zero_rtx_p (x))
	return true;

      imode = int_mode_for_mode (mode).require ();
    }
  else if (CONST_INT_P (x)
	   && is_a <scalar_int_mode> (mode, &imode))
    ival = INTVAL (x);
  else
    return false;

  /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
     a 128-bit vector mode.  */
  int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;

  vmode = aarch64_simd_container_mode (imode, width);
  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);

  return aarch64_simd_valid_immediate (v_op, NULL);
}
/* Return the fixed registers used for condition codes.  */

static bool
aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
{
  *p1 = CC_REGNUM;
  *p2 = INVALID_REGNUM;
  return true;
}
/* Return a fresh memory reference to the current function's TPIDR2 block,
   creating a block if necessary.  */

static rtx
aarch64_get_tpidr2_block ()
{
  if (!cfun->machine->tpidr2_block)
    /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
       boundary.  */
    cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
  return copy_rtx (cfun->machine->tpidr2_block);
}

/* Return a fresh register that points to the current function's
   TPIDR2 block, creating a block if necessary.  */

static rtx
aarch64_get_tpidr2_ptr ()
{
  rtx block = aarch64_get_tpidr2_block ();
  return force_reg (Pmode, XEXP (block, 0));
}
/* Emit instructions to allocate a ZA lazy save buffer and initialize the
   current function's TPIDR2 block.  */

static void
aarch64_init_tpidr2_block ()
{
  rtx block = aarch64_get_tpidr2_block ();

  /* The ZA save buffer is SVL.B*SVL.B bytes in size.  */
  rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
  rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
  rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
				     svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
  rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
						     BITS_PER_UNIT, -1, true);
  za_save_buffer = force_reg (Pmode, za_save_buffer);
  cfun->machine->za_save_buffer = za_save_buffer;

  /* The first word of the block points to the save buffer and the second
     word is the number of ZA slices to save.  */
  rtx block_0 = adjust_address (block, DImode, 0);
  emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));

  if (!memory_operand (block, V16QImode))
    block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
  emit_insn (gen_aarch64_setup_local_tpidr2 (block));
}
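/* For illustration only, a standalone model (not part of the build) of the
   data set up above: the TPIDR2 block is a 16-byte record whose first word
   points to the lazy save buffer and whose second word holds the number of
   ZA slices to save, and the buffer itself is SVL.B * SVL.B bytes (shown
   here for a hypothetical SVL.B of 32 bytes).  */
#if 0
#include <stdint.h>
#include <stdio.h>

struct tpidr2_block
{
  void *za_save_buffer;		/* First 64-bit word.  */
  uint64_t num_za_save_slices;	/* Second 64-bit word.  */
};

int
main (void)
{
  uint64_t svl_bytes = 32;	/* Hypothetical streaming vector length.  */
  printf ("lazy save buffer size: %llu bytes\n",
	  (unsigned long long) (svl_bytes * svl_bytes));
  printf ("block size: %zu bytes\n", sizeof (struct tpidr2_block));
  return 0;
}
#endif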
/* Restore the contents of ZA from the lazy save buffer, given that
   register TPIDR2_BLOCK points to the current function's TPIDR2 block.
   PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null.  */

void
aarch64_restore_za (rtx tpidr2_block)
{
  emit_insn (gen_aarch64_smstart_za ());
  if (REGNO (tpidr2_block) != R0_REGNUM)
    emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
  emit_insn (gen_aarch64_tpidr2_restore ());
}
/* Return the ZT0 save buffer, creating one if necessary.  */

static rtx
aarch64_get_zt0_save_buffer ()
{
  if (!cfun->machine->zt0_save_buffer)
    cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
  return cfun->machine->zt0_save_buffer;
}

/* Save ZT0 to the current function's save buffer.  */

static void
aarch64_save_zt0 ()
{
  rtx mem = aarch64_get_zt0_save_buffer ();
  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
  emit_insn (gen_aarch64_sme_str_zt0 (mem));
}

/* Restore ZT0 from the current function's save buffer.  FROM_LAZY_SAVE_P
   is true if the load is happening after a call to a private-ZA function,
   false if it can be treated as a normal load.  */

static void
aarch64_restore_zt0 (bool from_lazy_save_p)
{
  rtx mem = aarch64_get_zt0_save_buffer ();
  mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
  emit_insn (from_lazy_save_p
             ? gen_aarch64_restore_zt0 (mem)
             : gen_aarch64_sme_ldr_zt0 (mem));
}
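/* Descriptive note: ZT0 is a 512-bit register, which is why the save buffer
   above is a 64-byte (V8DImode) stack slot.  */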
/* Implement TARGET_START_CALL_ARGS.  */

static void
aarch64_start_call_args (cumulative_args_t ca_v)
{
  CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);

  if (!TARGET_SME && (ca->isa_mode & AARCH64_ISA_MODE_SM_ON))
    {
      error ("calling a streaming function requires the ISA extension %qs",
             "sme");
      inform (input_location, "you can enable %qs using the command-line"
              " option %<-march%>, or by using the %<target%>"
              " attribute or pragma", "sme");
    }

  if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
      && !aarch64_cfun_has_state ("za"))
    error ("call to a function that shares %qs state from a function"
           " that has no %qs state", "za", "za");
  else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
           && !aarch64_cfun_has_state ("zt0"))
    error ("call to a function that shares %qs state from a function"
           " that has no %qs state", "zt0", "zt0");
  else if (!TARGET_ZA && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    error ("call to a function that shares SME state from a function"
           " that has no SME state");

  /* If this is a call to a private ZA function, emit a marker to
     indicate where any necessary set-up code could be inserted.
     The code itself is inserted by the mode-switching pass.  */
  if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    emit_insn (gen_aarch64_start_private_za_call ());

  /* If this is a call to a shared-ZA function that doesn't share ZT0,
     save and restore ZT0 around the call.  */
  if (aarch64_cfun_has_state ("zt0")
      && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
      && ca->shared_zt0_flags == 0)
    aarch64_save_zt0 ();
}
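/* As an illustration only (user-level source, not part of this file): with
   SME attributes available, code along the lines of

     void callee () __arm_streaming;
     void caller () { callee (); }

   compiled without the "sme" extension would take the error path above,
   since the call requires a switch into streaming mode.  */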
/* This function is used by the call expanders of the machine description.
   RESULT is the register in which the result is returned.  It's NULL for
   "call" and "sibcall".
   MEM is the location of the function call.
   COOKIE is either:
   - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
   - a PARALLEL that contains such a const_int as its first element.
     The second element is a PARALLEL that lists all the argument
     registers that need to be saved and restored around a change
     in PSTATE.SM, or const0_rtx if no such switch is needed.
     The third and fourth elements are const_ints that contain the
     sharing flags for ZA and ZT0 respectively.
   SIBCALL indicates whether this is a normal call or a sibling call;
   a different pattern is generated accordingly.  */
11285 aarch64_expand_call (rtx result
, rtx mem
, rtx cookie
, bool sibcall
)
11287 rtx call
, callee
, tmp
;
11291 rtx callee_abi
= cookie
;
11292 rtx sme_mode_switch_args
= const0_rtx
;
11293 unsigned int shared_za_flags
= 0;
11294 unsigned int shared_zt0_flags
= 0;
11295 if (GET_CODE (cookie
) == PARALLEL
)
11297 callee_abi
= XVECEXP (cookie
, 0, 0);
11298 sme_mode_switch_args
= XVECEXP (cookie
, 0, 1);
11299 shared_za_flags
= INTVAL (XVECEXP (cookie
, 0, 2));
11300 shared_zt0_flags
= INTVAL (XVECEXP (cookie
, 0, 3));
11303 gcc_assert (CONST_INT_P (callee_abi
));
11304 auto callee_isa_mode
= aarch64_callee_isa_mode (callee_abi
);
11306 if (aarch64_cfun_has_state ("za")
11307 && (callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
)
11308 && !shared_za_flags
)
11310 sorry ("call to a function that shares state other than %qs"
11311 " from a function that has %qs state", "za", "za");
11312 inform (input_location
, "use %<__arm_preserves(\"za\")%> if the"
11313 " callee preserves ZA");
11316 gcc_assert (MEM_P (mem
));
11317 callee
= XEXP (mem
, 0);
11320 tmp
= legitimize_pe_coff_symbol (callee
, false);
11325 mode
= GET_MODE (callee
);
11326 gcc_assert (mode
== Pmode
);
11328 /* Decide if we should generate indirect calls by loading the
11329 address of the callee into a register before performing
11330 the branch-and-link. */
11331 if (SYMBOL_REF_P (callee
)
11332 ? (aarch64_is_long_call_p (callee
)
11333 || aarch64_is_noplt_call_p (callee
))
11335 XEXP (mem
, 0) = force_reg (mode
, callee
);
11337 /* Accumulate the return values, including state that is shared via
11339 auto_vec
<rtx
, 8> return_values
;
11342 if (GET_CODE (result
) == PARALLEL
)
11343 for (int i
= 0; i
< XVECLEN (result
, 0); ++i
)
11344 return_values
.safe_push (XVECEXP (result
, 0, i
));
11346 return_values
.safe_push (result
);
11348 unsigned int orig_num_return_values
= return_values
.length ();
11349 if (shared_za_flags
& AARCH64_STATE_OUT
)
11350 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11351 /* When calling private-ZA functions from functions with ZA state,
11352 we want to know whether the call committed a lazy save. */
11353 if (TARGET_ZA
&& !shared_za_flags
)
11354 return_values
.safe_push (gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11355 if (shared_zt0_flags
& AARCH64_STATE_OUT
)
11356 return_values
.safe_push (gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
11358 /* Create the new return value, if necessary. */
11359 if (orig_num_return_values
!= return_values
.length ())
11361 if (return_values
.length () == 1)
11362 result
= return_values
[0];
11365 for (rtx
&x
: return_values
)
11366 if (GET_CODE (x
) != EXPR_LIST
)
11367 x
= gen_rtx_EXPR_LIST (VOIDmode
, x
, const0_rtx
);
11368 rtvec v
= gen_rtvec_v (return_values
.length (),
11369 return_values
.address ());
11370 result
= gen_rtx_PARALLEL (VOIDmode
, v
);
11374 call
= gen_rtx_CALL (VOIDmode
, mem
, const0_rtx
);
11376 if (result
!= NULL_RTX
)
11377 call
= gen_rtx_SET (result
, call
);
11382 tmp
= gen_rtx_CLOBBER (VOIDmode
, gen_rtx_REG (Pmode
, LR_REGNUM
));
11384 callee_abi
= gen_rtx_UNSPEC (DImode
, gen_rtvec (1, callee_abi
),
11385 UNSPEC_CALLEE_ABI
);
11387 vec
= gen_rtvec (3, call
, callee_abi
, tmp
);
11388 call
= gen_rtx_PARALLEL (VOIDmode
, vec
);
11390 auto call_insn
= aarch64_emit_call_insn (call
);
11392 /* Check whether the call requires a change to PSTATE.SM. We can't
11393 emit the instructions to change PSTATE.SM yet, since they involve
11394 a change in vector length and a change in instruction set, which
11395 cannot be represented in RTL.
11397 For now, just record which registers will be clobbered and used
11398 by the changes to PSTATE.SM. */
11399 if (!sibcall
&& aarch64_call_switches_pstate_sm (callee_isa_mode
))
11401 aarch64_sme_mode_switch_regs args_switch
;
11402 if (sme_mode_switch_args
!= const0_rtx
)
11404 unsigned int num_args
= XVECLEN (sme_mode_switch_args
, 0);
11405 for (unsigned int i
= 0; i
< num_args
; ++i
)
11407 rtx x
= XVECEXP (sme_mode_switch_args
, 0, i
);
11408 args_switch
.add_reg (GET_MODE (x
), REGNO (x
));
11412 aarch64_sme_mode_switch_regs result_switch
;
11414 result_switch
.add_call_result (call_insn
);
11416 unsigned int num_gprs
= MAX (args_switch
.num_gprs (),
11417 result_switch
.num_gprs ());
11418 for (unsigned int i
= 0; i
< num_gprs
; ++i
)
11419 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11420 gen_rtx_REG (DImode
, args_switch
.FIRST_GPR
+ i
));
11422 for (int regno
= V0_REGNUM
; regno
< V0_REGNUM
+ 32; regno
+= 4)
11423 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11424 gen_rtx_REG (V4x16QImode
, regno
));
11426 for (int regno
= P0_REGNUM
; regno
< P0_REGNUM
+ 16; regno
+= 1)
11427 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11428 gen_rtx_REG (VNx16BImode
, regno
));
11430 /* Ensure that the VG save slot has been initialized. Also emit
11431 an instruction to model the effect of the temporary clobber
11432 of VG, so that the prologue/epilogue pass sees the need to
11433 save the old value. */
11434 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11435 gen_rtx_REG (DImode
, VG_REGNUM
));
11436 emit_insn_before (gen_aarch64_update_vg (), call_insn
);
11438 cfun
->machine
->call_switches_pstate_sm
= true;
11441 /* Add any ZA-related information.
11443 ZA_REGNUM represents the current function's ZA state, rather than
11444 the contents of the ZA register itself. We ensure that the function's
11445 ZA state is preserved by private-ZA call sequences, so the call itself
11446 does not use or clobber ZA_REGNUM. The same thing applies to
11450 /* The callee requires ZA to be active if the callee is shared-ZA,
11451 otherwise it requires ZA to be dormant or off. The state of ZA is
11452 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11453 and ZA_SAVED_REGNUM. */
11454 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11455 gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
11456 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11457 gen_rtx_REG (DImode
, TPIDR2_SETUP_REGNUM
));
11458 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11459 gen_rtx_REG (VNx16BImode
, ZA_SAVED_REGNUM
));
11461 /* Keep the aarch64_start/end_private_za_call markers live. */
11462 if (!(callee_isa_mode
& AARCH64_ISA_MODE_ZA_ON
))
11463 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11464 gen_rtx_REG (VNx16BImode
, LOWERING_REGNUM
));
11466 /* If the callee is a shared-ZA function, record whether it uses the
11467 current value of ZA and ZT0. */
11468 if (shared_za_flags
& AARCH64_STATE_IN
)
11469 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11470 gen_rtx_REG (VNx16BImode
, ZA_REGNUM
));
11472 if (shared_zt0_flags
& AARCH64_STATE_IN
)
11473 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn
),
11474 gen_rtx_REG (V8DImode
, ZT0_REGNUM
));
/* Implement TARGET_END_CALL_ARGS.  */

static void
aarch64_end_call_args (cumulative_args_t ca_v)
{
  CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);

  /* If this is a call to a private ZA function, emit a marker to
     indicate where any necessary restoration code could be inserted.
     The code itself is inserted by the mode-switching pass.  */
  if (TARGET_ZA && !(ca->isa_mode & AARCH64_ISA_MODE_ZA_ON))
    emit_insn (gen_aarch64_end_private_za_call ());

  /* If this is a call to a shared-ZA function that doesn't share ZT0,
     save and restore ZT0 around the call.  */
  if (aarch64_cfun_has_state ("zt0")
      && (ca->isa_mode & AARCH64_ISA_MODE_ZA_ON)
      && ca->shared_zt0_flags == 0)
    aarch64_restore_zt0 (false);
}
/* Emit call insn with PAT and do aarch64-specific handling.  */

rtx_call_insn *
aarch64_emit_call_insn (rtx pat)
{
  auto insn = emit_call_insn (pat);

  rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
  clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
  return as_a<rtx_call_insn *> (insn);
}
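/* Descriptive note: the x16/x17 clobbers above reflect that linker-inserted
   veneers (for example for out-of-range branches or PLT stubs) are allowed
   to use the intra-procedure-call registers IP0 and IP1, so no call may
   assume they survive.  */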
11513 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
11515 machine_mode mode_x
= GET_MODE (x
);
11516 rtx_code code_x
= GET_CODE (x
);
11518 /* All floating point compares return CCFP if it is an equality
11519 comparison, and CCFPE otherwise. */
11520 if (GET_MODE_CLASS (mode_x
) == MODE_FLOAT
)
11543 gcc_unreachable ();
11547 /* Equality comparisons of short modes against zero can be performed
11548 using the TST instruction with the appropriate bitmask. */
11549 if (y
== const0_rtx
&& (REG_P (x
) || SUBREG_P (x
))
11550 && (code
== EQ
|| code
== NE
)
11551 && (mode_x
== HImode
|| mode_x
== QImode
))
11554 /* Similarly, comparisons of zero_extends from shorter modes can
11555 be performed using an ANDS with an immediate mask. */
11556 if (y
== const0_rtx
&& code_x
== ZERO_EXTEND
11557 && (mode_x
== SImode
|| mode_x
== DImode
)
11558 && (GET_MODE (XEXP (x
, 0)) == HImode
|| GET_MODE (XEXP (x
, 0)) == QImode
)
11559 && (code
== EQ
|| code
== NE
))
11562 /* Zero extracts support equality comparisons. */
11563 if ((mode_x
== SImode
|| mode_x
== DImode
)
11565 && (code_x
== ZERO_EXTRACT
&& CONST_INT_P (XEXP (x
, 1))
11566 && CONST_INT_P (XEXP (x
, 2)))
11567 && (code
== EQ
|| code
== NE
))
11570 /* ANDS/BICS/TST support equality and all signed comparisons. */
11571 if ((mode_x
== SImode
|| mode_x
== DImode
)
11574 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
11575 || code
== GT
|| code
== LE
))
11578 /* ADDS/SUBS correctly set N and Z flags. */
11579 if ((mode_x
== SImode
|| mode_x
== DImode
)
11581 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
11582 && (code_x
== PLUS
|| code_x
== MINUS
|| code_x
== NEG
))
11585 /* A compare with a shifted operand. Because of canonicalization,
11586 the comparison will have to be swapped when we emit the assembly
11588 if ((mode_x
== SImode
|| mode_x
== DImode
)
11589 && (REG_P (y
) || SUBREG_P (y
) || y
== const0_rtx
)
11590 && (code_x
== ASHIFT
|| code_x
== ASHIFTRT
11591 || code_x
== LSHIFTRT
11592 || code_x
== ZERO_EXTEND
|| code_x
== SIGN_EXTEND
))
11595 /* Similarly for a negated operand, but we can only do this for
11597 if ((mode_x
== SImode
|| mode_x
== DImode
)
11598 && (REG_P (y
) || SUBREG_P (y
))
11599 && (code
== EQ
|| code
== NE
)
11603 /* A test for unsigned overflow from an addition. */
11604 if ((mode_x
== DImode
|| mode_x
== TImode
)
11605 && (code
== LTU
|| code
== GEU
)
11607 && rtx_equal_p (XEXP (x
, 0), y
))
11610 /* A test for unsigned overflow from an add with carry. */
11611 if ((mode_x
== DImode
|| mode_x
== TImode
)
11612 && (code
== LTU
|| code
== GEU
)
11614 && CONST_SCALAR_INT_P (y
)
11615 && (rtx_mode_t (y
, mode_x
)
11616 == (wi::shwi (1, mode_x
)
11617 << (GET_MODE_BITSIZE (mode_x
).to_constant () / 2))))
11620 /* A test for signed overflow. */
11621 if ((mode_x
== DImode
|| mode_x
== TImode
)
11624 && GET_CODE (y
) == SIGN_EXTEND
)
11627 /* For everything else, return CCmode. */
static int
aarch64_get_condition_code_1 (machine_mode, enum rtx_code);

int
aarch64_get_condition_code (rtx x)
{
  machine_mode mode = GET_MODE (XEXP (x, 0));
  enum rtx_code comp_code = GET_CODE (x);

  if (GET_MODE_CLASS (mode) != MODE_CC)
    mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
  return aarch64_get_condition_code_1 (mode, comp_code);
}
11646 aarch64_get_condition_code_1 (machine_mode mode
, enum rtx_code comp_code
)
11654 case GE
: return AARCH64_GE
;
11655 case GT
: return AARCH64_GT
;
11656 case LE
: return AARCH64_LS
;
11657 case LT
: return AARCH64_MI
;
11658 case NE
: return AARCH64_NE
;
11659 case EQ
: return AARCH64_EQ
;
11660 case ORDERED
: return AARCH64_VC
;
11661 case UNORDERED
: return AARCH64_VS
;
11662 case UNLT
: return AARCH64_LT
;
11663 case UNLE
: return AARCH64_LE
;
11664 case UNGT
: return AARCH64_HI
;
11665 case UNGE
: return AARCH64_PL
;
11666 default: return -1;
11673 case NE
: return AARCH64_NE
;
11674 case EQ
: return AARCH64_EQ
;
11675 case GE
: return AARCH64_GE
;
11676 case GT
: return AARCH64_GT
;
11677 case LE
: return AARCH64_LE
;
11678 case LT
: return AARCH64_LT
;
11679 case GEU
: return AARCH64_CS
;
11680 case GTU
: return AARCH64_HI
;
11681 case LEU
: return AARCH64_LS
;
11682 case LTU
: return AARCH64_CC
;
11683 default: return -1;
11690 case NE
: return AARCH64_NE
;
11691 case EQ
: return AARCH64_EQ
;
11692 case GE
: return AARCH64_LE
;
11693 case GT
: return AARCH64_LT
;
11694 case LE
: return AARCH64_GE
;
11695 case LT
: return AARCH64_GT
;
11696 case GEU
: return AARCH64_LS
;
11697 case GTU
: return AARCH64_CC
;
11698 case LEU
: return AARCH64_CS
;
11699 case LTU
: return AARCH64_HI
;
11700 default: return -1;
11707 case NE
: return AARCH64_NE
; /* = any */
11708 case EQ
: return AARCH64_EQ
; /* = none */
11709 case GE
: return AARCH64_PL
; /* = nfrst */
11710 case LT
: return AARCH64_MI
; /* = first */
11711 case GEU
: return AARCH64_CS
; /* = nlast */
11712 case GTU
: return AARCH64_HI
; /* = pmore */
11713 case LEU
: return AARCH64_LS
; /* = plast */
11714 case LTU
: return AARCH64_CC
; /* = last */
11715 default: return -1;
11722 case NE
: return AARCH64_NE
;
11723 case EQ
: return AARCH64_EQ
;
11724 case GE
: return AARCH64_PL
;
11725 case LT
: return AARCH64_MI
;
11726 case GT
: return AARCH64_GT
;
11727 case LE
: return AARCH64_LE
;
11728 default: return -1;
11735 case NE
: return AARCH64_NE
;
11736 case EQ
: return AARCH64_EQ
;
11737 case GE
: return AARCH64_PL
;
11738 case LT
: return AARCH64_MI
;
11739 default: return -1;
11746 case NE
: return AARCH64_NE
;
11747 case EQ
: return AARCH64_EQ
;
11748 default: return -1;
11755 case LTU
: return AARCH64_CS
;
11756 case GEU
: return AARCH64_CC
;
11757 default: return -1;
11764 case GEU
: return AARCH64_CS
;
11765 case LTU
: return AARCH64_CC
;
11766 default: return -1;
11773 case NE
: return AARCH64_VS
;
11774 case EQ
: return AARCH64_VC
;
11775 default: return -1;
/* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
   duplicate of such constants.  If so, store in RET_WI the wide_int
   representation of the constant paired with the inner mode of the vector mode
   or MODE for scalar X constants.  If MODE is not provided then TImode is
   used.  */

static bool
aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
                                        scalar_mode mode = TImode)
{
  rtx elt = unwrap_const_vec_duplicate (x);
  if (!CONST_SCALAR_INT_P (elt))
    return false;
  scalar_mode smode
    = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
  *ret_wi = rtx_mode_t (elt, smode);
  return true;
}
/* Return true if X is a scalar or a constant vector of integer
   immediates that represent the rounding constant used in the fixed-point
   arithmetic instructions.
   The accepted form of the constant is (1 << (C - 1)) where C is in the range
   [1, MODE_WIDTH/2].  */

bool
aarch64_rnd_imm_p (rtx x)
{
  wide_int rnd_cst;
  if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
    return false;
  int log2 = wi::exact_log2 (rnd_cst);
  if (log2 < 0)
    return false;
  return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
}
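/* For example (illustrative only): a rounding right shift by 3 on 32-bit
   elements uses the rounding constant 1 << (3 - 1) == 4.  A vector duplicate
   of 4 passes the test above because exact_log2 (4) == 2, which lies within
   [0, 32 / 2 - 1].  */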
/* Return true if RND is a constant vector of integer rounding constants
   corresponding to a constant vector of shifts, SHIFT.
   The relationship should be RND == (1 << (SHIFT - 1)).  */

bool
aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
{
  wide_int rnd_cst, shft_cst;
  if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
      || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
    return false;

  return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
}
static bool
aarch64_const_vec_all_same_in_range_p (rtx x,
                                       HOST_WIDE_INT minval,
                                       HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt)
          && CONST_INT_P (elt)
          && IN_RANGE (INTVAL (elt), minval, maxval));
}
/* Some constants can't be made using normal mov instructions in Advanced SIMD
   but we can still create them in various ways.  If the constant in VAL can
   be created using an alternate method, return true; in addition, if TARGET
   is not NULL, set it to the rtx for the generated sequence.  Otherwise
   return false.  */
11856 aarch64_maybe_generate_simd_constant (rtx target
, rtx val
, machine_mode mode
)
11859 auto smode
= GET_MODE_INNER (mode
);
11860 if (!aarch64_extract_vec_duplicate_wide_int (val
, &wval
, smode
))
11863 /* For Advanced SIMD we can create an integer with only the top bit set
11864 using fneg (0.0f). */
11868 && wi::only_sign_bit_p (wval
))
11873 /* Use the same base type as aarch64_gen_shareable_zero. */
11874 rtx zero
= CONST0_RTX (V4SImode
);
11875 emit_move_insn (lowpart_subreg (V4SImode
, target
, mode
), zero
);
11876 rtx neg
= lowpart_subreg (V2DImode
, target
, mode
);
11877 emit_insn (gen_aarch64_fnegv2di2 (neg
, copy_rtx (neg
)));
/* Check if the value in VAL with mode MODE can be created using special
   instruction sequences.  */

bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
{
  return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
}
bool
aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
{
  return aarch64_const_vec_all_same_in_range_p (x, val, val);
}
/* Return true if VEC is a constant in which every element is in the range
   [MINVAL, MAXVAL].  The elements do not need to have the same value.  */

static bool
aarch64_const_vec_all_in_range_p (rtx vec,
                                  HOST_WIDE_INT minval,
                                  HOST_WIDE_INT maxval)
{
  if (!CONST_VECTOR_P (vec)
      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    return false;

  int nunits;
  if (!CONST_VECTOR_STEPPED_P (vec))
    nunits = const_vector_encoded_nelts (vec);
  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    return false;

  for (int i = 0; i < nunits; i++)
    {
      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
      if (!CONST_INT_P (vec_elem)
          || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
        return false;
    }
  return true;
}
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[] =
{
  0,            /* EQ, Z == 1.  */
  AARCH64_CC_Z, /* NE, Z == 0.  */
  0,            /* CS, C == 1.  */
  AARCH64_CC_C, /* CC, C == 0.  */
  0,            /* MI, N == 1.  */
  AARCH64_CC_N, /* PL, N == 0.  */
  0,            /* VS, V == 1.  */
  AARCH64_CC_V, /* VC, V == 0.  */
  0,            /* HI, C == 1 && Z == 0.  */
  AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
  AARCH64_CC_V, /* GE, N == V.  */
  0,            /* LT, N != V.  */
  AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
  0,            /* LE, !(Z == 0 && N == V).  */
  0,            /* AL, any.  */
  0             /* NV, any.  */
};
/* Print floating-point vector immediate operand X to F, negating it
   first if NEGATE is true.  Return true on success, false if it isn't
   a constant we can handle.  */

static bool
aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
{
  rtx elt;

  if (!const_vec_duplicate_p (x, &elt))
    return false;

  REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
  if (negate)
    r = real_value_negate (&r);

  /* Handle the SVE single-bit immediates specially, since they have a
     fixed form in the assembly syntax.  */
  if (real_equal (&r, &dconst0))
    asm_fprintf (f, "0.0");
  else if (real_equal (&r, &dconst2))
    asm_fprintf (f, "2.0");
  else if (real_equal (&r, &dconst1))
    asm_fprintf (f, "1.0");
  else if (real_equal (&r, &dconsthalf))
    asm_fprintf (f, "0.5");
  else
    {
      const int buf_size = 20;
      char float_buf[buf_size] = {'\0'};
      real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
                                1, GET_MODE (elt));
      asm_fprintf (f, "%s", float_buf);
    }

  return true;
}
/* Return the equivalent letter for size.  */
static char
sizetochar (int size)
{
  switch (size)
    {
    case 64: return 'd';
    case 32: return 's';
    case 16: return 'h';
    case 8 : return 'b';
    default: gcc_unreachable ();
    }
}
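/* For instance, sizetochar (32) yields 's', the assembly suffix used for a
   32-bit element.  (Illustrative note only.)  */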
12005 /* Print operand X to file F in a target specific manner according to CODE.
12006 The acceptable formatting commands given by CODE are:
'c': An integer or symbol address without a preceding # sign.
12009 'C': Take the duplicated element in a vector constant
12010 and print it in hex.
12011 'D': Take the duplicated element in a vector constant
12012 and print it as an unsigned integer, in decimal.
12013 'e': Print the sign/zero-extend size as a character 8->b,
12014 16->h, 32->w. Can also be used for masks:
12015 0xff->b, 0xffff->h, 0xffffffff->w.
12016 'I': If the operand is a duplicated vector constant,
12017 replace it with the duplicated scalar. If the
12018 operand is then a floating-point constant, replace
12019 it with the integer bit representation. Print the
12020 transformed constant as a signed decimal number.
'p': Prints N such that 2^N == X (X must be a power of 2 and a const int).
12023 'P': Print the number of non-zero bits in X (a const_int).
'H': Print the higher numbered register of a pair (TImode) of regs.
12026 'm': Print a condition (eq, ne, etc).
12027 'M': Same as 'm', but invert condition.
12028 'N': Take the duplicated element in a vector constant
12029 and print the negative of it in decimal.
12030 'b/h/s/d/q': Print a scalar FP/SIMD register name.
12031 'Z': Same for SVE registers. ('z' was already taken.)
12032 Note that it is not necessary to use %Z for operands
12033 that have SVE modes. The convention is to use %Z
12034 only for non-SVE (or potentially non-SVE) modes.
12035 'S/T/U/V': Print a FP/SIMD register name for a register list.
12036 The register printed is the FP/SIMD register name
12037 of X + 0/1/2/3 for S/T/U/V.
12038 'R': Print a scalar Integer/FP/SIMD register name + 1.
12039 'X': Print bottom 16 bits of integer constant in hex.
12040 'w/x': Print a general register name or the zero register
12041 (32-bit or 64-bit).
12042 '0': Print a normal operand, if it's a general register,
12043 then we assume DImode.
12044 'k': Print NZCV for conditional compare instructions.
12045 'K': Print a predicate register as pn<N> rather than p<N>
12046 'A': Output address constant representing the first
argument of X, specifying a relocation offset if appropriate.
12049 'L': Output constant address specified by X
12050 with a relocation offset if appropriate.
12051 'G': Prints address of X, specifying a PC relative
12052 relocation mode if appropriate.
12053 'y': Output address of LDP or STP - this is used for
12054 some LDP/STPs which don't use a PARALLEL in their
12055 pattern (so the mode needs to be adjusted).
12056 'z': Output address of a typical LDP or STP. */
12059 aarch64_print_operand (FILE *f
, rtx x
, int code
)
12065 if (CONST_INT_P (x
))
12066 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
12070 rtx base
= strip_offset_and_salt (x
, &offset
);
12071 if (SYMBOL_REF_P (base
))
12072 output_addr_const (f
, x
);
12074 output_operand_lossage ("unsupported operand for code '%c'", code
);
12080 x
= unwrap_const_vec_duplicate (x
);
12081 if (!CONST_INT_P (x
))
12083 output_operand_lossage ("invalid operand for '%%%c'", code
);
12087 HOST_WIDE_INT val
= INTVAL (x
);
12088 if ((val
& ~7) == 8 || val
== 0xff)
12090 else if ((val
& ~7) == 16 || val
== 0xffff)
12092 else if ((val
& ~7) == 32 || val
== 0xffffffff)
12096 output_operand_lossage ("invalid operand for '%%%c'", code
);
12106 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
12108 output_operand_lossage ("invalid operand for '%%%c'", code
);
12112 asm_fprintf (f
, "%d", n
);
12117 if (!CONST_INT_P (x
))
12119 output_operand_lossage ("invalid operand for '%%%c'", code
);
12123 asm_fprintf (f
, "%u", popcount_hwi (INTVAL (x
)));
12127 if (x
== const0_rtx
)
12129 asm_fprintf (f
, "xzr");
12133 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
12135 output_operand_lossage ("invalid operand for '%%%c'", code
);
12139 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
12144 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
12145 if (CONST_INT_P (x
))
12146 asm_fprintf (f
, "%wd", INTVAL (x
));
12149 output_operand_lossage ("invalid operand for '%%%c'", code
);
12159 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
12160 if (x
== const_true_rtx
)
12167 if (!COMPARISON_P (x
))
12169 output_operand_lossage ("invalid operand for '%%%c'", code
);
12173 cond_code
= aarch64_get_condition_code (x
);
12174 gcc_assert (cond_code
>= 0);
12176 cond_code
= AARCH64_INVERSE_CONDITION_CODE (cond_code
);
12177 if (GET_MODE (XEXP (x
, 0)) == CC_NZCmode
)
12178 fputs (aarch64_sve_condition_codes
[cond_code
], f
);
12180 fputs (aarch64_condition_codes
[cond_code
], f
);
12185 if (!const_vec_duplicate_p (x
, &elt
))
12187 output_operand_lossage ("invalid vector constant");
12191 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12192 asm_fprintf (f
, "%wd", (HOST_WIDE_INT
) -UINTVAL (elt
));
12193 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12194 && aarch64_print_vector_float_operand (f
, x
, true))
12198 output_operand_lossage ("invalid vector constant");
12209 code
= TOLOWER (code
);
12210 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
12212 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
12215 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
12222 if (!REG_P (x
) || (!FP_REGNUM_P (REGNO (x
)) && !PR_REGNUM_P (REGNO (x
))))
12224 output_operand_lossage ("incompatible operand for '%%%c'", code
);
12227 if (PR_REGNUM_P (REGNO (x
)))
12228 asm_fprintf (f
, "p%d", REGNO (x
) - P0_REGNUM
+ (code
- 'S'));
12230 asm_fprintf (f
, "%c%d",
12231 aarch64_sve_data_mode_p (GET_MODE (x
)) ? 'z' : 'v',
12232 REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
12236 if (REG_P (x
) && FP_REGNUM_P (REGNO (x
))
12237 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x
))))
12238 asm_fprintf (f
, "d%d", REGNO (x
) - V0_REGNUM
+ 1);
12239 else if (REG_P (x
) && FP_REGNUM_P (REGNO (x
)))
12240 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
12241 else if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12242 asm_fprintf (f
, "x%d", REGNO (x
) - R0_REGNUM
+ 1);
12244 output_operand_lossage ("incompatible register operand for '%%%c'",
12249 if (!CONST_INT_P (x
))
12251 output_operand_lossage ("invalid operand for '%%%c'", code
);
12254 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
12259 /* Print a replicated constant in hex. */
12260 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12262 output_operand_lossage ("invalid operand for '%%%c'", code
);
12265 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12266 asm_fprintf (f
, "0x%wx", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12272 /* Print a replicated constant in decimal, treating it as
12274 if (!const_vec_duplicate_p (x
, &elt
) || !CONST_INT_P (elt
))
12276 output_operand_lossage ("invalid operand for '%%%c'", code
);
12279 scalar_mode inner_mode
= GET_MODE_INNER (GET_MODE (x
));
12280 asm_fprintf (f
, "%wd", UINTVAL (elt
) & GET_MODE_MASK (inner_mode
));
12286 if (aarch64_const_zero_rtx_p (x
))
12288 asm_fprintf (f
, "%czr", code
);
12292 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
12294 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
12298 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
12300 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
12309 output_operand_lossage ("missing operand");
12313 switch (GET_CODE (x
))
12317 asm_fprintf (f
, "%s", XSTR (x
, 0));
12321 if (aarch64_sve_data_mode_p (GET_MODE (x
)))
12323 if (REG_NREGS (x
) == 1)
12324 asm_fprintf (f
, "z%d", REGNO (x
) - V0_REGNUM
);
12328 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x
)));
12329 asm_fprintf (f
, "{z%d.%c - z%d.%c}",
12330 REGNO (x
) - V0_REGNUM
, suffix
,
12331 END_REGNO (x
) - V0_REGNUM
- 1, suffix
);
12335 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
12339 output_address (GET_MODE (x
), XEXP (x
, 0));
12344 output_addr_const (asm_out_file
, x
);
12348 asm_fprintf (f
, "%wd", INTVAL (x
));
12352 if (!VECTOR_MODE_P (GET_MODE (x
)))
12354 output_addr_const (asm_out_file
, x
);
12360 if (!const_vec_duplicate_p (x
, &elt
))
12362 output_operand_lossage ("invalid vector constant");
12366 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
12367 asm_fprintf (f
, "%wd", INTVAL (elt
));
12368 else if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_FLOAT
12369 && aarch64_print_vector_float_operand (f
, x
, false))
12373 output_operand_lossage ("invalid vector constant");
12379 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12380 be getting CONST_DOUBLEs holding integers. */
12381 gcc_assert (GET_MODE (x
) != VOIDmode
);
12382 if (aarch64_float_const_zero_rtx_p (x
))
12387 else if (aarch64_float_const_representable_p (x
))
12389 #define buf_size 20
12390 char float_buf
[buf_size
] = {'\0'};
12391 real_to_decimal_for_mode (float_buf
,
12392 CONST_DOUBLE_REAL_VALUE (x
),
12393 buf_size
, buf_size
,
12395 asm_fprintf (asm_out_file
, "%s", float_buf
);
12399 output_operand_lossage ("invalid constant");
12402 output_operand_lossage ("invalid operand");
12408 if (GET_CODE (x
) == HIGH
)
12411 switch (aarch64_classify_symbolic_expression (x
))
12413 case SYMBOL_SMALL_GOT_4G
:
12414 asm_fprintf (asm_out_file
, ":got:");
12417 case SYMBOL_SMALL_TLSGD
:
12418 asm_fprintf (asm_out_file
, ":tlsgd:");
12421 case SYMBOL_SMALL_TLSDESC
:
12422 asm_fprintf (asm_out_file
, ":tlsdesc:");
12425 case SYMBOL_SMALL_TLSIE
:
12426 asm_fprintf (asm_out_file
, ":gottprel:");
12429 case SYMBOL_TLSLE24
:
12430 asm_fprintf (asm_out_file
, ":tprel:");
12433 case SYMBOL_TINY_GOT
:
12434 gcc_unreachable ();
12440 output_addr_const (asm_out_file
, x
);
12444 switch (aarch64_classify_symbolic_expression (x
))
12446 case SYMBOL_SMALL_GOT_4G
:
12447 asm_fprintf (asm_out_file
, ":got_lo12:");
12450 case SYMBOL_SMALL_TLSGD
:
12451 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
12454 case SYMBOL_SMALL_TLSDESC
:
12455 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
12458 case SYMBOL_SMALL_TLSIE
:
12459 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
12462 case SYMBOL_TLSLE12
:
12463 asm_fprintf (asm_out_file
, ":tprel_lo12:");
12466 case SYMBOL_TLSLE24
:
12467 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
12470 case SYMBOL_TINY_GOT
:
12471 asm_fprintf (asm_out_file
, ":got:");
12474 case SYMBOL_TINY_TLSIE
:
12475 asm_fprintf (asm_out_file
, ":gottprel:");
12481 output_addr_const (asm_out_file
, x
);
12485 switch (aarch64_classify_symbolic_expression (x
))
12487 case SYMBOL_TLSLE24
:
12488 asm_fprintf (asm_out_file
, ":tprel_hi12:");
12493 output_addr_const (asm_out_file
, x
);
12498 HOST_WIDE_INT cond_code
;
12500 if (!CONST_INT_P (x
))
12502 output_operand_lossage ("invalid operand for '%%%c'", code
);
12506 cond_code
= INTVAL (x
);
12507 gcc_assert (cond_code
>= 0 && cond_code
<= AARCH64_NV
);
12508 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
]);
12513 if (!REG_P (x
) || !PR_REGNUM_P (REGNO (x
)))
12515 output_operand_lossage ("invalid operand for '%%%c'", code
);
12518 asm_fprintf (f
, "pn%d", REGNO (x
) - P0_REGNUM
);
12524 machine_mode mode
= GET_MODE (x
);
12528 && maybe_ne (GET_MODE_SIZE (mode
), 8)
12529 && maybe_ne (GET_MODE_SIZE (mode
), 16)
12530 && maybe_ne (GET_MODE_SIZE (mode
), 32)))
12532 output_operand_lossage ("invalid operand for '%%%c'", code
);
12536 if (!aarch64_print_address_internal (f
, mode
, XEXP (x
, 0),
12538 ? ADDR_QUERY_LDP_STP_N
12539 : ADDR_QUERY_LDP_STP
))
12540 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12545 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
12550 /* Print address 'x' of a memory access with mode 'mode'.
12551 'op' is the context required by aarch64_classify_address. It can either be
12552 MEM for a normal memory access or PARALLEL for LDP/STP. */
12554 aarch64_print_address_internal (FILE *f
, machine_mode mode
, rtx x
,
12555 aarch64_addr_query_type type
)
12557 struct aarch64_address_info addr
;
12558 unsigned int size
, vec_flags
;
12560 /* Check all addresses are Pmode - including ILP32. */
12561 if (GET_MODE (x
) != Pmode
12562 && (!CONST_INT_P (x
)
12563 || trunc_int_for_mode (INTVAL (x
), Pmode
) != INTVAL (x
)))
12565 output_operand_lossage ("invalid address mode");
12569 const bool load_store_pair_p
= (type
== ADDR_QUERY_LDP_STP
12570 || type
== ADDR_QUERY_LDP_STP_N
);
12572 if (aarch64_classify_address (&addr
, x
, mode
, true, type
))
12575 case ADDRESS_REG_IMM
:
12576 if (known_eq (addr
.const_offset
, 0))
12578 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
12582 vec_flags
= aarch64_classify_vector_mode (mode
);
12583 if ((vec_flags
& VEC_ANY_SVE
) && !load_store_pair_p
)
12586 = exact_div (addr
.const_offset
,
12587 aarch64_vl_bytes (mode
, vec_flags
)).to_constant ();
12588 asm_fprintf (f
, "[%s, #%wd, mul vl]",
12589 reg_names
[REGNO (addr
.base
)], vnum
);
12593 if (!CONST_INT_P (addr
.offset
))
12596 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
12597 INTVAL (addr
.offset
));
12600 case ADDRESS_REG_REG
:
12601 if (addr
.shift
== 0)
12602 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
12603 reg_names
[REGNO (addr
.offset
)]);
12605 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
12606 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
12609 case ADDRESS_REG_UXTW
:
12610 if (addr
.shift
== 0)
12611 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
12612 REGNO (addr
.offset
) - R0_REGNUM
);
12614 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
12615 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12618 case ADDRESS_REG_SXTW
:
12619 if (addr
.shift
== 0)
12620 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
12621 REGNO (addr
.offset
) - R0_REGNUM
);
12623 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
12624 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
12627 case ADDRESS_REG_WB
:
12628 /* Writeback is only supported for fixed-width modes. */
12629 size
= GET_MODE_SIZE (mode
).to_constant ();
12630 switch (GET_CODE (x
))
12633 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)], size
);
12636 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)], size
);
12639 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)], size
);
12642 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)], size
);
12645 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
12646 INTVAL (addr
.offset
));
12649 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
12650 INTVAL (addr
.offset
));
12657 case ADDRESS_LO_SUM
:
12658 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
12659 output_addr_const (f
, addr
.offset
);
12660 asm_fprintf (f
, "]");
12663 case ADDRESS_SYMBOLIC
:
12664 output_addr_const (f
, x
);
12671 /* Print address 'x' of a memory access with mode 'mode'. */
12673 aarch64_print_operand_address (FILE *f
, machine_mode mode
, rtx x
)
12675 if (!aarch64_print_address_internal (f
, mode
, x
, ADDR_QUERY_ANY
))
12676 output_addr_const (f
, x
);
12679 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12682 aarch64_output_addr_const_extra (FILE *file
, rtx x
)
12684 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SALT_ADDR
)
12686 output_addr_const (file
, XVECEXP (x
, 0, 0));
12693 aarch64_label_mentioned_p (rtx x
)
12698 if (LABEL_REF_P (x
))
12701 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12702 referencing instruction, but they are constant offsets, not
12704 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
12707 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
12708 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
12714 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
12715 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
12718 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
/* Implement REGNO_REG_CLASS.  */

enum reg_class
aarch64_regno_regclass (unsigned regno)
{
  if (W8_W11_REGNUM_P (regno))
    return W8_W11_REGS;

  if (W12_W15_REGNUM_P (regno))
    return W12_W15_REGS;

  if (STUB_REGNUM_P (regno))
    return STUB_REGS;

  if (GP_REGNUM_P (regno))
    return GENERAL_REGS;

  if (regno == SP_REGNUM)
    return STACK_REG;

  if (regno == FRAME_POINTER_REGNUM
      || regno == ARG_POINTER_REGNUM)
    return POINTER_REGS;

  if (FP_REGNUM_P (regno))
    return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
            : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);

  if (PR_REGNUM_P (regno))
    return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;

  if (regno == FPM_REGNUM)
    return MOVEABLE_SYSREGS;

  if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
    return FFR_REGS;

  if (FAKE_REGNUM_P (regno))
    return FAKE_REGS;

  return NO_REGS;
}
/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
   If OFFSET is out of range, return an offset of an anchor point
   that is in range.  Return 0 otherwise.  */

static HOST_WIDE_INT
aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
                       machine_mode mode)
{
  /* Does it look like we'll need a 16-byte load/store-pair operation?  */
  if (size > 16)
    return (offset + 0x400) & ~0x7f0;

  /* For offsets that aren't a multiple of the access size, the limit is
     -256...255.  */
  if (offset & (size - 1))
    {
      /* BLKmode typically uses LDP of X-registers.  */
      if (mode == BLKmode)
        return (offset + 512) & ~0x3ff;
      return (offset + 0x100) & ~0x1ff;
    }

  /* Small negative offsets are supported.  */
  if (IN_RANGE (offset, -256, 0))
    return 0;

  if (mode == TImode || mode == TFmode || mode == TDmode)
    return (offset + 0x100) & ~0x1ff;

  /* Use 12-bit offset by access size.  */
  return offset & (~0xfff * size);
}
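/* Worked example (arithmetic illustration only): for a 4-byte access with
   OFFSET == 0x12344, the offset is a multiple of the access size, so the
   final case applies and the anchor is 0x12344 & (~0xfff * 4) == 0x10000,
   leaving 0x2344 to be encoded as a scaled 12-bit LDR/STR offset.  */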
12802 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
12805 rtx tmp
= legitimize_pe_coff_symbol (x
, true);
12810 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12811 where mask is selected by alignment and size of the offset.
12812 We try to pick as large a range for the offset as possible to
12813 maximize the chance of a CSE. However, for aligned addresses
12814 we limit the range to 4k so that structures with different sized
12815 elements are likely to use the same base. We need to be careful
12816 not to split a CONST for some forms of address expression, otherwise
12817 it will generate sub-optimal code. */
12819 /* First split X + CONST (base, offset) into (base + X) + offset. */
12820 if (GET_CODE (x
) == PLUS
&& GET_CODE (XEXP (x
, 1)) == CONST
)
12823 rtx base
= strip_offset (XEXP (x
, 1), &offset
);
12825 base
= expand_binop (Pmode
, add_optab
, base
, XEXP (x
, 0),
12826 NULL_RTX
, true, OPTAB_DIRECT
);
12827 x
= plus_constant (Pmode
, base
, offset
);
12830 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
12832 rtx base
= XEXP (x
, 0);
12833 rtx offset_rtx
= XEXP (x
, 1);
12834 HOST_WIDE_INT offset
= INTVAL (offset_rtx
);
12836 if (GET_CODE (base
) == PLUS
)
12838 rtx op0
= XEXP (base
, 0);
12839 rtx op1
= XEXP (base
, 1);
12841 /* Force any scaling into a temp for CSE. */
12842 op0
= force_reg (Pmode
, op0
);
12843 op1
= force_reg (Pmode
, op1
);
12845 /* Let the pointer register be in op0. */
12846 if (REG_POINTER (op1
))
12847 std::swap (op0
, op1
);
12849 /* If the pointer is virtual or frame related, then we know that
12850 virtual register instantiation or register elimination is going
12851 to apply a second constant. We want the two constants folded
12852 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12853 if (virt_or_elim_regno_p (REGNO (op0
)))
12855 base
= expand_binop (Pmode
, add_optab
, op0
, offset_rtx
,
12856 NULL_RTX
, true, OPTAB_DIRECT
);
12857 return gen_rtx_PLUS (Pmode
, base
, op1
);
12860 /* Otherwise, in order to encourage CSE (and thence loop strength
12861 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12862 base
= expand_binop (Pmode
, add_optab
, op0
, op1
,
12863 NULL_RTX
, true, OPTAB_DIRECT
);
12864 x
= gen_rtx_PLUS (Pmode
, base
, offset_rtx
);
12867 HOST_WIDE_INT size
;
12868 if (GET_MODE_SIZE (mode
).is_constant (&size
))
12870 HOST_WIDE_INT base_offset
= aarch64_anchor_offset (offset
, size
,
12872 if (base_offset
!= 0)
12874 base
= plus_constant (Pmode
, base
, base_offset
);
12875 base
= force_operand (base
, NULL_RTX
);
12876 return plus_constant (Pmode
, base
, offset
- base_offset
);
12885 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
12886 reg_class_t rclass
,
12888 secondary_reload_info
*sri
)
12890 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12891 LDR and STR. See the comment at the head of aarch64-sve.md for
12892 more details about the big-endian handling. */
12893 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
12894 if (reg_class_subset_p (rclass
, FP_REGS
)
12895 && !((REG_P (x
) && HARD_REGISTER_P (x
))
12896 || aarch64_simd_valid_immediate (x
, NULL
))
12897 && mode
!= VNx16QImode
12898 && (vec_flags
& VEC_SVE_DATA
)
12899 && ((vec_flags
& VEC_PARTIAL
) || BYTES_BIG_ENDIAN
))
12901 sri
->icode
= CODE_FOR_aarch64_sve_reload_mem
;
12905 /* If we have to disable direct literal pool loads and stores because the
12906 function is too big, then we need a scratch register. */
12907 if (MEM_P (x
) && SYMBOL_REF_P (x
) && CONSTANT_POOL_ADDRESS_P (x
)
12908 && (SCALAR_FLOAT_MODE_P (GET_MODE (x
))
12909 || targetm
.vector_mode_supported_p (GET_MODE (x
)))
12910 && !aarch64_pcrelative_literal_loads
)
12912 sri
->icode
= code_for_aarch64_reload_movcp (mode
, DImode
);
12916 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12917 Q register to a Q register directly. We need a scratch. */
12922 || (vec_flags
== VEC_ADVSIMD
&& known_eq (GET_MODE_SIZE (mode
), 16)))
12923 && mode
== GET_MODE (x
)
12925 && FP_REGNUM_P (REGNO (x
))
12926 && reg_class_subset_p (rclass
, FP_REGS
))
12928 sri
->icode
= code_for_aarch64_reload_mov (mode
);
12932 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12933 because AArch64 has richer addressing modes for LDR/STR instructions
12934 than LDP/STP instructions. */
12935 if (TARGET_FLOAT
&& rclass
== GENERAL_REGS
12936 && known_eq (GET_MODE_SIZE (mode
), 16) && MEM_P (x
))
12939 if (rclass
== FP_REGS
12940 && (mode
== TImode
|| mode
== TFmode
|| mode
== TDmode
)
12942 return GENERAL_REGS
;
/* Implement TARGET_SECONDARY_MEMORY_NEEDED.  */

static bool
aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
                                 reg_class_t class2)
{
  if (!TARGET_SIMD
      && reg_classes_intersect_p (class1, FP_REGS)
      && reg_classes_intersect_p (class2, FP_REGS))
    {
      /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
         so we can't easily split a move involving tuples of 128-bit
         vectors.  Force the copy through memory instead.

         (Tuples of 64-bit vectors are fine.)  */
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
        return true;
    }
  return false;
}

/* Implement TARGET_FRAME_POINTER_REQUIRED.  */

static bool
aarch64_frame_pointer_required ()
{
  /* If the function needs to record the incoming value of PSTATE.SM,
     make sure that the slot is accessible from the frame pointer.  */
  return aarch64_need_old_pstate_sm ();
}
bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
  gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);

  /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
     can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
  if (frame_pointer_needed)
    return to == HARD_FRAME_POINTER_REGNUM;
  return true;
}

poly_int64
aarch64_initial_elimination_offset (unsigned from, unsigned to)
{
  aarch64_frame &frame = cfun->machine->frame;

  if (to == HARD_FRAME_POINTER_REGNUM)
    {
      if (from == ARG_POINTER_REGNUM)
        return frame.bytes_above_hard_fp;

      if (from == FRAME_POINTER_REGNUM)
        return frame.bytes_above_hard_fp - frame.bytes_above_locals;
    }

  if (to == STACK_POINTER_REGNUM)
    {
      if (from == FRAME_POINTER_REGNUM)
        return frame.frame_size - frame.bytes_above_locals;
    }

  return frame.frame_size;
}
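/* Descriptive note derived from the arithmetic above: taking the incoming
   stack pointer (the argument pointer) as the reference point, the soft
   frame pointer sits bytes_above_locals below it, the hard frame pointer
   sits bytes_above_hard_fp below it, and the final stack pointer sits
   frame_size below it.  The offsets returned above are simply the
   differences between those positions.  */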
/* Get return address without mangling.  */

rtx
aarch64_return_addr_rtx (void)
{
  rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
  /* Note: aarch64_return_address_signing_enabled only
     works after cfun->machine->frame.laid_out is set,
     so here we don't know if the return address will
     be signed or not.  */
  rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
  emit_move_insn (lr, val);
  emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
  return lr;
}
13032 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
13036 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
13040 return aarch64_return_addr_rtx ();
13044 aarch64_asm_trampoline_template (FILE *f
)
13046 /* Even if the current function doesn't have branch protection, some
13047 later function might, so since this template is only generated once
13048 we have to add a BTI just in case. */
13049 asm_fprintf (f
, "\thint\t34 // bti c\n");
13053 asm_fprintf (f
, "\tldr\tw%d, .+20\n", IP1_REGNUM
- R0_REGNUM
);
13054 asm_fprintf (f
, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
13058 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[IP1_REGNUM
]);
13059 asm_fprintf (f
, "\tldr\t%s, .+24\n", reg_names
[STATIC_CHAIN_REGNUM
]);
13061 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
13063 /* We always emit a speculation barrier.
13064 This is because the same trampoline template is used for every nested
function.  Since nested functions are not particularly common or
performant we don't worry too much about the extra instructions to copy
around.
13068 This is not yet a problem, since we have not yet implemented function
13069 specific attributes to choose between hardening against straight line
13070 speculation or not, but such function specific attributes are likely to
13071 happen in the future. */
13072 asm_fprintf (f
, "\tdsb\tsy\n\tisb\n");
13074 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13075 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
13079 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
13081 rtx fnaddr
, mem
, a_tramp
;
13082 const int tramp_code_sz
= 24;
13084 /* Don't need to copy the trailing D-words, we fill those in below. */
13085 /* We create our own memory address in Pmode so that `emit_block_move` can
13086 use parts of the backend which expect Pmode addresses. */
13087 rtx temp
= convert_memory_address (Pmode
, XEXP (m_tramp
, 0));
13088 emit_block_move (gen_rtx_MEM (BLKmode
, temp
),
13089 assemble_trampoline_template (),
13090 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
13091 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
13092 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
13093 if (GET_MODE (fnaddr
) != ptr_mode
)
13094 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
13095 emit_move_insn (mem
, fnaddr
);
13097 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
13098 emit_move_insn (mem
, chain_value
);
13100 /* XXX We should really define a "clear_cache" pattern and use
13101 gen_clear_cache(). */
13102 a_tramp
= XEXP (m_tramp
, 0);
13103 maybe_emit_call_builtin___clear_cache (a_tramp
,
13104 plus_constant (ptr_mode
,
13109 static unsigned char
13110 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
13112 /* ??? Logically we should only need to provide a value when
13113 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
13114 can hold MODE, but at the moment we need to handle all modes.
13115 Just ignore any runtime parts for registers that can't store them. */
13116 HOST_WIDE_INT lowest_size
= constant_lower_bound (GET_MODE_SIZE (mode
));
13117 unsigned int nregs
, vec_flags
;
13123 case TAILCALL_ADDR_REGS
:
13127 case POINTER_AND_FP_REGS
:
13131 vec_flags
= aarch64_classify_vector_mode (mode
);
13132 if ((vec_flags
& VEC_SVE_DATA
)
13133 && constant_multiple_p (GET_MODE_SIZE (mode
),
13134 aarch64_vl_bytes (mode
, vec_flags
), &nregs
))
13136 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
))
13137 return GET_MODE_SIZE (mode
).to_constant () / 8;
13138 return (vec_flags
& VEC_ADVSIMD
13139 ? CEIL (lowest_size
, UNITS_PER_VREG
)
13140 : CEIL (lowest_size
, UNITS_PER_WORD
));
13145 return mode
== VNx32BImode
? 2 : 1;
13147 case MOVEABLE_SYSREGS
:
13150 case PR_AND_FFR_REGS
:
13160 gcc_unreachable ();
13164 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
13166 if (regclass
== POINTER_REGS
)
13167 return GENERAL_REGS
;
13169 if (regclass
== STACK_REG
)
13172 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
/* Register elimination can result in a request for
   SP+constant->FP_REGS.  We cannot support such operations which
   use SP as source and an FP_REG as destination, so reject out
   of hand.  */
13182 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
13184 rtx lhs
= XEXP (x
, 0);
13186 /* Look through a possible SUBREG introduced by ILP32. */
13187 if (SUBREG_P (lhs
))
13188 lhs
= SUBREG_REG (lhs
);
13190 gcc_assert (REG_P (lhs
));
13191 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
13200 aarch64_asm_output_labelref (FILE* f
, const char *name
)
13202 asm_fprintf (f
, "%U%s", name
);
13206 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
13208 if (priority
== DEFAULT_INIT_PRIORITY
)
13209 default_ctor_section_asm_out_constructor (symbol
, priority
);
13213 /* While priority is known to be in range [0, 65535], so 18 bytes
13214 would be enough, the compiler might not know that. To avoid
13215 -Wformat-truncation false positive, use a larger size. */
13217 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
13218 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13219 switch_to_section (s
);
13220 assemble_align (POINTER_SIZE
);
13221 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13226 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
13228 if (priority
== DEFAULT_INIT_PRIORITY
)
13229 default_dtor_section_asm_out_destructor (symbol
, priority
);
13233 /* While priority is known to be in range [0, 65535], so 18 bytes
13234 would be enough, the compiler might not know that. To avoid
13235 -Wformat-truncation false positive, use a larger size. */
13237 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
13238 s
= get_section (buf
, SECTION_WRITE
| SECTION_NOTYPE
, NULL
);
13239 switch_to_section (s
);
13240 assemble_align (POINTER_SIZE
);
13241 assemble_aligned_integer (POINTER_BYTES
, symbol
);
13246 aarch64_output_casesi (rtx
*operands
)
13250 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
13252 static const char *const patterns
[4][2] =
13255 "ldrb\t%w3, [%0,%w1,uxtw]",
13256 "add\t%3, %4, %w3, sxtb #2"
13259 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13260 "add\t%3, %4, %w3, sxth #2"
13263 "ldr\t%w3, [%0,%w1,uxtw #2]",
13264 "add\t%3, %4, %w3, sxtw #2"
13266 /* We assume that DImode is only generated when not optimizing and
13267 that we don't really need 64-bit address offsets. That would
13268 imply an object file with 8GB of code in a single function! */
13270 "ldr\t%w3, [%0,%w1,uxtw #2]",
13271 "add\t%3, %4, %w3, sxtw #2"
13275 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
13277 scalar_int_mode mode
= as_a
<scalar_int_mode
> (GET_MODE (diff_vec
));
13278 index
= exact_log2 (GET_MODE_SIZE (mode
));
13280 gcc_assert (index
>= 0 && index
<= 3);
/* Need to implement table size reduction, by changing the code below.  */
13283 output_asm_insn (patterns
[index
][0], operands
);
13284 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
13285 snprintf (buf
, sizeof (buf
),
13286 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
13287 output_asm_insn (buf
, operands
);
13288 output_asm_insn (patterns
[index
][1], operands
);
13289 output_asm_insn ("br\t%3", operands
);
13290 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13292 assemble_label (asm_out_file
, label
);
13296 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13297 operand is MASK. */
13299 aarch64_output_sme_zero_za (rtx mask
)
13301 auto mask_val
= UINTVAL (mask
);
13305 if (mask_val
== 0xff)
13306 return "zero\t{ za }";
13308 static constexpr struct { unsigned char mask
; char letter
; } tiles
[] = {
13314 /* The last entry in the list has the form "za7.d }", but that's the
13315 same length as "za7.d, ". */
13316 static char buffer
[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13317 for (auto &tile
: tiles
)
13319 unsigned int tile_mask
= tile
.mask
;
13320 unsigned int tile_index
= 0;
13321 unsigned int i
= snprintf (buffer
, sizeof (buffer
), "zero\t");
13322 const char *prefix
= "{ ";
13323 auto remaining_mask
= mask_val
;
13324 while (tile_mask
< 0x100)
13326 if ((remaining_mask
& tile_mask
) == tile_mask
)
13328 i
+= snprintf (buffer
+ i
, sizeof (buffer
) - i
, "%sza%d.%c",
13329 prefix
, tile_index
, tile
.letter
);
13331 remaining_mask
&= ~tile_mask
;
13336 if (remaining_mask
== 0)
13338 gcc_assert (i
+ 3 <= sizeof (buffer
));
13339 snprintf (buffer
+ i
, sizeof (buffer
) - i
, " }");
13343 gcc_unreachable ();
/* Return size in bits of an arithmetic operand which is shifted/scaled and
   masked such that it is suitable for a UXTB, UXTH, or UXTW extend
   operator.  */

int
aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
{
  if (shift >= 0 && shift <= 4)
    {
      int size;
      for (size = 8; size <= 32; size *= 2)
        {
          HOST_WIDE_INT bits = ((HOST_WIDE_INT) 1U << size) - 1;
          if (mask == bits << shift)
            return size;
        }
    }
  return 0;
}
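/* For example, aarch64_uxt_size (2, 0x3fc) == 8, because 0x3fc is 0xff
   shifted left by 2, i.e. a byte-sized value scaled by 4, which maps onto a
   UXTB operand.  (Illustrative note only.)  */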
/* Constant pools are per function only when PC relative
   literal loads are true or we are in the large memory
   model.  */

static inline bool
aarch64_can_use_per_function_literal_pools_p (void)
{
  return (aarch64_pcrelative_literal_loads
          || aarch64_cmodel == AARCH64_CMODEL_LARGE);
}

static bool
aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
{
  /* We can't use blocks for constants when we're using a per-function
     literal pool.  */
  return !aarch64_can_use_per_function_literal_pools_p ();
}

/* Select appropriate section for constants depending
   on where we place literal pools.  */

static section *
aarch64_select_rtx_section (machine_mode mode,
                            rtx x,
                            unsigned HOST_WIDE_INT align)
{
  if (aarch64_can_use_per_function_literal_pools_p ())
    return function_section (current_function_decl);

  return default_elf_select_rtx_section (mode, x, align);
}

/* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
void
aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
                                  HOST_WIDE_INT offset)
{
  /* When using per-function literal pools, we must ensure that any code
     section is aligned to the minimal instruction length, lest we get
     errors from the assembler re "unaligned instructions".  */
  if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
    ASM_OUTPUT_ALIGN (f, 2);
}
/* Helper function for rtx cost calculation.  Strip a shift expression
   from X.  Returns the inner operand if successful, or the original
   expression on failure.  */
static rtx
aarch64_strip_shift (rtx x)
{
  rtx op = x;

  /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
     we can convert both to ROR during final output.  */
  if ((GET_CODE (op) == ASHIFT
       || GET_CODE (op) == ASHIFTRT
       || GET_CODE (op) == LSHIFTRT
       || GET_CODE (op) == ROTATERT
       || GET_CODE (op) == ROTATE)
      && CONST_INT_P (XEXP (op, 1)))
    return XEXP (op, 0);

  if (GET_CODE (op) == MULT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
    return XEXP (op, 0);

  return x;
}

/* Helper function for rtx cost calculation.  Strip an extend
   expression from X.  Returns the inner operand if successful, or the
   original expression on failure.  We deal with a number of possible
   canonicalization variations here.  If STRIP_SHIFT is true, then
   we can strip off a shift also.  */
static rtx
aarch64_strip_extend (rtx x, bool strip_shift)
{
  scalar_int_mode mode;
  rtx op = x;

  if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
    return op;

  if (GET_CODE (op) == AND
      && GET_CODE (XEXP (op, 0)) == MULT
      && CONST_INT_P (XEXP (XEXP (op, 0), 1))
      && CONST_INT_P (XEXP (op, 1))
      && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
			   INTVAL (XEXP (op, 1))) != 0)
    return XEXP (XEXP (op, 0), 0);

  /* Now handle extended register, as this may also have an optional
     left shift by 1..4.  */
  if (strip_shift
      && GET_CODE (op) == ASHIFT
      && CONST_INT_P (XEXP (op, 1))
      && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
    op = XEXP (op, 0);

  if (GET_CODE (op) == ZERO_EXTEND
      || GET_CODE (op) == SIGN_EXTEND)
    op = XEXP (op, 0);

  if (op != x)
    return op;

  return x;
}

/* Helper function for rtx cost calculation.  Strip extension as well as any
   inner VEC_SELECT high-half from X.  Returns the inner vector operand if
   successful, or the original expression on failure.  */
static rtx
aarch64_strip_extend_vec_half (rtx x)
{
  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT
	  && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
				    XEXP (x, 1)))
	x = XEXP (x, 0);
    }
  return x;
}

/* Helper function for rtx cost calculation.  Strip VEC_DUPLICATE as well as
   any subsequent extend and VEC_SELECT from X.  Returns the inner scalar
   operand if successful, or the original expression on failure.  */
static rtx
aarch64_strip_duplicate_vec_elt (rtx x)
{
  if (GET_CODE (x) == VEC_DUPLICATE
      && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
    {
      x = XEXP (x, 0);
      if (GET_CODE (x) == VEC_SELECT)
	x = XEXP (x, 0);
      else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
	       && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
	x = XEXP (XEXP (x, 0), 0);
    }
  return x;
}
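
/* Illustrative sketch, not part of the build: what the strip helpers
   above return for a few common shapes.  The register numbers are
   arbitrary; the snippet only shows intent and is not wired into any
   pass.  */
#if 0
static void
aarch64_strip_helpers_example (void)
{
  rtx reg = gen_rtx_REG (DImode, R0_REGNUM);

  /* (ashift (reg) (const_int 3)) -> (reg): the shift can be folded into
     the arithmetic instruction, so costing looks through it.  */
  gcc_assert (aarch64_strip_shift
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (3))) == reg);

  /* (mult (reg) (const_int 8)) is the canonical form of the same shift
     and is stripped in the same way.  */
  gcc_assert (aarch64_strip_shift
	      (gen_rtx_MULT (DImode, reg, GEN_INT (8))) == reg);

  /* (zero_extend:DI (reg:SI)) -> (reg:SI): extends are free in the
     extended-register forms of ADD/SUB.  */
  rtx reg_si = gen_rtx_REG (SImode, R1_REGNUM);
  gcc_assert (aarch64_strip_extend
	      (gen_rtx_ZERO_EXTEND (DImode, reg_si), true) == reg_si);
}
#endif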
/* Return true iff CODE is a shift supported in combination
   with arithmetic instructions.  */
static bool
aarch64_shift_p (enum rtx_code code)
{
  return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
}

/* Return true iff X is a cheap shift without a sign extend.  */
static bool
aarch64_cheap_mult_shift_p (rtx x)
{
  rtx op0, op1;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if (!(aarch64_tune_params.extra_tuning_flags
	& AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
    return false;

  if (GET_CODE (op0) == SIGN_EXTEND)
    return false;

  if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
      && UINTVAL (op1) <= 4)
    return true;

  if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
    return false;

  HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));

  if (l2 > 0 && l2 <= 4)
    return true;

  return false;
}
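
/* Illustrative sketch, not part of the build: worked examples of what the
   predicate above accepts, assuming the tuning in use sets
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND (otherwise it always returns
   false).  Register numbers are arbitrary.  */
#if 0
static void
aarch64_cheap_mult_shift_example (void)
{
  rtx reg = gen_rtx_REG (DImode, R0_REGNUM);

  /* Shift by an immediate of at most 4: cheap.  */
  gcc_assert (aarch64_cheap_mult_shift_p
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (3))));

  /* The canonical MULT form of the same shift: also cheap.  */
  gcc_assert (aarch64_cheap_mult_shift_p
	      (gen_rtx_MULT (DImode, reg, GEN_INT (8))));

  /* A shift of 5 no longer fits the LSL #0..4 operand field.  */
  gcc_assert (!aarch64_cheap_mult_shift_p
	      (gen_rtx_ASHIFT (DImode, reg, GEN_INT (5))));
}
#endif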
13557 /* Helper function for rtx cost calculation. Calculate the cost of
13558 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13559 Return the calculated cost of the expression, recursing manually in to
13560 operands where needed. */
13563 aarch64_rtx_mult_cost (rtx x
, enum rtx_code code
, int outer
, bool speed
)
13566 const struct cpu_cost_table
*extra_cost
13567 = aarch64_tune_params
.insn_extra_cost
;
13569 bool compound_p
= (outer
== PLUS
|| outer
== MINUS
);
13570 machine_mode mode
= GET_MODE (x
);
13572 gcc_checking_assert (code
== MULT
);
13577 if (VECTOR_MODE_P (mode
))
13579 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
13580 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
13582 /* The select-operand-high-half versions of the instruction have the
13583 same cost as the three vector version - don't add the costs of the
13584 extension or selection into the costs of the multiply. */
13585 op0
= aarch64_strip_extend_vec_half (op0
);
13586 op1
= aarch64_strip_extend_vec_half (op1
);
13587 /* The by-element versions of the instruction have the same costs as
13588 the normal 3-vector version. We make an assumption that the input
13589 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13590 costing of a MUL by element pre RA is a bit optimistic. */
13591 op0
= aarch64_strip_duplicate_vec_elt (op0
);
13592 op1
= aarch64_strip_duplicate_vec_elt (op1
);
13594 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13595 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13598 if (GET_CODE (x
) == MULT
)
13599 cost
+= extra_cost
->vect
.mult
;
13600 /* This is to catch the SSRA costing currently flowing here. */
13602 cost
+= extra_cost
->vect
.alu
;
13607 /* Integer multiply/fma. */
13608 if (GET_MODE_CLASS (mode
) == MODE_INT
)
13610 /* The multiply will be canonicalized as a shift, cost it as such. */
13611 if (aarch64_shift_p (GET_CODE (x
))
13612 || (CONST_INT_P (op1
)
13613 && exact_log2 (INTVAL (op1
)) > 0))
13615 bool is_extend
= GET_CODE (op0
) == ZERO_EXTEND
13616 || GET_CODE (op0
) == SIGN_EXTEND
;
13621 /* If the shift is considered cheap,
13622 then don't add any cost. */
13623 if (aarch64_cheap_mult_shift_p (x
))
13625 else if (REG_P (op1
))
13626 /* ARITH + shift-by-register. */
13627 cost
+= extra_cost
->alu
.arith_shift_reg
;
13628 else if (is_extend
)
13629 /* ARITH + extended register. We don't have a cost field
13630 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13631 cost
+= extra_cost
->alu
.extend_arith
;
13633 /* ARITH + shift-by-immediate. */
13634 cost
+= extra_cost
->alu
.arith_shift
;
13637 /* LSL (immediate). */
13638 cost
+= extra_cost
->alu
.shift
;
13641 /* Strip extends as we will have costed them in the case above. */
13643 op0
= aarch64_strip_extend (op0
, true);
13645 cost
+= rtx_cost (op0
, VOIDmode
, code
, 0, speed
);
13650 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13651 compound and let the below cases handle it. After all, MNEG is a
13652 special-case alias of MSUB. */
13653 if (GET_CODE (op0
) == NEG
)
13655 op0
= XEXP (op0
, 0);
13659 /* Integer multiplies or FMAs have zero/sign extending variants. */
13660 if ((GET_CODE (op0
) == ZERO_EXTEND
13661 && GET_CODE (op1
) == ZERO_EXTEND
)
13662 || (GET_CODE (op0
) == SIGN_EXTEND
13663 && GET_CODE (op1
) == SIGN_EXTEND
))
13665 cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, MULT
, 0, speed
);
13666 cost
+= rtx_cost (XEXP (op1
, 0), VOIDmode
, MULT
, 1, speed
);
13671 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13672 cost
+= extra_cost
->mult
[0].extend_add
;
13674 /* MUL/SMULL/UMULL. */
13675 cost
+= extra_cost
->mult
[0].extend
;
13681 /* This is either an integer multiply or a MADD. In both cases
13682 we want to recurse and cost the operands. */
13683 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13684 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13690 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
13693 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
13702 /* Floating-point FMA/FMUL can also support negations of the
13703 operands, unless the rounding mode is upward or downward in
13704 which case FNMUL is different than FMUL with operand negation. */
13705 bool neg0
= GET_CODE (op0
) == NEG
;
13706 bool neg1
= GET_CODE (op1
) == NEG
;
13707 if (compound_p
|| !flag_rounding_math
|| (neg0
&& neg1
))
13710 op0
= XEXP (op0
, 0);
13712 op1
= XEXP (op1
, 0);
13716 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13717 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
13720 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
13723 cost
+= rtx_cost (op0
, mode
, MULT
, 0, speed
);
13724 cost
+= rtx_cost (op1
, mode
, MULT
, 1, speed
);
13730 aarch64_address_cost (rtx x
,
13732 addr_space_t as ATTRIBUTE_UNUSED
,
13735 enum rtx_code c
= GET_CODE (x
);
13736 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
.addr_cost
;
13737 struct aarch64_address_info info
;
13741 if (!aarch64_classify_address (&info
, x
, mode
, false))
13743 if (GET_CODE (x
) == CONST
|| SYMBOL_REF_P (x
))
13745 /* This is a CONST or SYMBOL ref which will be split
13746 in a different way depending on the code model in use.
13747 Cost it through the generic infrastructure. */
13748 int cost_symbol_ref
= rtx_cost (x
, Pmode
, MEM
, 1, speed
);
13749 /* Divide through by the cost of one instruction to
13750 bring it to the same units as the address costs. */
13751 cost_symbol_ref
/= COSTS_N_INSNS (1);
13752 /* The cost is then the cost of preparing the address,
13753 followed by an immediate (possibly 0) offset. */
13754 return cost_symbol_ref
+ addr_cost
->imm_offset
;
13758 /* This is most likely a jump table from a case
13760 return addr_cost
->register_offset
;
13766 case ADDRESS_LO_SUM
:
13767 case ADDRESS_SYMBOLIC
:
13768 case ADDRESS_REG_IMM
:
13769 cost
+= addr_cost
->imm_offset
;
13772 case ADDRESS_REG_WB
:
13773 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
13774 cost
+= addr_cost
->pre_modify
;
13775 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
13777 unsigned int nvectors
= aarch64_ldn_stn_vectors (mode
);
13779 cost
+= addr_cost
->post_modify_ld3_st3
;
13780 else if (nvectors
== 4)
13781 cost
+= addr_cost
->post_modify_ld4_st4
;
13783 cost
+= addr_cost
->post_modify
;
13786 gcc_unreachable ();
13790 case ADDRESS_REG_REG
:
13791 cost
+= addr_cost
->register_offset
;
13794 case ADDRESS_REG_SXTW
:
13795 cost
+= addr_cost
->register_sextend
;
13798 case ADDRESS_REG_UXTW
:
13799 cost
+= addr_cost
->register_zextend
;
13803 gcc_unreachable ();
13807 if (info
.shift
> 0)
13809 /* For the sake of calculating the cost of the shifted register
13810 component, we can treat same sized modes in the same way. */
13811 if (known_eq (GET_MODE_BITSIZE (mode
), 16))
13812 cost
+= addr_cost
->addr_scale_costs
.hi
;
13813 else if (known_eq (GET_MODE_BITSIZE (mode
), 32))
13814 cost
+= addr_cost
->addr_scale_costs
.si
;
13815 else if (known_eq (GET_MODE_BITSIZE (mode
), 64))
13816 cost
+= addr_cost
->addr_scale_costs
.di
;
13818 /* We can't tell, or this is a 128-bit vector. */
13819 cost
+= addr_cost
->addr_scale_costs
.ti
;
/* Return the cost of a branch.  If SPEED_P is true then the compiler is
   optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
   to be taken.  */
int
aarch64_branch_cost (bool speed_p, bool predictable_p)
{
  /* When optimizing for speed, use the cost of unpredictable branches.  */
  const struct cpu_branch_cost *branch_costs =
    aarch64_tune_params.branch_costs;

  if (!speed_p || predictable_p)
    return branch_costs->predictable;
  else
    return branch_costs->unpredictable;
}
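
/* Illustrative sketch, not part of the build: how the two cost fields are
   selected.  The comparison in the comments reflects the usual tuning
   convention (unpredictable branches at least as expensive as predictable
   ones), not a guarantee about every cost table.  */
#if 0
static void
aarch64_branch_cost_example (void)
{
  /* Cold code, or a branch the predictor handles well, uses the
     predictable cost; only hot, unpredictable branches pay more, which
     biases if-conversion towards hot code.  */
  int cold = aarch64_branch_cost (/*speed_p=*/false, /*predictable_p=*/false);
  int hot = aarch64_branch_cost (/*speed_p=*/true, /*predictable_p=*/false);
  (void) cold;
  (void) hot;
}
#endif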
/* Return true if X is a zero or sign extract
   usable in an ADD or SUB (extended register) instruction.  */
static bool
aarch64_rtx_arith_op_extract_p (rtx x)
{
  /* The simple case <ARITH>, XD, XN, XM, [us]xt.
     No shift.  */
  if (GET_CODE (x) == SIGN_EXTEND
      || GET_CODE (x) == ZERO_EXTEND)
    return REG_P (XEXP (x, 0));

  return false;
}

static bool
aarch64_frint_unspec_p (unsigned int u)
{
  switch (u)
    {
      case UNSPEC_FRINTZ:
      case UNSPEC_FRINTP:
      case UNSPEC_FRINTM:
      case UNSPEC_FRINTA:
      case UNSPEC_FRINTN:
      case UNSPEC_FRINTX:
      case UNSPEC_FRINTI:
	return true;

      default:
	return false;
    }
}

/* Return true iff X is an rtx that will match an extr instruction
   i.e. as described in the *extr<mode>5_insn family of patterns.
   OP0 and OP1 will be set to the operands of the shifts involved
   on success and will be NULL_RTX otherwise.  */
static bool
aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
{
  rtx op0, op1;
  scalar_int_mode mode;
  if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
    return false;

  *res_op0 = NULL_RTX;
  *res_op1 = NULL_RTX;

  if (GET_CODE (x) != IOR)
    return false;

  op0 = XEXP (x, 0);
  op1 = XEXP (x, 1);

  if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
      || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
    {
      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
      if (GET_CODE (op1) == ASHIFT)
	std::swap (op0, op1);

      if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
	return false;

      unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
      unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));

      if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
	  && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
	{
	  *res_op0 = XEXP (op0, 0);
	  *res_op1 = XEXP (op1, 0);
	  return true;
	}
    }

  return false;
}
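
/* Illustrative sketch, not part of the build: the EXTR condition above
   says (x << a) | (y >> b) is a single EXTR when a + b equals the mode
   width, with the extract starting at bit b of the concatenation x:y.
   The constants below are a worked example in plain C++.  */
#if 0
#include <cassert>
#include <cstdint>

int
main ()
{
  uint64_t x = 0x1122334455667788ULL, y = 0x99aabbccddeeff00ULL;
  unsigned a = 48, b = 16;	/* a + b == 64, so EXTR applies.  */
  /* Equivalent to EXTR Xd, Xn, Xm, #16 with Xn = x, Xm = y.  */
  assert (((x << a) | (y >> b)) == 0x778899aabbccddeeULL);
}
#endif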
13922 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13923 storing it in *COST. Result is true if the total cost of the operation
13924 has now been calculated. */
13926 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
13930 enum rtx_code cmpcode
;
13931 const struct cpu_cost_table
*extra_cost
13932 = aarch64_tune_params
.insn_extra_cost
;
13934 if (COMPARISON_P (op0
))
13936 inner
= XEXP (op0
, 0);
13937 comparator
= XEXP (op0
, 1);
13938 cmpcode
= GET_CODE (op0
);
13943 comparator
= const0_rtx
;
13947 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
13949 /* Conditional branch. */
13950 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13954 if (cmpcode
== NE
|| cmpcode
== EQ
)
13956 if (comparator
== const0_rtx
)
13958 /* TBZ/TBNZ/CBZ/CBNZ. */
13959 if (GET_CODE (inner
) == ZERO_EXTRACT
)
13961 *cost
+= rtx_cost (XEXP (inner
, 0), VOIDmode
,
13962 ZERO_EXTRACT
, 0, speed
);
13965 *cost
+= rtx_cost (inner
, VOIDmode
, cmpcode
, 0, speed
);
13969 if (register_operand (inner
, VOIDmode
)
13970 && aarch64_imm24 (comparator
, VOIDmode
))
13972 /* SUB and SUBS. */
13973 *cost
+= COSTS_N_INSNS (2);
13975 *cost
+= extra_cost
->alu
.arith
* 2;
13979 else if (cmpcode
== LT
|| cmpcode
== GE
)
13982 if (comparator
== const0_rtx
)
13987 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
13990 if (GET_CODE (op1
) == COMPARE
)
13992 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13993 if (XEXP (op1
, 1) == const0_rtx
)
13997 machine_mode mode
= GET_MODE (XEXP (op1
, 0));
13999 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14000 *cost
+= extra_cost
->alu
.arith
;
14002 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14007 /* It's a conditional operation based on the status flags,
14008 so it must be some flavor of CSEL. */
14010 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
14011 if (GET_CODE (op1
) == NEG
14012 || GET_CODE (op1
) == NOT
14013 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
14014 op1
= XEXP (op1
, 0);
14015 else if (GET_CODE (op1
) == ZERO_EXTEND
&& GET_CODE (op2
) == ZERO_EXTEND
)
14017 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
14018 op1
= XEXP (op1
, 0);
14019 op2
= XEXP (op2
, 0);
14021 else if (GET_CODE (op1
) == ZERO_EXTEND
&& op2
== const0_rtx
)
14023 inner
= XEXP (op1
, 0);
14024 if (GET_CODE (inner
) == NEG
|| GET_CODE (inner
) == NOT
)
14025 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
14026 op1
= XEXP (inner
, 0);
14028 else if (op1
== constm1_rtx
|| op1
== const1_rtx
)
14030 /* Use CSINV or CSINC. */
14031 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14034 else if (op2
== constm1_rtx
|| op2
== const1_rtx
)
14036 /* Use CSINV or CSINC. */
14037 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14041 *cost
+= rtx_cost (op1
, VOIDmode
, IF_THEN_ELSE
, 1, speed
);
14042 *cost
+= rtx_cost (op2
, VOIDmode
, IF_THEN_ELSE
, 2, speed
);
14046 /* We don't know what this is, cost all operands. */
/* Check whether X is a bitfield operation of the form shift + extend that
   maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
   operand to which the bitfield operation is applied.  Otherwise return
   NULL_RTX.  */
static rtx
aarch64_extend_bitfield_pattern_p (rtx x)
{
  rtx_code outer_code = GET_CODE (x);
  machine_mode outer_mode = GET_MODE (x);

  if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
      && outer_mode != SImode && outer_mode != DImode)
    return NULL_RTX;

  rtx inner = XEXP (x, 0);
  rtx_code inner_code = GET_CODE (inner);
  machine_mode inner_mode = GET_MODE (inner);
  rtx op = NULL_RTX;

  switch (inner_code)
    {
      case ASHIFT:
	if (CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case LSHIFTRT:
	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      case ASHIFTRT:
	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
	    && (inner_mode == QImode || inner_mode == HImode))
	  op = XEXP (inner, 0);
	break;
      default:
	break;
    }

  return op;
}
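
/* Illustrative sketch, not part of the build: the shift + extend shapes
   accepted above correspond to single bitfield instructions.  The plain
   C++ below works one case through: zero-extending a QImode logical
   right shift by 2 is the same as UBFX #2, #6.  */
#if 0
#include <cassert>
#include <cstdint>

int
main ()
{
  uint8_t q = 0xd7;			/* 1101'0111 */
  uint32_t via_shift_extend = (uint32_t) (uint8_t) (q >> 2);
  uint32_t via_ubfx = (q >> 2) & 0x3f;	/* UBFX w0, w1, #2, #6.  */
  assert (via_shift_extend == via_ubfx && via_ubfx == 0x35);
}
#endif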
/* Return true if the mask and a shift amount from an RTX of the form
   (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
   mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
bool
aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
				    rtx shft_amnt)
{
  return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
	 && INTVAL (mask) > 0
	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
	 && (UINTVAL (mask)
	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
}

/* Return true if the masks and a shift amount from an RTX of the form
   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */

bool
aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
				   unsigned HOST_WIDE_INT mask1,
				   unsigned HOST_WIDE_INT shft_amnt,
				   unsigned HOST_WIDE_INT mask2)
{
  unsigned HOST_WIDE_INT t;

  /* Verify that there is no overlap in what bits are set in the two masks.  */
  if (mask1 != ~mask2)
    return false;

  /* Verify that mask2 is not all zeros or ones.  */
  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
    return false;

  /* The shift amount should always be less than the mode size.  */
  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));

  /* Verify that the mask being shifted is contiguous and would be in the
     least significant bits after shifting by shft_amnt.  */
  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
  return (t == (t & -t));
}
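
/* Illustrative sketch, not part of the build: a worked example of the
   UBFIZ test above.  (x << 8) & 0x0000ff00 keeps a contiguous field whose
   low end sits exactly at the shift amount, so it is a single
   UBFIZ w0, w1, #8, #8; masks with bits below the shift amount or with a
   non-contiguous field are rejected.  */
#if 0
#include <cassert>
#include <cstdint>

static bool
ubfiz_ok (unsigned bits, uint64_t mask, unsigned shift)
{
  auto contiguous_from_zero = [] (uint64_t v) { return (v & (v + 1)) == 0; };
  return mask != 0
	 && shift < bits
	 /* MASK >> SHIFT must be 2^width - 1 (a contiguous field)...  */
	 && contiguous_from_zero (mask >> shift)
	 /* ...and no mask bits may sit below the shifted-in zeros.  */
	 && (mask & (((uint64_t) 1 << shift) - 1)) == 0;
}

int
main ()
{
  assert (ubfiz_ok (32, 0x0000ff00, 8));   /* UBFIZ #8, #8.  */
  assert (!ubfiz_ok (32, 0x0000ff0f, 8));  /* Bits below the shift.  */
  assert (!ubfiz_ok (32, 0x00ff0f00, 8));  /* Field not contiguous.  */
}
#endif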
/* Return true if X is an RTX representing an operation in the ABD family
   of instructions.  */
static bool
aarch64_abd_rtx_p (rtx x)
{
  if (GET_CODE (x) != MINUS)
    return false;
  rtx max_arm = XEXP (x, 0);
  rtx min_arm = XEXP (x, 1);
  if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
    return false;
  bool signed_p = GET_CODE (max_arm) == SMAX;
  if (signed_p && GET_CODE (min_arm) != SMIN)
    return false;
  else if (!signed_p && GET_CODE (min_arm) != UMIN)
    return false;

  rtx maxop0 = XEXP (max_arm, 0);
  rtx maxop1 = XEXP (max_arm, 1);
  rtx minop0 = XEXP (min_arm, 0);
  rtx minop1 = XEXP (min_arm, 1);
  return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
}
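
/* Illustrative sketch, not part of the build: the MAX/MIN shape matched
   above is the scalar identity behind SABD/UABD, namely
   max (a, b) - min (a, b) == |a - b|, checked here on plain integers.  */
#if 0
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>

int
main ()
{
  int8_t a = -100, b = 27;
  assert (std::max (a, b) - std::min (a, b) == std::abs (a - b));

  uint8_t ua = 200, ub = 13;
  assert (std::max (ua, ub) - std::min (ua, ub) == 187);
}
#endif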
14164 /* Calculate the cost of calculating X, storing it in *COST. Result
14165 is true if the total cost of the operation has now been calculated. */
14167 aarch64_rtx_costs (rtx x
, machine_mode mode
, int outer ATTRIBUTE_UNUSED
,
14168 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
14171 const struct cpu_cost_table
*extra_cost
14172 = aarch64_tune_params
.insn_extra_cost
;
14173 rtx_code code
= GET_CODE (x
);
14174 scalar_int_mode int_mode
;
14176 /* By default, assume that everything has equivalent cost to the
14177 cheapest instruction. Any additional costs are applied as a delta
14178 above this default. */
14179 *cost
= COSTS_N_INSNS (1);
14184 /* The cost depends entirely on the operands to SET. */
14186 op0
= SET_DEST (x
);
14189 switch (GET_CODE (op0
))
14194 rtx address
= XEXP (op0
, 0);
14195 if (VECTOR_MODE_P (mode
))
14196 *cost
+= extra_cost
->ldst
.storev
;
14197 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14198 *cost
+= extra_cost
->ldst
.store
;
14199 else if (mode
== SFmode
|| mode
== SDmode
)
14200 *cost
+= extra_cost
->ldst
.storef
;
14201 else if (mode
== DFmode
|| mode
== DDmode
)
14202 *cost
+= extra_cost
->ldst
.stored
;
14205 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14209 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14213 if (! REG_P (SUBREG_REG (op0
)))
14214 *cost
+= rtx_cost (SUBREG_REG (op0
), VOIDmode
, SET
, 0, speed
);
14216 /* Fall through. */
14218 /* The cost is one per vector-register copied. */
14219 if (VECTOR_MODE_P (GET_MODE (op0
)) && REG_P (op1
))
14221 int nregs
= aarch64_hard_regno_nregs (V0_REGNUM
, GET_MODE (op0
));
14222 *cost
= COSTS_N_INSNS (nregs
);
14224 /* const0_rtx is in general free, but we will use an
14225 instruction to set a register to 0. */
14226 else if (REG_P (op1
) || op1
== const0_rtx
)
14228 /* The cost is 1 per register copied. */
14229 int nregs
= aarch64_hard_regno_nregs (R0_REGNUM
, GET_MODE (op0
));
14230 *cost
= COSTS_N_INSNS (nregs
);
14233 /* Cost is just the cost of the RHS of the set. */
14234 *cost
+= rtx_cost (op1
, mode
, SET
, 1, speed
);
14239 /* Bit-field insertion. Strip any redundant widening of
14240 the RHS to meet the width of the target. */
14241 if (SUBREG_P (op1
))
14242 op1
= SUBREG_REG (op1
);
14243 if ((GET_CODE (op1
) == ZERO_EXTEND
14244 || GET_CODE (op1
) == SIGN_EXTEND
)
14245 && CONST_INT_P (XEXP (op0
, 1))
14246 && is_a
<scalar_int_mode
> (GET_MODE (XEXP (op1
, 0)), &int_mode
)
14247 && GET_MODE_BITSIZE (int_mode
) >= INTVAL (XEXP (op0
, 1)))
14248 op1
= XEXP (op1
, 0);
14250 if (CONST_INT_P (op1
))
14252 /* MOV immediate is assumed to always be cheap. */
14253 *cost
= COSTS_N_INSNS (1);
14259 *cost
+= extra_cost
->alu
.bfi
;
14260 *cost
+= rtx_cost (op1
, VOIDmode
, (enum rtx_code
) code
, 1, speed
);
14266 /* We can't make sense of this, assume default cost. */
14267 *cost
= COSTS_N_INSNS (1);
14273 /* If an instruction can incorporate a constant within the
14274 instruction, the instruction's expression avoids calling
14275 rtx_cost() on the constant. If rtx_cost() is called on a
14276 constant, then it is usually because the constant must be
14277 moved into a register by one or more instructions.
14279 The exception is constant 0, which can be expressed
14280 as XZR/WZR and is therefore free. The exception to this is
14281 if we have (set (reg) (const0_rtx)) in which case we must cost
14282 the move. However, we can catch that when we cost the SET, so
14283 we don't need to consider that here. */
14284 if (x
== const0_rtx
)
14288 /* To an approximation, building any other constant is
14289 proportionally expensive to the number of instructions
14290 required to build that constant. This is true whether we
14291 are compiling for SPEED or otherwise. */
14292 machine_mode imode
= known_le (GET_MODE_SIZE (mode
), 4)
14294 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
14295 (NULL_RTX
, x
, false, imode
));
14301 /* First determine number of instructions to do the move
14302 as an integer constant. */
14303 if (!aarch64_float_const_representable_p (x
)
14304 && !aarch64_can_const_movi_rtx_p (x
, mode
)
14305 && aarch64_float_const_rtx_p (x
))
14307 unsigned HOST_WIDE_INT ival
;
14308 bool succeed
= aarch64_reinterpret_float_as_int (x
, &ival
);
14309 gcc_assert (succeed
);
14311 machine_mode imode
= known_eq (GET_MODE_SIZE (mode
), 8)
14313 int ncost
= aarch64_internal_mov_immediate
14314 (NULL_RTX
, gen_int_mode (ival
, imode
), false, imode
);
14315 *cost
+= COSTS_N_INSNS (ncost
);
14321 /* mov[df,sf]_aarch64. */
14322 if (aarch64_float_const_representable_p (x
))
14323 /* FMOV (scalar immediate). */
14324 *cost
+= extra_cost
->fp
[mode
== DFmode
|| mode
== DDmode
].fpconst
;
14325 else if (!aarch64_float_const_zero_rtx_p (x
))
14327 /* This will be a load from memory. */
14328 if (mode
== DFmode
|| mode
== DDmode
)
14329 *cost
+= extra_cost
->ldst
.loadd
;
14331 *cost
+= extra_cost
->ldst
.loadf
;
14334 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14335 or MOV v0.s[0], wzr - neither of which are modeled by the
14336 cost tables. Just use the default cost. */
14346 /* For loads we want the base cost of a load, plus an
14347 approximation for the additional cost of the addressing
14349 rtx address
= XEXP (x
, 0);
14350 if (VECTOR_MODE_P (mode
))
14351 *cost
+= extra_cost
->ldst
.loadv
;
14352 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14353 *cost
+= extra_cost
->ldst
.load
;
14354 else if (mode
== SFmode
|| mode
== SDmode
)
14355 *cost
+= extra_cost
->ldst
.loadf
;
14356 else if (mode
== DFmode
|| mode
== DDmode
)
14357 *cost
+= extra_cost
->ldst
.loadd
;
14360 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
14369 if (VECTOR_MODE_P (mode
))
14371 /* Many vector comparison operations are represented as NEG
14372 of a comparison. */
14373 if (COMPARISON_P (op0
))
14375 rtx op00
= XEXP (op0
, 0);
14376 rtx op01
= XEXP (op0
, 1);
14377 machine_mode inner_mode
= GET_MODE (op00
);
14379 if (GET_MODE_CLASS (inner_mode
) == MODE_VECTOR_FLOAT
14380 && GET_CODE (op00
) == ABS
14381 && GET_CODE (op01
) == ABS
)
14383 op00
= XEXP (op00
, 0);
14384 op01
= XEXP (op01
, 0);
14386 *cost
+= rtx_cost (op00
, inner_mode
, GET_CODE (op0
), 0, speed
);
14387 *cost
+= rtx_cost (op01
, inner_mode
, GET_CODE (op0
), 1, speed
);
14389 *cost
+= extra_cost
->vect
.alu
;
14395 *cost
+= extra_cost
->vect
.alu
;
14400 if (GET_MODE_CLASS (mode
) == MODE_INT
)
14402 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14403 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14406 *cost
+= rtx_cost (XEXP (op0
, 0), VOIDmode
, NEG
, 0, speed
);
14410 /* Cost this as SUB wzr, X. */
14411 op0
= CONST0_RTX (mode
);
14416 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14418 /* Support (neg(fma...)) as a single instruction only if
14419 sign of zeros is unimportant. This matches the decision
14420 making in aarch64.md. */
14421 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
14424 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14427 if (GET_CODE (op0
) == MULT
)
14430 *cost
= rtx_cost (op0
, mode
, NEG
, 0, speed
);
14435 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
14445 if (VECTOR_MODE_P (mode
))
14446 *cost
+= extra_cost
->vect
.alu
;
14448 *cost
+= extra_cost
->alu
.clz
;
14454 if (VECTOR_MODE_P (mode
))
14456 *cost
= COSTS_N_INSNS (3);
14458 *cost
+= extra_cost
->vect
.alu
* 3;
14460 else if (TARGET_CSSC
)
14462 *cost
= COSTS_N_INSNS (1);
14464 *cost
+= extra_cost
->alu
.clz
;
14468 *cost
= COSTS_N_INSNS (2);
14470 *cost
+= extra_cost
->alu
.clz
+ extra_cost
->alu
.rev
;
14478 if (op1
== const0_rtx
14479 && GET_CODE (op0
) == AND
)
14482 mode
= GET_MODE (op0
);
14486 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
14488 /* TODO: A write to the CC flags possibly costs extra, this
14489 needs encoding in the cost tables. */
14491 mode
= GET_MODE (op0
);
14493 if (GET_CODE (op0
) == AND
)
14499 if (GET_CODE (op0
) == PLUS
)
14501 /* ADDS (and CMN alias). */
14506 if (GET_CODE (op0
) == MINUS
)
14513 if (GET_CODE (op0
) == ZERO_EXTRACT
&& op1
== const0_rtx
14514 && GET_MODE (x
) == CC_NZmode
&& CONST_INT_P (XEXP (op0
, 1))
14515 && CONST_INT_P (XEXP (op0
, 2)))
14517 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14518 Handle it here directly rather than going to cost_logic
14519 since we know the immediate generated for the TST is valid
14520 so we can avoid creating an intermediate rtx for it only
14521 for costing purposes. */
14523 *cost
+= extra_cost
->alu
.logical
;
14525 *cost
+= rtx_cost (XEXP (op0
, 0), GET_MODE (op0
),
14526 ZERO_EXTRACT
, 0, speed
);
14530 if (GET_CODE (op1
) == NEG
)
14534 *cost
+= extra_cost
->alu
.arith
;
14536 *cost
+= rtx_cost (op0
, mode
, COMPARE
, 0, speed
);
14537 *cost
+= rtx_cost (XEXP (op1
, 0), mode
, NEG
, 1, speed
);
14543 Compare can freely swap the order of operands, and
14544 canonicalization puts the more complex operation first.
14545 But the integer MINUS logic expects the shift/extend
14546 operation in op1. */
14548 || (SUBREG_P (op0
) && REG_P (SUBREG_REG (op0
)))))
14556 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
14560 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
14562 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
14564 *cost
+= rtx_cost (op0
, VOIDmode
, COMPARE
, 0, speed
);
14565 /* FCMP supports constant 0.0 for no extra cost. */
14571 if (VECTOR_MODE_P (mode
))
14573 /* Vector compare. */
14575 *cost
+= extra_cost
->vect
.alu
;
14577 if (aarch64_float_const_zero_rtx_p (op1
))
14579 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14593 if (VECTOR_MODE_P (mode
))
14595 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14596 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14598 /* Recognise the SABD and UABD operation here.
14599 Recursion from the PLUS case will catch the accumulating
14601 if (aarch64_abd_rtx_p (x
))
14604 *cost
+= extra_cost
->vect
.alu
;
14607 /* SUBL2 and SUBW2.
14608 The select-operand-high-half versions of the sub instruction
14609 have the same cost as the regular three vector version -
14610 don't add the costs of the select into the costs of the sub.
14612 op0
= aarch64_strip_extend_vec_half (op0
);
14613 op1
= aarch64_strip_extend_vec_half (op1
);
14617 *cost
+= rtx_cost (op0
, mode
, MINUS
, 0, speed
);
14619 /* Detect valid immediates. */
14620 if ((GET_MODE_CLASS (mode
) == MODE_INT
14621 || (GET_MODE_CLASS (mode
) == MODE_CC
14622 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
14623 && CONST_INT_P (op1
)
14624 && aarch64_uimm12_shift (INTVAL (op1
)))
14627 /* SUB(S) (immediate). */
14628 *cost
+= extra_cost
->alu
.arith
;
14632 /* Look for SUB (extended register). */
14633 if (is_a
<scalar_int_mode
> (mode
)
14634 && aarch64_rtx_arith_op_extract_p (op1
))
14637 *cost
+= extra_cost
->alu
.extend_arith
;
14639 op1
= aarch64_strip_extend (op1
, true);
14640 *cost
+= rtx_cost (op1
, VOIDmode
,
14641 (enum rtx_code
) GET_CODE (op1
), 0, speed
);
14645 rtx new_op1
= aarch64_strip_extend (op1
, false);
14647 /* Cost this as an FMA-alike operation. */
14648 if ((GET_CODE (new_op1
) == MULT
14649 || aarch64_shift_p (GET_CODE (new_op1
)))
14650 && code
!= COMPARE
)
14652 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
14653 (enum rtx_code
) code
,
14658 *cost
+= rtx_cost (new_op1
, VOIDmode
, MINUS
, 1, speed
);
14662 if (VECTOR_MODE_P (mode
))
14665 *cost
+= extra_cost
->vect
.alu
;
14667 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14670 *cost
+= extra_cost
->alu
.arith
;
14672 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14675 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14689 if (VECTOR_MODE_P (mode
))
14691 /* ADDL2 and ADDW2. */
14692 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
14693 if (TARGET_SIMD
&& (vec_flags
& VEC_ADVSIMD
))
14695 /* The select-operand-high-half versions of the add instruction
14696 have the same cost as the regular three vector version -
14697 don't add the costs of the select into the costs of the add.
14699 op0
= aarch64_strip_extend_vec_half (op0
);
14700 op1
= aarch64_strip_extend_vec_half (op1
);
14704 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
14705 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
14708 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, PLUS
, 0, speed
);
14709 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14713 if (GET_MODE_CLASS (mode
) == MODE_INT
14714 && (aarch64_plus_immediate (op1
, mode
)
14715 || aarch64_sve_addvl_addpl_immediate (op1
, mode
)))
14717 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14721 /* ADD (immediate). */
14722 *cost
+= extra_cost
->alu
.arith
;
14724 /* Some tunings prefer to not use the VL-based scalar ops.
14725 Increase the cost of the poly immediate to prevent their
14727 if (GET_CODE (op1
) == CONST_POLY_INT
14728 && (aarch64_tune_params
.extra_tuning_flags
14729 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
))
14730 *cost
+= COSTS_N_INSNS (1);
14735 if (aarch64_pluslong_immediate (op1
, mode
))
14737 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14738 if ((INTVAL (op1
) & 0xfff) != 0)
14739 *cost
+= COSTS_N_INSNS (1);
14741 *cost
+= rtx_cost (op0
, mode
, PLUS
, 0, speed
);
14745 *cost
+= rtx_cost (op1
, mode
, PLUS
, 1, speed
);
14747 /* Look for ADD (extended register). */
14748 if (is_a
<scalar_int_mode
> (mode
)
14749 && aarch64_rtx_arith_op_extract_p (op0
))
14752 *cost
+= extra_cost
->alu
.extend_arith
;
14754 op0
= aarch64_strip_extend (op0
, true);
14755 *cost
+= rtx_cost (op0
, VOIDmode
,
14756 (enum rtx_code
) GET_CODE (op0
), 0, speed
);
14760 /* Strip any extend, leave shifts behind as we will
14761 cost them through mult_cost. */
14762 new_op0
= aarch64_strip_extend (op0
, false);
14764 if (GET_CODE (new_op0
) == MULT
14765 || aarch64_shift_p (GET_CODE (new_op0
)))
14767 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
14772 *cost
+= rtx_cost (new_op0
, VOIDmode
, PLUS
, 0, speed
);
14776 if (VECTOR_MODE_P (mode
))
14779 *cost
+= extra_cost
->vect
.alu
;
14781 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
14784 *cost
+= extra_cost
->alu
.arith
;
14786 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
14789 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
14797 *cost
= COSTS_N_INSNS (1);
14801 if (VECTOR_MODE_P (mode
))
14802 *cost
+= extra_cost
->vect
.alu
;
14804 *cost
+= extra_cost
->alu
.rev
;
14809 if (aarch_rev16_p (x
))
14811 *cost
= COSTS_N_INSNS (1);
14815 if (VECTOR_MODE_P (mode
))
14816 *cost
+= extra_cost
->vect
.alu
;
14818 *cost
+= extra_cost
->alu
.rev
;
14823 if (aarch64_extr_rtx_p (x
, &op0
, &op1
))
14825 *cost
+= rtx_cost (op0
, mode
, IOR
, 0, speed
);
14826 *cost
+= rtx_cost (op1
, mode
, IOR
, 1, speed
);
14828 *cost
+= extra_cost
->alu
.shift
;
14832 /* Fall through. */
14839 if (VECTOR_MODE_P (mode
))
14842 *cost
+= extra_cost
->vect
.alu
;
14847 && GET_CODE (op0
) == MULT
14848 && CONST_INT_P (XEXP (op0
, 1))
14849 && CONST_INT_P (op1
)
14850 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
14851 INTVAL (op1
)) != 0)
14853 /* This is a UBFM/SBFM. */
14854 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, ZERO_EXTRACT
, 0, speed
);
14856 *cost
+= extra_cost
->alu
.bfx
;
14860 if (is_int_mode (mode
, &int_mode
))
14862 if (CONST_INT_P (op1
))
14864 /* We have a mask + shift version of a UBFIZ
14865 i.e. the *andim_ashift<mode>_bfiz pattern. */
14866 if (GET_CODE (op0
) == ASHIFT
14867 && aarch64_mask_and_shift_for_ubfiz_p (int_mode
, op1
,
14870 *cost
+= rtx_cost (XEXP (op0
, 0), int_mode
,
14871 (enum rtx_code
) code
, 0, speed
);
14873 *cost
+= extra_cost
->alu
.bfx
;
14877 else if (aarch64_bitmask_imm (INTVAL (op1
), int_mode
))
14879 /* We possibly get the immediate for free, this is not
14881 *cost
+= rtx_cost (op0
, int_mode
,
14882 (enum rtx_code
) code
, 0, speed
);
14884 *cost
+= extra_cost
->alu
.logical
;
14893 /* Handle ORN, EON, or BIC. */
14894 if (GET_CODE (op0
) == NOT
)
14895 op0
= XEXP (op0
, 0);
14897 new_op0
= aarch64_strip_shift (op0
);
14899 /* If we had a shift on op0 then this is a logical-shift-
14900 by-register/immediate operation. Otherwise, this is just
14901 a logical operation. */
14904 if (new_op0
!= op0
)
14906 /* Shift by immediate. */
14907 if (CONST_INT_P (XEXP (op0
, 1)))
14908 *cost
+= extra_cost
->alu
.log_shift
;
14910 *cost
+= extra_cost
->alu
.log_shift_reg
;
14913 *cost
+= extra_cost
->alu
.logical
;
14916 /* In both cases we want to cost both operands. */
14917 *cost
+= rtx_cost (new_op0
, int_mode
, (enum rtx_code
) code
,
14919 *cost
+= rtx_cost (op1
, int_mode
, (enum rtx_code
) code
,
14929 op0
= aarch64_strip_shift (x
);
14931 if (VECTOR_MODE_P (mode
))
14934 *cost
+= extra_cost
->vect
.alu
;
14938 /* MVN-shifted-reg. */
14941 *cost
+= rtx_cost (op0
, mode
, (enum rtx_code
) code
, 0, speed
);
14944 *cost
+= extra_cost
->alu
.log_shift
;
14948 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14949 Handle the second form here taking care that 'a' in the above can
14951 else if (GET_CODE (op0
) == XOR
)
14953 rtx newop0
= XEXP (op0
, 0);
14954 rtx newop1
= XEXP (op0
, 1);
14955 rtx op0_stripped
= aarch64_strip_shift (newop0
);
14957 *cost
+= rtx_cost (newop1
, mode
, (enum rtx_code
) code
, 1, speed
);
14958 *cost
+= rtx_cost (op0_stripped
, mode
, XOR
, 0, speed
);
14962 if (op0_stripped
!= newop0
)
14963 *cost
+= extra_cost
->alu
.log_shift
;
14965 *cost
+= extra_cost
->alu
.logical
;
14972 *cost
+= extra_cost
->alu
.logical
;
14979 /* If a value is written in SI mode, then zero extended to DI
14980 mode, the operation will in general be free as a write to
14981 a 'w' register implicitly zeroes the upper bits of an 'x'
14982 register. However, if this is
14984 (set (reg) (zero_extend (reg)))
14986 we must cost the explicit register move. */
14988 && GET_MODE (op0
) == SImode
)
14990 int op_cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, 0, speed
);
14992 /* If OP_COST is non-zero, then the cost of the zero extend
14993 is effectively the cost of the inner operation. Otherwise
14994 we have a MOV instruction and we take the cost from the MOV
14995 itself. This is true independently of whether we are
14996 optimizing for space or time. */
15002 else if (MEM_P (op0
))
15004 /* All loads can zero extend to any size for free. */
15005 *cost
= rtx_cost (op0
, VOIDmode
, ZERO_EXTEND
, param
, speed
);
15009 op0
= aarch64_extend_bitfield_pattern_p (x
);
15012 *cost
+= rtx_cost (op0
, mode
, ZERO_EXTEND
, 0, speed
);
15014 *cost
+= extra_cost
->alu
.bfx
;
15020 if (VECTOR_MODE_P (mode
))
15023 *cost
+= extra_cost
->vect
.alu
;
15027 /* We generate an AND instead of UXTB/UXTH. */
15028 *cost
+= extra_cost
->alu
.logical
;
15034 if (MEM_P (XEXP (x
, 0)))
15039 rtx address
= XEXP (XEXP (x
, 0), 0);
15040 *cost
+= extra_cost
->ldst
.load_sign_extend
;
15043 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
15049 op0
= aarch64_extend_bitfield_pattern_p (x
);
15052 *cost
+= rtx_cost (op0
, mode
, SIGN_EXTEND
, 0, speed
);
15054 *cost
+= extra_cost
->alu
.bfx
;
15060 if (VECTOR_MODE_P (mode
))
15061 *cost
+= extra_cost
->vect
.alu
;
15063 *cost
+= extra_cost
->alu
.extend
;
15075 if (CONST_INT_P (op1
))
15079 if (VECTOR_MODE_P (mode
))
15081 /* Vector shift (immediate). */
15082 *cost
+= extra_cost
->vect
.alu
;
/* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
   These are all aliases.  */
15088 *cost
+= extra_cost
->alu
.shift
;
15092 /* We can incorporate zero/sign extend for free. */
15093 if (GET_CODE (op0
) == ZERO_EXTEND
15094 || GET_CODE (op0
) == SIGN_EXTEND
)
15095 op0
= XEXP (op0
, 0);
15097 *cost
+= rtx_cost (op0
, VOIDmode
, ASHIFT
, 0, speed
);
15102 if (VECTOR_MODE_P (mode
))
15105 /* Vector shift (register). */
15106 *cost
+= extra_cost
->vect
.alu
;
15112 *cost
+= extra_cost
->alu
.shift_reg
;
15114 /* The register shift amount may be in a shorter mode expressed
15115 as a lowpart SUBREG. For costing purposes just look inside. */
15116 if (SUBREG_P (op1
) && subreg_lowpart_p (op1
))
15117 op1
= SUBREG_REG (op1
);
15118 if (GET_CODE (op1
) == AND
&& REG_P (XEXP (op1
, 0))
15119 && CONST_INT_P (XEXP (op1
, 1))
15120 && known_eq (INTVAL (XEXP (op1
, 1)),
15121 GET_MODE_BITSIZE (mode
) - 1))
15123 *cost
+= rtx_cost (op0
, mode
, (rtx_code
) code
, 0, speed
);
15124 /* We already demanded XEXP (op1, 0) to be REG_P, so
15125 don't recurse into it. */
15129 return false; /* All arguments need to be in registers. */
15134 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
15135 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
)
15139 *cost
+= extra_cost
->ldst
.load
;
15141 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
15142 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
15144 /* ADRP, followed by ADD. */
15145 *cost
+= COSTS_N_INSNS (1);
15147 *cost
+= 2 * extra_cost
->alu
.arith
;
15149 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
15150 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
15154 *cost
+= extra_cost
->alu
.arith
;
15159 /* One extra load instruction, after accessing the GOT. */
15160 *cost
+= COSTS_N_INSNS (1);
15162 *cost
+= extra_cost
->ldst
.load
;
15168 /* ADRP/ADD (immediate). */
15170 *cost
+= extra_cost
->alu
.arith
;
15178 if (VECTOR_MODE_P (mode
))
15179 *cost
+= extra_cost
->vect
.alu
;
15181 *cost
+= extra_cost
->alu
.bfx
;
15184 /* We can trust that the immediates used will be correct (there
15185 are no by-register forms), so we need only cost op0. */
15186 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15190 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
15191 /* aarch64_rtx_mult_cost always handles recursion to its
15196 /* We can expand signed mod by power of 2 using a NEGS, two parallel
15197 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
15198 an unconditional negate. This case should only ever be reached through
15199 the set_smod_pow2_cheap check in expmed.cc. */
15200 if (CONST_INT_P (XEXP (x
, 1))
15201 && exact_log2 (INTVAL (XEXP (x
, 1))) > 0
15202 && (mode
== SImode
|| mode
== DImode
))
15204 /* We expand to 4 instructions. Reset the baseline. */
15205 *cost
= COSTS_N_INSNS (4);
15208 *cost
+= 2 * extra_cost
->alu
.logical
15209 + 2 * extra_cost
->alu
.arith
;
15214 /* Fall-through. */
/* Slightly prefer UMOD over SMOD.  */
15219 if (VECTOR_MODE_P (mode
))
15220 *cost
+= extra_cost
->vect
.alu
;
15221 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15222 *cost
+= (extra_cost
->mult
[mode
== DImode
].add
15223 + extra_cost
->mult
[mode
== DImode
].idiv
15224 + (code
== MOD
? 1 : 0));
15226 return false; /* All arguments need to be in registers. */
15233 if (VECTOR_MODE_P (mode
))
15234 *cost
+= extra_cost
->vect
.alu
;
15235 else if (GET_MODE_CLASS (mode
) == MODE_INT
)
15236 /* There is no integer SQRT, so only DIV and UDIV can get
15238 *cost
+= (extra_cost
->mult
[mode
== DImode
].idiv
/* Slightly prefer UDIV over SDIV.  */
15240 + (code
== DIV
? 1 : 0));
15242 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
15244 return false; /* All arguments need to be in registers. */
15247 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
15248 XEXP (x
, 2), cost
, speed
);
15261 return false; /* All arguments must be in registers. */
15270 if (VECTOR_MODE_P (mode
))
15271 *cost
+= extra_cost
->vect
.alu
;
15273 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
15276 /* FMSUB, FNMADD, and FNMSUB are free. */
15277 if (GET_CODE (op0
) == NEG
)
15278 op0
= XEXP (op0
, 0);
15280 if (GET_CODE (op2
) == NEG
)
15281 op2
= XEXP (op2
, 0);
15283 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15284 and the by-element operand as operand 0. */
15285 if (GET_CODE (op1
) == NEG
)
15286 op1
= XEXP (op1
, 0);
15288 /* Catch vector-by-element operations. The by-element operand can
15289 either be (vec_duplicate (vec_select (x))) or just
15290 (vec_select (x)), depending on whether we are multiplying by
15291 a vector or a scalar.
15293 Canonicalization is not very good in these cases, FMA4 will put the
15294 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15295 if (GET_CODE (op0
) == VEC_DUPLICATE
)
15296 op0
= XEXP (op0
, 0);
15297 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
15298 op1
= XEXP (op1
, 0);
15300 if (GET_CODE (op0
) == VEC_SELECT
)
15301 op0
= XEXP (op0
, 0);
15302 else if (GET_CODE (op1
) == VEC_SELECT
)
15303 op1
= XEXP (op1
, 0);
15305 /* If the remaining parameters are not registers,
15306 get the cost to put them into registers. */
15307 *cost
+= rtx_cost (op0
, mode
, FMA
, 0, speed
);
15308 *cost
+= rtx_cost (op1
, mode
, FMA
, 1, speed
);
15309 *cost
+= rtx_cost (op2
, mode
, FMA
, 2, speed
);
15313 case UNSIGNED_FLOAT
:
15315 *cost
+= extra_cost
->fp
[mode
== DFmode
].fromint
;
15321 if (VECTOR_MODE_P (mode
))
/* Vector truncate.  */
15324 *cost
+= extra_cost
->vect
.alu
;
15327 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
15331 case FLOAT_TRUNCATE
:
15334 if (VECTOR_MODE_P (mode
))
/* Vector conversion.  */
15337 *cost
+= extra_cost
->vect
.alu
;
15340 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
15347 /* Strip the rounding part. They will all be implemented
15348 by the fcvt* family of instructions anyway. */
15349 if (GET_CODE (x
) == UNSPEC
)
15351 unsigned int uns_code
= XINT (x
, 1);
15353 if (uns_code
== UNSPEC_FRINTA
15354 || uns_code
== UNSPEC_FRINTM
15355 || uns_code
== UNSPEC_FRINTN
15356 || uns_code
== UNSPEC_FRINTP
15357 || uns_code
== UNSPEC_FRINTZ
)
15358 x
= XVECEXP (x
, 0, 0);
15363 if (VECTOR_MODE_P (mode
))
15364 *cost
+= extra_cost
->vect
.alu
;
15366 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
15369 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15370 fixed-point fcvt. */
15371 if (GET_CODE (x
) == MULT
15372 && ((VECTOR_MODE_P (mode
)
15373 && aarch64_vec_fpconst_pow_of_2 (XEXP (x
, 1)) > 0)
15374 || aarch64_fpconst_pow_of_2 (XEXP (x
, 1)) > 0))
15376 *cost
+= rtx_cost (XEXP (x
, 0), VOIDmode
, (rtx_code
) code
,
15381 *cost
+= rtx_cost (x
, VOIDmode
, (enum rtx_code
) code
, 0, speed
);
15385 if (VECTOR_MODE_P (mode
))
15387 /* ABS (vector). */
15389 *cost
+= extra_cost
->vect
.alu
;
15391 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
15395 /* FABD, which is analogous to FADD. */
15396 if (GET_CODE (op0
) == MINUS
)
15398 *cost
+= rtx_cost (XEXP (op0
, 0), mode
, MINUS
, 0, speed
);
15399 *cost
+= rtx_cost (XEXP (op0
, 1), mode
, MINUS
, 1, speed
);
15401 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15405 /* Simple FABS is analogous to FNEG. */
15407 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
15411 /* Integer ABS will either be split to
15412 two arithmetic instructions, or will be an ABS
15413 (scalar), which we don't model. */
15414 *cost
= COSTS_N_INSNS (2);
15416 *cost
+= 2 * extra_cost
->alu
.arith
;
15424 if (VECTOR_MODE_P (mode
))
15425 *cost
+= extra_cost
->vect
.alu
;
15428 /* FMAXNM/FMINNM/FMAX/FMIN.
15429 TODO: This may not be accurate for all implementations, but
15430 we do not model this in the cost tables. */
15431 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
15437 /* The floating point round to integer frint* instructions. */
15438 if (aarch64_frint_unspec_p (XINT (x
, 1)))
15441 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
15449 /* Decompose <su>muldi3_highpart. */
15450 if (/* (truncate:DI */
15453 && GET_MODE (XEXP (x
, 0)) == TImode
15454 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
15456 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
15457 /* (ANY_EXTEND:TI (reg:DI))
15458 (ANY_EXTEND:TI (reg:DI))) */
15459 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
15460 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
15461 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
15462 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
15463 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
15464 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
15465 /* (const_int 64) */
15466 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
15467 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
15471 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
15472 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
15473 mode
, MULT
, 0, speed
);
15474 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
15475 mode
, MULT
, 1, speed
);
15481 /* Load using MOVI/MVNI. */
15482 if (aarch64_simd_valid_immediate (x
, NULL
))
15483 *cost
= extra_cost
->vect
.movi
;
15484 else /* Load using constant pool. */
15485 *cost
= extra_cost
->ldst
.load
;
15489 /* depending on the operation, either DUP or INS.
15490 For now, keep default costing. */
15492 case VEC_DUPLICATE
:
15493 /* Load using a DUP. */
15494 *cost
= extra_cost
->vect
.dup
;
15498 rtx op0
= XEXP (x
, 0);
15499 *cost
= rtx_cost (op0
, GET_MODE (op0
), VEC_SELECT
, 0, speed
);
15501 /* cost subreg of 0 as free, otherwise as DUP */
15502 rtx op1
= XEXP (x
, 1);
15503 if (vec_series_lowpart_p (mode
, GET_MODE (op1
), op1
))
15505 else if (vec_series_highpart_p (mode
, GET_MODE (op1
), op1
))
15506 *cost
= extra_cost
->vect
.dup
;
15508 *cost
= extra_cost
->vect
.extract
;
15516 && flag_aarch64_verbose_cost
)
15517 fprintf (dump_file
,
15518 "\nFailed to cost RTX. Assuming default cost.\n");
/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
   calculated for X.  This cost is stored in *COST.  Returns true
   if the total cost of X was calculated.  */
static bool
aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
			   int param, int *cost, bool speed)
{
  bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);

  if (dump_file
      && flag_aarch64_verbose_cost)
    {
      print_rtl_single (dump_file, x);
      fprintf (dump_file, "\n%s cost: %d (%s)\n",
	       speed ? "Hot" : "Cold",
	       *cost, result ? "final" : "partial");
    }

  return result;
}
15545 aarch64_register_move_cost (machine_mode mode
,
15546 reg_class_t from_i
, reg_class_t to_i
)
15548 enum reg_class from
= (enum reg_class
) from_i
;
15549 enum reg_class to
= (enum reg_class
) to_i
;
15550 const struct cpu_regmove_cost
*regmove_cost
15551 = aarch64_tune_params
.regmove_cost
;
/* Treat any subset of POINTER_REGS as though it were GENERAL_REGS.  */
15554 if (reg_class_subset_p (to
, POINTER_REGS
))
15557 if (reg_class_subset_p (from
, POINTER_REGS
))
15558 from
= GENERAL_REGS
;
15560 /* Make RDFFR very expensive. In particular, if we know that the FFR
15561 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15562 as a way of obtaining a PTRUE. */
15563 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
15564 && hard_reg_set_subset_p (reg_class_contents
[from_i
],
15565 reg_class_contents
[FFR_REGS
]))
15568 /* Moving between GPR and stack cost is the same as GP2GP. */
15569 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
15570 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
15571 return regmove_cost
->GP2GP
;
15573 /* To/From the stack register, we move via the gprs. */
15574 if (to
== STACK_REG
|| from
== STACK_REG
)
15575 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
15576 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
15578 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
15579 if (vec_flags
!= (VEC_ADVSIMD
| VEC_STRUCT
| VEC_PARTIAL
)
15580 && known_eq (GET_MODE_SIZE (mode
), 16))
15582 /* 128-bit operations on general registers require 2 instructions. */
15583 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15584 return regmove_cost
->GP2GP
* 2;
15585 else if (from
== GENERAL_REGS
)
15586 return regmove_cost
->GP2FP
* 2;
15587 else if (to
== GENERAL_REGS
)
15588 return regmove_cost
->FP2GP
* 2;
15590 /* When AdvSIMD instructions are disabled it is not possible to move
15591 a 128-bit value directly between Q registers. This is handled in
15592 secondary reload. A general register is used as a scratch to move
15593 the upper DI value and the lower DI value is moved directly,
15594 hence the cost is the sum of three moves. */
15595 if (!TARGET_SIMD
&& !TARGET_SVE
)
15596 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
15598 return regmove_cost
->FP2FP
;
15601 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
15602 return regmove_cost
->GP2GP
;
15603 else if (from
== GENERAL_REGS
)
15604 return regmove_cost
->GP2FP
;
15605 else if (to
== GENERAL_REGS
)
15606 return regmove_cost
->FP2GP
;
15608 if (!TARGET_SIMD
&& vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
15610 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15611 The cost must be greater than 2 units to indicate that direct
15612 moves aren't possible. */
15613 auto per_vector
= (aarch64_tune_params
.memmov_cost
.load_fp
15614 + aarch64_tune_params
.memmov_cost
.store_fp
);
15615 return MIN (CEIL (per_vector
, 2), 4);
15618 return regmove_cost
->FP2FP
;
15621 /* Implements TARGET_MEMORY_MOVE_COST. */
15623 aarch64_memory_move_cost (machine_mode mode
, reg_class_t rclass_i
, bool in
)
15625 enum reg_class rclass
= (enum reg_class
) rclass_i
;
15626 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_BOOL
15627 ? reg_classes_intersect_p (rclass
, PR_REGS
)
15628 : reg_class_subset_p (rclass
, PR_REGS
))
15630 ? aarch64_tune_params
.memmov_cost
.load_pred
15631 : aarch64_tune_params
.memmov_cost
.store_pred
);
15633 if (VECTOR_MODE_P (mode
) || FLOAT_MODE_P (mode
)
15634 ? reg_classes_intersect_p (rclass
, FP_REGS
)
15635 : reg_class_subset_p (rclass
, FP_REGS
))
15637 ? aarch64_tune_params
.memmov_cost
.load_fp
15638 : aarch64_tune_params
.memmov_cost
.store_fp
);
15641 ? aarch64_tune_params
.memmov_cost
.load_int
15642 : aarch64_tune_params
.memmov_cost
.store_int
);
/* Implement TARGET_INSN_COST.  We have the opportunity to do something
   much more productive here, such as using insn attributes to cost things.
   But we don't, not yet.

   The main point of this current definition is to make calling insn_cost
   on one instruction equivalent to calling seq_cost on a sequence that
   contains only that instruction.  The default definition would instead
   only look at SET_SRCs, ignoring SET_DESTs.

   This ensures that, for example, storing a 128-bit zero vector is more
   expensive than storing a 128-bit vector register.  A move of zero
   into a 128-bit vector register followed by multiple stores of that
   register is then cheaper than multiple stores of zero (which would
   use STP of XZR).  This in turn allows STP Qs to be formed.  */
static int
aarch64_insn_cost (rtx_insn *insn, bool speed)
{
  if (rtx set = single_set (insn))
    return set_rtx_cost (set, speed);
  return pattern_cost (PATTERN (insn), speed);
}
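
/* Illustrative sketch, not part of the build: the property the comment
   above relies on.  seq_cost costs a single-set instruction through
   set_rtx_cost, which is exactly what the hook above returns, so the two
   agree for a lone instruction.  */
#if 0
static void
aarch64_insn_cost_example (rtx_insn *insn)
{
  /* Assumes INSN is a single-set instruction that is not linked to any
     following instruction; otherwise seq_cost would also cost its
     successors.  */
  gcc_checking_assert (insn_cost (insn, true) == seq_cost (insn, true));
}
#endif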
/* Implement TARGET_INIT_BUILTINS.  */
static void
aarch64_init_builtins ()
{
  aarch64_general_init_builtins ();
  aarch64_sve::init_builtins ();
#ifdef SUBTARGET_INIT_BUILTINS
  SUBTARGET_INIT_BUILTINS;
#endif
}

/* Implement TARGET_FOLD_BUILTIN.  */
static tree
aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
{
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  tree type = TREE_TYPE (TREE_TYPE (fndecl));
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_fold_builtin (subcode, type, nargs, args);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}

/* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
static bool
aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
  tree fndecl = gimple_call_fndecl (stmt);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  gimple *new_stmt = NULL;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
      break;

    case AARCH64_BUILTIN_SVE:
      new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
      break;
    }

  if (!new_stmt)
    return false;

  gsi_replace (gsi, new_stmt, false);
  return true;
}

/* Implement TARGET_EXPAND_BUILTIN.  */
static rtx
aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_expand_builtin (subcode, exp, target, ignore);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::expand_builtin (subcode, exp, target);
    }
  gcc_unreachable ();
}

/* Implement TARGET_BUILTIN_DECL.  */
static tree
aarch64_builtin_decl (unsigned int code, bool initialize_p)
{
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_decl (subcode, initialize_p);

    case AARCH64_BUILTIN_SVE:
      return aarch64_sve::builtin_decl (subcode, initialize_p);
    }
  gcc_unreachable ();
}
/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
   to optimize 1.0/sqrt.  */

static bool
use_rsqrt_p (machine_mode mode)
{
  return (!flag_trapping_math
	  && flag_unsafe_math_optimizations
	  && ((aarch64_tune_params.approx_modes->recip_sqrt
	       & AARCH64_APPROX_MODE (mode))
	      || flag_mrecip_low_precision_sqrt));
}
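
/* Note (editorial, based on the usual option-to-flag mapping):
   flag_trapping_math, flag_unsafe_math_optimizations and
   flag_mrecip_low_precision_sqrt correspond to -ftrapping-math,
   -funsafe-math-optimizations and -mlow-precision-recip-sqrt
   respectively, so in practice this path is enabled under -ffast-math
   together with either the per-CPU approximation tuning or the explicit
   low-precision option.  */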
/* Function to decide when to use the approximate reciprocal square root
   builtin.  */

static tree
aarch64_builtin_reciprocal (tree fndecl)
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));

  if (!use_rsqrt_p (mode))
    return NULL_TREE;
  unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
  unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
  switch (code & AARCH64_BUILTIN_CLASS)
    {
    case AARCH64_BUILTIN_GENERAL:
      return aarch64_general_builtin_rsqrt (subcode);

    case AARCH64_BUILTIN_SVE:
      return NULL_TREE;
    }
  gcc_unreachable ();
}
/* Emit code to perform the floating-point operation:

     DST = SRC1 * SRC2

   where all three operands are already known to be registers.
   If the operation is an SVE one, PTRUE is a suitable all-true
   predicate.  */

static void
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
{
  if (ptrue)
    emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
				 dst, ptrue, src1, src2,
				 gen_int_mode (SVE_RELAXED_GP, SImode)));
  else
    emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
}
/* Emit instruction sequence to compute either the approximate square root
   or its approximate reciprocal, depending on the flag RECP, and return
   whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
  machine_mode mode = GET_MODE (dst);

  if (GET_MODE_INNER (mode) == HFmode)
    {
      gcc_assert (!recp);
      return false;
    }

  if (!recp)
    {
      if (!(flag_mlow_precision_sqrt
	    || (aarch64_tune_params.approx_modes->sqrt
		& AARCH64_APPROX_MODE (mode))))
	return false;

      if (!flag_finite_math_only
	  || flag_trapping_math
	  || !flag_unsafe_math_optimizations
	  || optimize_function_for_size_p (cfun))
	return false;
    }
  else
    /* Caller assumes we cannot fail.  */
    gcc_assert (use_rsqrt_p (mode));

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
  machine_mode mmsk = (VECTOR_MODE_P (mode)
		       ? related_int_vector_mode (mode).require ()
		       : int_mode_for_mode (mode).require ());
  rtx xmsk = NULL_RTX;
  if (!recp)
    {
      /* When calculating the approximate square root, compare the
	 argument with 0.0 and create a mask.  */
      rtx zero = CONST0_RTX (mode);
      if (pg)
	{
	  xmsk = gen_reg_rtx (GET_MODE (pg));
	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
					   xmsk, pg, hint, src, zero));
	}
      else
	{
	  xmsk = gen_reg_rtx (mmsk);
	  emit_insn (gen_rtx_SET (xmsk,
				  gen_rtx_NEG (mmsk,
					       gen_rtx_EQ (mmsk, src, zero))));
	}
    }

  /* Estimate the approximate reciprocal square root.  */
  rtx xdst = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_rsqrte (mode, xdst, src));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series once less for faster performance
     while sacrificing the accuracy.  */
  if ((recp && flag_mrecip_low_precision_sqrt)
      || (!recp && flag_mlow_precision_sqrt))
    iterations--;

  /* Iterate over the series to calculate the approximate reciprocal square
     root.  */
  rtx x1 = gen_reg_rtx (mode);
  while (iterations--)
    {
      rtx x2 = gen_reg_rtx (mode);
      aarch64_emit_mult (x2, pg, xdst, xdst);

      emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));

      if (iterations > 0)
	aarch64_emit_mult (xdst, pg, xdst, x1);
    }

  if (!recp)
    {
      if (pg)
	/* Multiply nonzero source values by the corresponding intermediate
	   result elements, so that the final calculation is the approximate
	   square root rather than its reciprocal.  Select a zero result for
	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
	   otherwise.  */
	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
      else
	{
	  /* Qualify the approximate reciprocal square root when the
	     argument is 0.0 by squashing the intermediary result to 0.0.  */
	  rtx xtmp = gen_reg_rtx (mmsk);
	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
					    gen_rtx_SUBREG (mmsk, xdst, 0)));
	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));

	  /* Calculate the approximate square root.  */
	  aarch64_emit_mult (xdst, pg, xdst, src);
	}
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (dst, pg, xdst, x1);

  return true;
}
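
/* Illustrative note on the math above (editorial, not part of the original
   code): FRSQRTE provides an initial estimate x0 ~= 1/sqrt(d), and each
   FRSQRTS step computes (3 - d * x * x) / 2, so the loop performs the
   Newton-Raphson iteration

       x_{n+1} = x_n * (3 - d * x_n^2) / 2

   which roughly doubles the number of correct bits per step; hence two
   iterations for SFmode and three for DFmode.  */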
/* Emit the instruction sequence to compute the approximation for the division
   of NUM by DEN in QUO and return whether the sequence was emitted or not.  */

bool
aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
{
  machine_mode mode = GET_MODE (quo);

  if (GET_MODE_INNER (mode) == HFmode)
    return false;

  bool use_approx_division_p = (flag_mlow_precision_div
				|| (aarch64_tune_params.approx_modes->division
				    & AARCH64_APPROX_MODE (mode)));

  if (!flag_finite_math_only
      || flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_function_for_size_p (cfun)
      || !use_approx_division_p)
    return false;

  if (!TARGET_SIMD && VECTOR_MODE_P (mode))
    return false;

  rtx pg = NULL_RTX;
  if (aarch64_sve_mode_p (mode))
    pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));

  /* Estimate the approximate reciprocal.  */
  rtx xrcp = gen_reg_rtx (mode);
  emit_insn (gen_aarch64_frecpe (mode, xrcp, den));

  /* Iterate over the series twice for SF and thrice for DF.  */
  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;

  /* Optionally iterate over the series less for faster performance,
     while sacrificing the accuracy.  The default is 2 for DF and 1 for SF.  */
  if (flag_mlow_precision_div)
    iterations = (GET_MODE_INNER (mode) == DFmode
		  ? aarch64_double_recp_precision
		  : aarch64_float_recp_precision);

  /* Iterate over the series to calculate the approximate reciprocal.  */
  rtx xtmp = gen_reg_rtx (mode);
  while (iterations--)
    {
      emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));

      if (iterations > 0)
	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
    }

  if (num != CONST1_RTX (mode))
    {
      /* As the approximate reciprocal of DEN is already calculated, only
	 calculate the approximate division when NUM is not 1.0.  */
      rtx xnum = force_reg (mode, num);
      aarch64_emit_mult (xrcp, pg, xrcp, xnum);
    }

  /* Finalize the approximation.  */
  aarch64_emit_mult (quo, pg, xrcp, xtmp);
  return true;
}
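
/* Illustrative note on the math above (editorial, not part of the original
   code): FRECPE provides an initial estimate x0 ~= 1/d, and each FRECPS
   step computes 2 - d * x, so the loop performs the Newton-Raphson
   iteration

       x_{n+1} = x_n * (2 - d * x_n)

   for the reciprocal; the final multiplication by NUM (when NUM is not
   1.0) turns the reciprocal into the quotient.  */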
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
{
  return aarch64_tune_params.issue_rate;
}
/* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
static int
aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
{
  if (DEBUG_INSN_P (insn))
    return more;

  rtx_code code = GET_CODE (PATTERN (insn));
  if (code == USE || code == CLOBBER)
    return more;

  if (get_attr_type (insn) == TYPE_NO_INSN)
    return more;

  return more - 1;
}

static int
aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
{
  int issue_rate = aarch64_sched_issue_rate ();

  return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
}
/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
   autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
   has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */

static int
aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
						   int ready_index)
{
  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
}
/* Vectorizer cost model target hooks.  */

/* If a vld1 from address ADDR should be recorded in vector_load_decls,
   return the decl that should be recorded.  Return null otherwise.  */
static tree
aarch64_vector_load_decl (tree addr)
{
  if (TREE_CODE (addr) != ADDR_EXPR)
    return NULL_TREE;
  tree base = get_base_address (TREE_OPERAND (addr, 0));
  if (TREE_CODE (base) != VAR_DECL)
    return NULL_TREE;
  return base;
}
/* Return true if STMT_INFO accesses a decl that is known to be the
   argument to a vld1 in the same function.  */
static bool
aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
{
  if (!cfun->machine->vector_load_decls)
    return false;
  auto dr = STMT_VINFO_DATA_REF (stmt_info);
  if (!dr)
    return false;
  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
  return decl && cfun->machine->vector_load_decls->contains (decl);
}
16069 /* Information about how the CPU would issue the scalar, Advanced SIMD
16070 or SVE version of a vector loop, using the scheme defined by the
16071 aarch64_base_vec_issue_info hierarchy of structures. */
16072 class aarch64_vec_op_count
16075 aarch64_vec_op_count () = default;
16076 aarch64_vec_op_count (const aarch64_vec_issue_info
*, unsigned int,
16079 unsigned int vec_flags () const { return m_vec_flags
; }
16080 unsigned int vf_factor () const { return m_vf_factor
; }
16082 const aarch64_base_vec_issue_info
*base_issue_info () const;
16083 const aarch64_simd_vec_issue_info
*simd_issue_info () const;
16084 const aarch64_sve_vec_issue_info
*sve_issue_info () const;
16086 fractional_cost
rename_cycles_per_iter () const;
16087 fractional_cost
min_nonpred_cycles_per_iter () const;
16088 fractional_cost
min_pred_cycles_per_iter () const;
16089 fractional_cost
min_cycles_per_iter () const;
16091 void dump () const;
16093 /* The number of individual "general" operations. See the comments
16094 in aarch64_base_vec_issue_info for details. */
16095 unsigned int general_ops
= 0;
16097 /* The number of load and store operations, under the same scheme
16099 unsigned int loads
= 0;
16100 unsigned int stores
= 0;
16102 /* The minimum number of cycles needed to execute all loop-carried
16103 operations, which in the vector code become associated with
16105 unsigned int reduction_latency
= 0;
16107 /* The number of individual predicate operations. See the comments
16108 in aarch64_sve_vec_issue_info for details. */
16109 unsigned int pred_ops
= 0;
16112 /* The issue information for the core. */
16113 const aarch64_vec_issue_info
*m_issue_info
= nullptr;
16115 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
16116 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
16117 Advanced SIMD code.
16118 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
16120 unsigned int m_vec_flags
= 0;
16122 /* Assume that, when the code is executing on the core described
16123 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
16124 times more data than the vectorizer anticipates.
16126 This is only ever different from 1 for SVE. It allows us to consider
16127 what would happen on a 256-bit SVE target even when the -mtune
16128 parameters say that the “likely” SVE length is 128 bits. */
16129 unsigned int m_vf_factor
= 1;
aarch64_vec_op_count::
aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
		      unsigned int vec_flags, unsigned int vf_factor)
  : m_issue_info (issue_info),
    m_vec_flags (vec_flags),
    m_vf_factor (vf_factor)
{
}
/* Return the base issue information (i.e. the parts that make sense
   for both scalar and vector code).  Return null if we have no issue
   information.  */
const aarch64_base_vec_issue_info *
aarch64_vec_op_count::base_issue_info () const
{
  if (auto *ret = simd_issue_info ())
    return ret;
  return m_issue_info->scalar;
}

/* If the structure describes vector code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_simd_vec_issue_info *
aarch64_vec_op_count::simd_issue_info () const
{
  if (auto *ret = sve_issue_info ())
    return ret;
  if (m_vec_flags)
    return m_issue_info->advsimd;
  return nullptr;
}

/* If the structure describes SVE code and we have associated issue
   information, return that issue information, otherwise return null.  */
const aarch64_sve_vec_issue_info *
aarch64_vec_op_count::sve_issue_info () const
{
  if (m_vec_flags & VEC_ANY_SVE)
    return m_issue_info->sve;
  return nullptr;
}
/* Estimate the minimum number of cycles per iteration needed to rename
   the instructions.

   ??? For now this is done inline rather than via cost tables, since it
   isn't clear how it should be parameterized for the general case.  */
fractional_cost
aarch64_vec_op_count::rename_cycles_per_iter () const
{
  if (sve_issue_info () == &neoverse512tvb_sve_issue_info
      || sve_issue_info () == &neoversen2_sve_issue_info
      || sve_issue_info () == &neoversev2_sve_issue_info)
    /* + 1 for an addition.  We've already counted a general op for each
       store, so we don't need to account for stores separately.  The branch
       reads no registers and so does not need to be counted either.

       ??? This value is very much on the pessimistic side, but seems to work
       pretty well in practice.  */
    return { general_ops + loads + pred_ops + 1, 5 };

  return 0;
}
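
/* Worked example (editorial, illustrative only): with general_ops = 6,
   loads = 2 and pred_ops = 2, the formula above gives
   (6 + 2 + 2 + 1) / 5, i.e. an estimated 2.2 rename cycles per
   iteration.  */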
/* Like min_cycles_per_iter, but excluding predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
{
  auto *issue_info = base_issue_info ();

  fractional_cost cycles = MAX (reduction_latency, 1);
  cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
  cycles = std::max (cycles, { loads + stores,
			       issue_info->loads_stores_per_cycle });
  cycles = std::max (cycles, { general_ops,
			       issue_info->general_ops_per_cycle });
  cycles = std::max (cycles, rename_cycles_per_iter ());
  return cycles;
}

/* Like min_cycles_per_iter, but including only the predicate operations.  */
fractional_cost
aarch64_vec_op_count::min_pred_cycles_per_iter () const
{
  if (auto *issue_info = sve_issue_info ())
    return { pred_ops, issue_info->pred_ops_per_cycle };
  return 0;
}

/* Estimate the minimum number of cycles needed to issue the operations.
   This is a very simplistic model!  */
fractional_cost
aarch64_vec_op_count::min_cycles_per_iter () const
{
  return std::max (min_nonpred_cycles_per_iter (),
		   min_pred_cycles_per_iter ());
}
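
/* Worked example (editorial, illustrative only): on a core that can issue
   2 loads/stores, 1 store and 4 general ops per cycle, a loop body with
   6 general ops, 3 loads, 1 store and a reduction latency of 2 gives
   max (2, 1/1, 4/2, 6/4) = 2 non-predicate cycles per iteration, before
   the rename and predicate terms are taken into account.  */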
/* Dump information about the structure.  */
void
aarch64_vec_op_count::dump () const
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  load operations = %d\n", loads);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  store operations = %d\n", stores);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  general operations = %d\n", general_ops);
  if (sve_issue_info ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  predicate operations = %d\n", pred_ops);
  dump_printf_loc (MSG_NOTE, vect_location,
		   "  reduction latency = %d\n", reduction_latency);
  if (auto rcpi = rename_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated cycles per iteration to rename = %f\n",
		     rcpi.as_double ());
  if (auto pred_cpi = min_pred_cycles_per_iter ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " without predication = %f\n",
		       min_nonpred_cycles_per_iter ().as_double ());
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  estimated min cycles per iteration"
		       " for predication = %f\n", pred_cpi.as_double ());
    }
  if (auto cpi = min_cycles_per_iter ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  estimated min cycles per iteration = %f\n",
		     cpi.as_double ());
}
16265 /* Information about vector code that we're in the process of costing. */
16266 class aarch64_vector_costs
: public vector_costs
16269 aarch64_vector_costs (vec_info
*, bool);
16271 unsigned int add_stmt_cost (int count
, vect_cost_for_stmt kind
,
16272 stmt_vec_info stmt_info
, slp_tree
, tree vectype
,
16274 vect_cost_model_location where
) override
;
16275 void finish_cost (const vector_costs
*) override
;
16276 bool better_main_loop_than_p (const vector_costs
*other
) const override
;
16279 void record_potential_advsimd_unrolling (loop_vec_info
);
16280 void analyze_loop_vinfo (loop_vec_info
);
16281 void count_ops (unsigned int, vect_cost_for_stmt
, stmt_vec_info
, slp_tree
,
16282 aarch64_vec_op_count
*);
16283 fractional_cost
adjust_body_cost_sve (const aarch64_vec_op_count
*,
16284 fractional_cost
, unsigned int,
16285 unsigned int *, bool *);
16286 unsigned int adjust_body_cost (loop_vec_info
, const aarch64_vector_costs
*,
16288 bool prefer_unrolled_loop () const;
16289 unsigned int determine_suggested_unroll_factor ();
16291 /* True if we have performed one-time initialization based on the
16293 bool m_analyzed_vinfo
= false;
16295 /* This loop uses an average operation that is not supported by SVE, but is
16296 supported by Advanced SIMD and SVE2. */
16297 bool m_has_avg
= false;
16299 /* Additional initialization costs for using gather or scatter operation in
16300 the current loop. */
16301 unsigned int m_sve_gather_scatter_init_cost
= 0;
16303 /* True if the vector body contains a store to a decl and if the
16304 function is known to have a vld1 from the same decl.
16306 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16307 initializing a vector is:
16309 float f[4] = { elts };
16310 float32x4_t x = vld1q_f32(f);
16312 We should strongly prefer vectorization of the initialization of f,
16313 so that the store to f and the load back can be optimized away,
16314 leaving a vectorization of { elts }. */
16315 bool m_stores_to_vector_load_decl
= false;
16317 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16318 In this case the value is the number of insns in the last operation.
16320 On AArch64 vector promotion and demotions require us to first widen or
16321 narrow the input and only after that emit conversion instructions. For
16322 costing this means we need to emit the cost of the final conversions as
16324 unsigned int m_num_last_promote_demote
= 0;
16326 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16327 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16329 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16330 unsigned int m_vec_flags
= 0;
16332 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16333 This means that code such as:
16338 will be costed as two scalar instructions and two vector instructions
16339 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16340 wins if the costs are equal, because of the fact that the vector costs
16341 include constant initializations whereas the scalar costs don't.
16342 We would therefore tend to vectorize the code above, even though
16343 the scalar version can use a single STP.
16345 We should eventually fix this and model LDP and STP in the main costs;
16346 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16347 Until then, we look specifically for code that does nothing more than
16348 STP-like operations. We cost them on that basis in addition to the
16349 normal latency-based costs.
16351 If the scalar or vector code could be a sequence of STPs +
16352 initialization, this variable counts the cost of the sequence,
16353 with 2 units per instruction. The variable is ~0U for other
16355 unsigned int m_stp_sequence_cost
= 0;
16357 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16358 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16359 situations, we try to predict whether an Advanced SIMD implementation
16360 of the loop could be completely unrolled and become straight-line code.
16361 If so, it is generally better to use the Advanced SIMD version rather
16362 than length-agnostic SVE, since the SVE loop would execute an unknown
16363 number of times and so could not be completely unrolled in the same way.
16365 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16366 number of Advanced SIMD loop iterations that would be unrolled and
16367 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16368 in the unrolled loop. Both values are zero if we're not applying
16370 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters
= 0;
16371 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts
= 0;
16373 /* If we're vectorizing a loop that executes a constant number of times,
16374 this variable gives the number of times that the vector loop would
16375 iterate, otherwise it is zero. */
16376 uint64_t m_num_vector_iterations
= 0;
16378 /* Used only when vectorizing loops. Estimates the number and kind of
16379 operations that would be needed by one iteration of the scalar
16380 or vector loop. There is one entry for each tuning option of
16382 auto_vec
<aarch64_vec_op_count
, 2> m_ops
;
16385 aarch64_vector_costs::aarch64_vector_costs (vec_info
*vinfo
,
16386 bool costing_for_scalar
)
16387 : vector_costs (vinfo
, costing_for_scalar
),
16388 m_vec_flags (costing_for_scalar
? 0
16389 : aarch64_classify_vector_mode (vinfo
->vector_mode
))
16391 if (auto *issue_info
= aarch64_tune_params
.vec_costs
->issue_info
)
16393 m_ops
.quick_push ({ issue_info
, m_vec_flags
});
16394 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
16396 unsigned int vf_factor
= (m_vec_flags
& VEC_ANY_SVE
) ? 2 : 1;
16397 m_ops
.quick_push ({ &neoversev1_vec_issue_info
, m_vec_flags
,
/* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
vector_costs *
aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new aarch64_vector_costs (vinfo, costing_for_scalar);
}

/* Return true if the current CPU should use the new costs defined
   in GCC 11.  This should be removed for GCC 12 and above, with the
   costs applying to all CPUs instead.  */
static bool
aarch64_use_new_vector_costs_p ()
{
  return (aarch64_tune_params.extra_tuning_flags
	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
}
/* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if (vectype != NULL
      && aarch64_sve_mode_p (TYPE_MODE (vectype))
      && costs->sve != NULL)
    return costs->sve;
  return costs->advsimd;
}

/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
static const simd_vec_cost *
aarch64_simd_vec_costs_for_flags (unsigned int flags)
{
  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
  if ((flags & VEC_ANY_SVE) && costs->sve)
    return costs->sve;
  return costs->advsimd;
}

/* If STMT_INFO is a memory reference, return the scalar memory type,
   otherwise return null.  */
static tree
aarch64_dr_type (stmt_vec_info stmt_info)
{
  if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
    return TREE_TYPE (DR_REF (dr));
  return NULL_TREE;
}
16452 /* Decide whether to use the unrolling heuristic described above
16453 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16454 describes the loop that we're vectorizing. */
16456 aarch64_vector_costs::
16457 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo
)
16459 /* The heuristic only makes sense on targets that have the same
16460 vector throughput for SVE and Advanced SIMD. */
16461 if (!(aarch64_tune_params
.extra_tuning_flags
16462 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
))
16465 /* We only want to apply the heuristic if LOOP_VINFO is being
16466 vectorized for SVE. */
16467 if (!(m_vec_flags
& VEC_ANY_SVE
))
16470 /* Check whether it is possible in principle to use Advanced SIMD
16472 if (aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
)
16475 /* We don't want to apply the heuristic to outer loops, since it's
16476 harder to track two levels of unrolling. */
16477 if (LOOP_VINFO_LOOP (loop_vinfo
)->inner
)
16480 /* Only handle cases in which the number of Advanced SIMD iterations
16481 would be known at compile time but the number of SVE iterations
16483 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
16484 || aarch64_sve_vg
.is_constant ())
16487 /* Guess how many times the Advanced SIMD loop would iterate and make
16488 sure that it is within the complete unrolling limit. Even if the
16489 number of iterations is small enough, the number of statements might
16490 not be, which is why we need to estimate the number of statements too. */
16491 unsigned int estimated_vq
= aarch64_estimated_sve_vq ();
16492 unsigned int advsimd_vf
= CEIL (vect_vf_for_cost (loop_vinfo
), estimated_vq
);
16493 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16494 = LOOP_VINFO_INT_NITERS (loop_vinfo
) / advsimd_vf
;
16495 if (unrolled_advsimd_niters
> (unsigned int) param_max_completely_peel_times
)
16498 /* Record that we're applying the heuristic and should try to estimate
16499 the number of statements in the Advanced SIMD loop. */
16500 m_unrolled_advsimd_niters
= unrolled_advsimd_niters
;
16503 /* Do one-time initialization of the aarch64_vector_costs given that we're
16504 costing the loop vectorization described by LOOP_VINFO. */
16506 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo
)
16508 /* Record the number of times that the vector loop would execute,
16510 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
16511 auto scalar_niters
= max_stmt_executions_int (loop
);
16512 if (scalar_niters
>= 0)
16514 unsigned int vf
= vect_vf_for_cost (loop_vinfo
);
16515 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
16516 m_num_vector_iterations
= scalar_niters
/ vf
;
16518 m_num_vector_iterations
= CEIL (scalar_niters
, vf
);
16521 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16522 heuristic described above m_unrolled_advsimd_niters. */
16523 record_potential_advsimd_unrolling (loop_vinfo
);
16526 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16528 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
16530 int misalign ATTRIBUTE_UNUSED
)
16533 const cpu_vector_cost
*costs
= aarch64_tune_params
.vec_costs
;
16536 if (vectype
!= NULL
)
16537 fp
= FLOAT_TYPE_P (vectype
);
16539 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
16541 switch (type_of_cost
)
16544 return fp
? costs
->scalar_fp_stmt_cost
: costs
->scalar_int_stmt_cost
;
16547 return costs
->scalar_load_cost
;
16550 return costs
->scalar_store_cost
;
16553 return fp
? simd_costs
->fp_stmt_cost
16554 : simd_costs
->int_stmt_cost
;
16557 return simd_costs
->align_load_cost
;
16560 return simd_costs
->store_cost
;
16562 case vec_to_scalar
:
16563 return simd_costs
->vec_to_scalar_cost
;
16565 case scalar_to_vec
:
16566 return simd_costs
->scalar_to_vec_cost
;
16568 case unaligned_load
:
16569 case vector_gather_load
:
16570 return simd_costs
->unalign_load_cost
;
16572 case unaligned_store
:
16573 case vector_scatter_store
:
16574 return simd_costs
->unalign_store_cost
;
16576 case cond_branch_taken
:
16577 return costs
->cond_taken_branch_cost
;
16579 case cond_branch_not_taken
:
16580 return costs
->cond_not_taken_branch_cost
;
16583 return simd_costs
->permute_cost
;
16585 case vec_promote_demote
:
16586 return fp
? simd_costs
->fp_stmt_cost
16587 : simd_costs
->int_stmt_cost
;
16589 case vec_construct
:
16590 elements
= estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype
));
16591 return elements
/ 2 + 1;
16594 gcc_unreachable ();
16598 /* Return true if an access of kind KIND for STMT_INFO (or NODE if SLP)
16599 represents one vector of an LD[234] or ST[234] operation. Return the total
16600 number of vectors (2, 3 or 4) if so, otherwise return a value outside that
16603 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind
, stmt_vec_info stmt_info
,
16606 if ((kind
== vector_load
16607 || kind
== unaligned_load
16608 || kind
== vector_store
16609 || kind
== unaligned_store
)
16610 && STMT_VINFO_DATA_REF (stmt_info
))
16612 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
16614 && vect_mem_access_type (stmt_info
, node
) == VMAT_LOAD_STORE_LANES
)
16615 return DR_GROUP_SIZE (stmt_info
);
16620 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16621 vectors would produce a series of LDP or STP operations. KIND is the
16622 kind of statement that STMT_INFO represents. */
16624 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind
,
16625 stmt_vec_info stmt_info
)
16631 case unaligned_load
:
16632 case unaligned_store
:
16639 return is_gimple_assign (stmt_info
->stmt
);
16642 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16643 or multiply-subtract sequence that might be suitable for fusing into a
16644 single instruction. If VEC_FLAGS is zero, analyze the operation as
16645 a scalar one, otherwise analyze it as an operation on vectors with those
16648 aarch64_multiply_add_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16649 unsigned int vec_flags
)
16651 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16654 tree_code code
= gimple_assign_rhs_code (assign
);
16655 if (code
!= PLUS_EXPR
&& code
!= MINUS_EXPR
)
16658 auto is_mul_result
= [&](int i
)
16660 tree rhs
= gimple_op (assign
, i
);
16661 /* ??? Should we try to check for a single use as well? */
16662 if (TREE_CODE (rhs
) != SSA_NAME
)
16665 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
);
16667 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16669 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16670 if (!rhs_assign
|| gimple_assign_rhs_code (rhs_assign
) != MULT_EXPR
)
16673 if (vec_flags
& VEC_ADVSIMD
)
16675 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16676 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16677 only supports MLA forms, so will require a move if the result
16678 cannot be tied to the accumulator. The most important case in
16679 which this is true is when the accumulator input is invariant. */
16680 rhs
= gimple_op (assign
, 3 - i
);
16681 if (TREE_CODE (rhs
) != SSA_NAME
)
16683 def_stmt_info
= vinfo
->lookup_def (rhs
);
16685 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_external_def
16686 || STMT_VINFO_DEF_TYPE (def_stmt_info
) == vect_constant_def
)
16693 if (code
== MINUS_EXPR
&& (vec_flags
& VEC_ADVSIMD
))
16694 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16695 multiplication must be on the second operand (to form an FMLS).
16696 But if both operands are multiplications and the second operand
16697 is used more than once, we'll instead negate the second operand
16698 and use it as an accumulator for the first operand. */
16699 return (is_mul_result (2)
16700 && (has_single_use (gimple_assign_rhs2 (assign
))
16701 || !is_mul_result (1)));
16703 return is_mul_result (1) || is_mul_result (2);
16706 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16707 expression sequence that might be suitable for fusing into a
16708 single instruction. If VEC_FLAGS is zero, analyze the operation as
16709 a scalar one, otherwise analyze it as an operation on vectors with those
16713 aarch64_bool_compound_p (vec_info
*vinfo
, stmt_vec_info stmt_info
,
16714 unsigned int vec_flags
)
16716 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
16718 || gimple_assign_rhs_code (assign
) != BIT_AND_EXPR
16719 || !STMT_VINFO_VECTYPE (stmt_info
)
16720 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info
)))
16723 for (int i
= 1; i
< 3; ++i
)
16725 tree rhs
= gimple_op (assign
, i
);
16727 if (TREE_CODE (rhs
) != SSA_NAME
)
16730 stmt_vec_info def_stmt_info
= vinfo
->lookup_def (rhs
);
16732 || STMT_VINFO_DEF_TYPE (def_stmt_info
) != vect_internal_def
)
16735 gassign
*rhs_assign
= dyn_cast
<gassign
*> (def_stmt_info
->stmt
);
16737 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign
))
16741 if (vec_flags
& VEC_ADVSIMD
)
/* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
   in-loop reduction that SVE supports directly, return its latency in cycles,
   otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
   instructions.  */
static unsigned int
aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
				       stmt_vec_info stmt_info,
				       const sve_vec_cost *sve_costs)
{
  switch (vect_reduc_type (vinfo, stmt_info))
    {
    case EXTRACT_LAST_REDUCTION:
      return sve_costs->clast_cost;

    case FOLD_LEFT_REDUCTION:
      switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
	{
	case E_HFmode:
	case E_BFmode:
	  return sve_costs->fadda_f16_cost;

	case E_SFmode:
	  return sve_costs->fadda_f32_cost;

	case E_DFmode:
	  return sve_costs->fadda_f64_cost;

	default:
	  break;
	}
      break;
    }

  return 0;
}
/* STMT_INFO describes a loop-carried operation in the original scalar code
   that we are considering implementing as a reduction.  Return one of the
   following values, depending on VEC_FLAGS:

   - If VEC_FLAGS is zero, return the loop carry latency of the original
     scalar operation.

   - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
     Advanced SIMD implementation.

   - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
     SVE implementation.  */
static unsigned int
aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
				   unsigned int vec_flags)
{
  const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
  const sve_vec_cost *sve_costs = nullptr;
  if (vec_flags & VEC_ANY_SVE)
    sve_costs = aarch64_tune_params.vec_costs->sve;

  /* If the caller is asking for the SVE latency, check for forms of reduction
     that only SVE can handle directly.  */
  if (sve_costs)
    {
      unsigned int latency
	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
      if (latency)
	return latency;
    }

  /* Handle scalar costs.  */
  bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
  if (vec_flags == 0)
    {
      if (is_float)
	return vec_costs->scalar_fp_stmt_cost;
      return vec_costs->scalar_int_stmt_cost;
    }

  /* Otherwise, the loop body just contains normal integer or FP operations,
     with a vector reduction outside the loop.  */
  const simd_vec_cost *simd_costs
    = aarch64_simd_vec_costs_for_flags (vec_flags);
  if (is_float)
    return simd_costs->fp_stmt_cost;
  return simd_costs->int_stmt_cost;
}
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
   for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
   try to subdivide the target-independent categorization provided by KIND
   to get a more accurate cost.  */
static fractional_cost
aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
				    stmt_vec_info stmt_info,
				    fractional_cost stmt_cost)
{
  /* Detect an extension of a loaded value.  In general, we'll be able to fuse
     the extension with the load.  */
  if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
    return 0;

  return stmt_cost;
}
16851 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16852 for the vectorized form of STMT_INFO possibly using SLP node NODE, which has
16853 cost kind KIND and which when vectorized would operate on vector type
16854 VECTYPE. Try to subdivide the target-independent categorization provided by
16855 KIND to get a more accurate cost. WHERE specifies where the cost associated
16856 with KIND occurs. */
16857 static fractional_cost
16858 aarch64_detect_vector_stmt_subtype (vec_info
*vinfo
, vect_cost_for_stmt kind
,
16859 stmt_vec_info stmt_info
, slp_tree node
,
16861 enum vect_cost_model_location where
,
16862 fractional_cost stmt_cost
)
16864 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
16865 const sve_vec_cost
*sve_costs
= nullptr;
16866 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
16867 sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
16869 /* It's generally better to avoid costing inductions, since the induction
16870 will usually be hidden by other operations. This is particularly true
16871 for things like COND_REDUCTIONS. */
16872 if (is_a
<gphi
*> (stmt_info
->stmt
))
16875 /* Detect cases in which vec_to_scalar is describing the extraction of a
16876 vector element in preparation for a scalar store. The store itself is
16877 costed separately. */
16878 if (vect_is_store_elt_extraction (kind
, stmt_info
))
16879 return simd_costs
->store_elt_extra_cost
;
16881 /* Detect SVE gather loads, which are costed as a single scalar_load
16882 for each element. We therefore need to divide the full-instruction
16883 cost by the number of elements in the vector. */
16884 if (kind
== scalar_load
16886 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
16888 unsigned int nunits
= vect_nunits_for_cost (vectype
);
16889 /* Test for VNx2 modes, which have 64-bit containers. */
16890 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype
)), aarch64_sve_vg
))
16891 return { sve_costs
->gather_load_x64_cost
, nunits
};
16892 return { sve_costs
->gather_load_x32_cost
, nunits
};
16895 /* Detect cases in which a scalar_store is really storing one element
16896 in a scatter operation. */
16897 if (kind
== scalar_store
16899 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
16900 return sve_costs
->scatter_store_elt_cost
;
16902 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16903 if (kind
== vec_to_scalar
16904 && where
== vect_body
16907 unsigned int latency
16908 = aarch64_sve_in_loop_reduction_latency (vinfo
, stmt_info
, sve_costs
);
16913 /* Detect cases in which vec_to_scalar represents a single reduction
16914 instruction like FADDP or MAXV. */
16915 if (kind
== vec_to_scalar
16916 && where
== vect_epilogue
16917 && vect_is_reduction (stmt_info
))
16918 switch (GET_MODE_INNER (TYPE_MODE (vectype
)))
16921 return simd_costs
->reduc_i8_cost
;
16924 return simd_costs
->reduc_i16_cost
;
16927 return simd_costs
->reduc_i32_cost
;
16930 return simd_costs
->reduc_i64_cost
;
16934 return simd_costs
->reduc_f16_cost
;
16937 return simd_costs
->reduc_f32_cost
;
16940 return simd_costs
->reduc_f64_cost
;
16946 /* Otherwise stick with the original categorization. */
16950 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16951 for STMT_INFO, which has cost kind KIND and which when vectorized would
16952 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16954 static fractional_cost
16955 aarch64_sve_adjust_stmt_cost (class vec_info
*vinfo
, vect_cost_for_stmt kind
,
16956 stmt_vec_info stmt_info
, tree vectype
,
16957 fractional_cost stmt_cost
)
16959 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16960 vector register size or number of units. Integer promotions of this
16961 type therefore map to SXT[BHW] or UXT[BHW].
16963 Most loads have extending forms that can do the sign or zero extension
16964 on the fly. Optimistically assume that a load followed by an extension
16965 will fold to this form during combine, and that the extension therefore
16967 if (kind
== vector_stmt
&& vect_is_extending_load (vinfo
, stmt_info
))
16970 /* For similar reasons, vector_stmt integer truncations are a no-op,
16971 because we can just ignore the unused upper bits of the source. */
16972 if (kind
== vector_stmt
&& vect_is_integer_truncation (stmt_info
))
16975 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16976 but there are no equivalent instructions for SVE. This means that
16977 (all other things being equal) 128-bit SVE needs twice as many load
16978 and store instructions as Advanced SIMD in order to process vector pairs.
16980 Also, scalar code can often use LDP and STP to access pairs of values,
16981 so it is too simplistic to say that one SVE load or store replaces
16982 VF scalar loads and stores.
16984 Ideally we would account for this in the scalar and Advanced SIMD
16985 costs by making suitable load/store pairs as cheap as a single
16986 load/store. However, that would be a very invasive change and in
16987 practice it tends to stress other parts of the cost model too much.
16988 E.g. stores of scalar constants currently count just a store,
16989 whereas stores of vector constants count a store and a vec_init.
16990 This is an artificial distinction for AArch64, where stores of
16991 nonzero scalar constants need the same kind of register invariant
16994 An alternative would be to double the cost of any SVE loads and stores
16995 that could be paired in Advanced SIMD (and possibly also paired in
16996 scalar code). But this tends to stress other parts of the cost model
16997 in the same way. It also means that we can fall back to Advanced SIMD
16998 even if full-loop predication would have been useful.
17000 Here we go for a more conservative version: double the costs of SVE
17001 loads and stores if one iteration of the scalar loop processes enough
17002 elements for it to use a whole number of Advanced SIMD LDP or STP
17003 instructions. This makes it very likely that the VF would be 1 for
17004 Advanced SIMD, and so no epilogue should be needed. */
17005 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
17007 stmt_vec_info first
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
17008 unsigned int count
= DR_GROUP_SIZE (first
) - DR_GROUP_GAP (first
);
17009 unsigned int elt_bits
= GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype
));
17010 if (multiple_p (count
* elt_bits
, 256)
17011 && aarch64_advsimd_ldp_stp_p (kind
, stmt_info
))
17018 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
17019 and which when vectorized would operate on vector type VECTYPE. Add the
17020 cost of any embedded operations. */
17021 static fractional_cost
17022 aarch64_adjust_stmt_cost (vec_info
*vinfo
, vect_cost_for_stmt kind
,
17023 stmt_vec_info stmt_info
, slp_tree node
, tree vectype
,
17024 unsigned vec_flags
, fractional_cost stmt_cost
)
17028 const simd_vec_cost
*simd_costs
= aarch64_simd_vec_costs (vectype
);
17030 /* Detect cases in which a vector load or store represents an
17031 LD[234] or ST[234] instruction. */
17032 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
, node
))
17035 stmt_cost
+= simd_costs
->ld2_st2_permute_cost
;
17039 stmt_cost
+= simd_costs
->ld3_st3_permute_cost
;
17043 stmt_cost
+= simd_costs
->ld4_st4_permute_cost
;
17047 gassign
*assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
));
17048 if ((kind
== scalar_stmt
|| kind
== vector_stmt
) && assign
)
17050 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
17051 if (!vect_is_reduction (stmt_info
)
17052 && aarch64_multiply_add_p (vinfo
, stmt_info
, vec_flags
))
17055 /* For vector boolean ANDs with a compare operand we just need
17057 if (aarch64_bool_compound_p (vinfo
, stmt_info
, vec_flags
))
17061 if (kind
== vector_stmt
|| kind
== vec_to_scalar
)
17062 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
17064 if (FLOAT_TYPE_P (cmp_type
))
17065 stmt_cost
+= simd_costs
->fp_stmt_cost
;
17067 stmt_cost
+= simd_costs
->int_stmt_cost
;
17071 if (kind
== scalar_stmt
)
17072 if (tree cmp_type
= vect_embedded_comparison_type (stmt_info
))
17074 if (FLOAT_TYPE_P (cmp_type
))
17075 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_fp_stmt_cost
;
17077 stmt_cost
+= aarch64_tune_params
.vec_costs
->scalar_int_stmt_cost
;
/* Return true if STMT_INFO is part of a reduction that has the form:

      r = r op ...;
      r = r op ...;

   with the single accumulator being read and written multiple times.  */
static bool
aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
{
  if (!STMT_VINFO_REDUC_DEF (stmt_info))
    return false;

  auto reduc_info = info_for_reduction (vinfo, stmt_info);
  return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
}
17099 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
17100 and they describe an operation in the body of a vector loop. Record issue
17101 information relating to the vector operation in OPS. */
17103 aarch64_vector_costs::count_ops (unsigned int count
, vect_cost_for_stmt kind
,
17104 stmt_vec_info stmt_info
, slp_tree node
,
17105 aarch64_vec_op_count
*ops
)
17107 const aarch64_base_vec_issue_info
*base_issue
= ops
->base_issue_info ();
17110 const aarch64_simd_vec_issue_info
*simd_issue
= ops
->simd_issue_info ();
17111 const aarch64_sve_vec_issue_info
*sve_issue
= ops
->sve_issue_info ();
17113 /* Calculate the minimum cycles per iteration imposed by a reduction
17115 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17116 && vect_is_reduction (stmt_info
))
17119 = aarch64_in_loop_reduction_latency (m_vinfo
, stmt_info
, m_vec_flags
);
17120 if (aarch64_force_single_cycle (m_vinfo
, stmt_info
))
17121 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
17122 and then accumulate that, but at the moment the loop-carried
17123 dependency includes all copies. */
17124 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
* count
);
17126 ops
->reduction_latency
= MAX (ops
->reduction_latency
, base
);
17129 if (stmt_info
&& (kind
== scalar_stmt
|| kind
== vector_stmt
))
17131 /* Assume that multiply-adds will become a single operation. */
17132 if (aarch64_multiply_add_p (m_vinfo
, stmt_info
, m_vec_flags
))
17135 /* Assume that bool AND with compare operands will become a single
17137 if (aarch64_bool_compound_p (m_vinfo
, stmt_info
, m_vec_flags
))
17142 /* Count the basic operation cost associated with KIND. */
17145 case cond_branch_taken
:
17146 case cond_branch_not_taken
:
17147 case vector_gather_load
:
17148 case vector_scatter_store
:
17149 /* We currently don't expect these to be used in a loop body. */
17153 case vec_promote_demote
:
17154 case vec_construct
:
17155 case vec_to_scalar
:
17156 case scalar_to_vec
:
17159 ops
->general_ops
+= count
;
17164 case unaligned_load
:
17165 ops
->loads
+= count
;
17166 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17167 ops
->general_ops
+= base_issue
->fp_simd_load_general_ops
* count
;
17171 case unaligned_store
:
17173 ops
->stores
+= count
;
17174 if (m_vec_flags
|| FLOAT_TYPE_P (aarch64_dr_type (stmt_info
)))
17175 ops
->general_ops
+= base_issue
->fp_simd_store_general_ops
* count
;
17179 /* Add any embedded comparison operations. */
17180 if ((kind
== scalar_stmt
|| kind
== vector_stmt
|| kind
== vec_to_scalar
)
17181 && vect_embedded_comparison_type (stmt_info
))
17182 ops
->general_ops
+= count
;
17184 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
17185 have only accounted for one. */
17186 if ((kind
== vector_stmt
|| kind
== vec_to_scalar
)
17187 && vect_reduc_type (m_vinfo
, stmt_info
) == COND_REDUCTION
)
17188 ops
->general_ops
+= count
;
17190 /* Count the predicate operations needed by an SVE comparison. */
17191 if (sve_issue
&& (kind
== vector_stmt
|| kind
== vec_to_scalar
))
17192 if (tree type
= vect_comparison_type (stmt_info
))
17194 unsigned int base
= (FLOAT_TYPE_P (type
)
17195 ? sve_issue
->fp_cmp_pred_ops
17196 : sve_issue
->int_cmp_pred_ops
);
17197 ops
->pred_ops
+= base
* count
;
17200 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
17202 switch (aarch64_ld234_st234_vectors (kind
, stmt_info
, node
))
17205 ops
->general_ops
+= simd_issue
->ld2_st2_general_ops
* count
;
17209 ops
->general_ops
+= simd_issue
->ld3_st3_general_ops
* count
;
17213 ops
->general_ops
+= simd_issue
->ld4_st4_general_ops
* count
;
17217 /* Add any overhead associated with gather loads and scatter stores. */
17219 && (kind
== scalar_load
|| kind
== scalar_store
)
17220 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
17222 unsigned int pairs
= CEIL (count
, 2);
17223 ops
->pred_ops
+= sve_issue
->gather_scatter_pair_pred_ops
* pairs
;
17224 ops
->general_ops
+= sve_issue
->gather_scatter_pair_general_ops
* pairs
;
/* Return true if STMT_INFO contains a memory access and if the constant
   component of the memory address is aligned to SIZE bytes.  */
static bool
aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
				   poly_uint64 size)
{
  if (!STMT_VINFO_DATA_REF (stmt_info))
    return false;

  if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
    stmt_info = first_stmt;
  tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
  /* Needed for gathers & scatters, for example.  */
  if (!constant_offset)
    return false;

  return multiple_p (wi::to_poly_offset (constant_offset), size);
}
17247 /* Check if a scalar or vector stmt could be part of a region of code
17248 that does nothing more than store values to memory, in the scalar
17249 case using STP. Return the cost of the stmt if so, counting 2 for
17250 one instruction. Return ~0U otherwise.
17252 The arguments are a subset of those passed to add_stmt_cost. */
17254 aarch64_stp_sequence_cost (unsigned int count
, vect_cost_for_stmt kind
,
17255 stmt_vec_info stmt_info
, tree vectype
)
17257 /* Code that stores vector constants uses a vector_load to create
17258 the constant. We don't apply the heuristic to that case for two
17261 - At the moment, STPs are only formed via peephole2, and the
17262 constant scalar moves would often come between STRs and so
17263 prevent STP formation.
17265 - The scalar code also has to load the constant somehow, and that
17269 case scalar_to_vec
:
17270 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17271 return (FLOAT_TYPE_P (vectype
) ? 2 : 4) * count
;
17273 case vec_construct
:
17274 if (FLOAT_TYPE_P (vectype
))
17275 /* Count 1 insn for the maximum number of FP->SIMD INS
17277 return (vect_nunits_for_cost (vectype
) - 1) * 2 * count
;
17279 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17280 maximum number of GPR->SIMD INS instructions. */
17281 return vect_nunits_for_cost (vectype
) * 4 * count
;
17284 case unaligned_store
:
17285 /* Count 1 insn per vector if we can't form STP Q pairs. */
17286 if (aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17291 /* Assume we won't be able to use STP if the constant offset
17292 component of the address is misaligned. ??? This could be
17293 removed if we formed STP pairs earlier, rather than relying
17295 auto size
= GET_MODE_SIZE (TYPE_MODE (vectype
));
17296 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
17299 return CEIL (count
, 2) * 2;
17302 if (stmt_info
&& STMT_VINFO_DATA_REF (stmt_info
))
17304 /* Check for a mode in which STP pairs can be formed. */
17305 auto size
= GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info
)));
17306 if (maybe_ne (size
, 4) && maybe_ne (size
, 8))
17309 /* Assume we won't be able to use STP if the constant offset
17310 component of the address is misaligned. ??? This could be
17311 removed if we formed STP pairs earlier, rather than relying
17313 if (!aarch64_aligned_constant_offset_p (stmt_info
, size
))
17324 aarch64_vector_costs::add_stmt_cost (int count
, vect_cost_for_stmt kind
,
17325 stmt_vec_info stmt_info
, slp_tree node
,
17326 tree vectype
, int misalign
,
17327 vect_cost_model_location where
)
17329 fractional_cost stmt_cost
17330 = aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
17332 bool in_inner_loop_p
= (where
== vect_body
17334 && stmt_in_inner_loop_p (m_vinfo
, stmt_info
));
17336 /* Do one-time initialization based on the vinfo. */
17337 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17338 if (!m_analyzed_vinfo
&& aarch64_use_new_vector_costs_p ())
17341 analyze_loop_vinfo (loop_vinfo
);
17343 m_analyzed_vinfo
= true;
17346 /* Apply the heuristic described above m_stp_sequence_cost. */
17347 if (m_stp_sequence_cost
!= ~0U)
17349 uint64_t cost
= aarch64_stp_sequence_cost (count
, kind
,
17350 stmt_info
, vectype
);
17351 m_stp_sequence_cost
= MIN (m_stp_sequence_cost
+ cost
, ~0U);
17354 /* Try to get a more accurate cost by looking at STMT_INFO instead
17355 of just looking at KIND. */
17356 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17358 /* If we scalarize a strided store, the vectorizer costs one
17359 vec_to_scalar for each element. However, we can store the first
17360 element using an FP store without a separate extract step. */
17361 if (vect_is_store_elt_extraction (kind
, stmt_info
))
17364 stmt_cost
= aarch64_detect_scalar_stmt_subtype (m_vinfo
, kind
,
17365 stmt_info
, stmt_cost
);
17367 if (vectype
&& m_vec_flags
)
17368 stmt_cost
= aarch64_detect_vector_stmt_subtype (m_vinfo
, kind
,
17373 /* Check if we've seen an SVE gather/scatter operation and which size. */
17374 if (kind
== scalar_load
17375 && aarch64_sve_mode_p (TYPE_MODE (vectype
))
17376 && vect_mem_access_type (stmt_info
, node
) == VMAT_GATHER_SCATTER
)
17378 const sve_vec_cost
*sve_costs
= aarch64_tune_params
.vec_costs
->sve
;
17381 /* Test for VNx2 modes, which have 64-bit containers. */
17382 if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype
)),
17384 m_sve_gather_scatter_init_cost
17385 += sve_costs
->gather_load_x64_init_cost
;
17387 m_sve_gather_scatter_init_cost
17388 += sve_costs
->gather_load_x32_init_cost
;
17393 /* Do any SVE-specific adjustments to the cost. */
17394 if (stmt_info
&& vectype
&& aarch64_sve_mode_p (TYPE_MODE (vectype
)))
17395 stmt_cost
= aarch64_sve_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
,
17396 vectype
, stmt_cost
);
17398 /* Vector promotion and demotion requires us to widen the operation first
17399 and only after that perform the conversion. Unfortunately the mid-end
17400 expects this to be doable as a single operation and doesn't pass on
17401 enough context here for us to tell which operation is happening. To
17402 account for this we count every promote-demote operation twice and if
17403 the previously costed operation was also a promote-demote we reduce
17404 the cost of the currently being costed operation to simulate the final
17405 conversion cost. Note that for SVE we can do better here if the converted
17406 value comes from a load since the widening load would consume the widening
17407 operations. However since we're in stage 3 we can't change the helper
17408 vect_is_extending_load and duplicating the code seems not useful. */
17409 gassign
*assign
= NULL
;
17410 if (kind
== vec_promote_demote
17411 && (assign
= dyn_cast
<gassign
*> (STMT_VINFO_STMT (stmt_info
)))
17412 && gimple_assign_rhs_code (assign
) == FLOAT_EXPR
)
17414 auto new_count
= count
* 2 - m_num_last_promote_demote
;
17415 m_num_last_promote_demote
= count
;
17419 m_num_last_promote_demote
= 0;
17421 if (stmt_info
&& aarch64_use_new_vector_costs_p ())
17423 /* Account for any extra "embedded" costs that apply additively
17424 to the base cost calculated above. */
17425 stmt_cost
= aarch64_adjust_stmt_cost (m_vinfo
, kind
, stmt_info
, node
,
17426 vectype
, m_vec_flags
, stmt_cost
);
17428 /* If we're recording a nonzero vector loop body cost for the
17429 innermost loop, also estimate the operations that would need
17430 to be issued by all relevant implementations of the loop. */
17432 && (m_costing_for_scalar
|| where
== vect_body
)
17433 && (!LOOP_VINFO_LOOP (loop_vinfo
)->inner
|| in_inner_loop_p
)
17435 for (auto &ops
: m_ops
)
17436 count_ops (count
, kind
, stmt_info
, node
, &ops
);
17438 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17439 estimate the number of statements in the unrolled Advanced SIMD
17440 loop. For simplicitly, we assume that one iteration of the
17441 Advanced SIMD loop would need the same number of statements
17442 as one iteration of the SVE loop. */
17443 if (where
== vect_body
&& m_unrolled_advsimd_niters
)
17444 m_unrolled_advsimd_stmts
+= count
* m_unrolled_advsimd_niters
;
17446 /* Detect the use of an averaging operation. */
17447 gimple
*stmt
= stmt_info
->stmt
;
17448 if (is_gimple_call (stmt
)
17449 && gimple_call_internal_p (stmt
))
17451 switch (gimple_call_internal_fn (stmt
))
17453 case IFN_AVG_FLOOR
:
17462 /* If the statement stores to a decl that is known to be the argument
17463 to a vld1 in the same function, ignore the store for costing purposes.
17464 See the comment above m_stores_to_vector_load_decl for more details. */
17466 && (kind
== vector_store
|| kind
== unaligned_store
)
17467 && aarch64_accesses_vector_load_decl_p (stmt_info
))
17470 m_stores_to_vector_load_decl
= true;
17473 return record_stmt_cost (stmt_info
, where
, (count
* stmt_cost
).ceil ());
/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
   heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
   says that we should prefer the Advanced SIMD loop.  */
bool
aarch64_vector_costs::prefer_unrolled_loop () const
{
  if (!m_unrolled_advsimd_stmts)
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
		     " unrolled Advanced SIMD loop = "
		     HOST_WIDE_INT_PRINT_UNSIGNED "\n",
		     m_unrolled_advsimd_stmts);

  /* The balance here is tricky.  On the one hand, we can't be sure whether
     the code is vectorizable with Advanced SIMD or not.  However, even if
     it isn't vectorizable with Advanced SIMD, there's a possibility that
     the scalar code could also be unrolled.  Some of the code might then
     benefit from SLP, or from using LDP and STP.  We therefore apply
     the heuristic regardless of can_use_advsimd_p.  */
  return (m_unrolled_advsimd_stmts
	  && (m_unrolled_advsimd_stmts
	      <= (unsigned int) param_max_completely_peeled_insns));
}
17502 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
17503 how fast the SVE code can be issued and compare it to the equivalent value
17504 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
17505 also compare it to the issue rate of Advanced SIMD code
17506 (ADVSIMD_CYCLES_PER_ITER).
17508 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17509 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17510 is true if we think the loop body is too expensive. */
17513 aarch64_vector_costs::
17514 adjust_body_cost_sve (const aarch64_vec_op_count
*ops
,
17515 fractional_cost scalar_cycles_per_iter
,
17516 unsigned int orig_body_cost
, unsigned int *body_cost
,
17517 bool *should_disparage
)
17519 if (dump_enabled_p ())
17522 fractional_cost sve_pred_cycles_per_iter
= ops
->min_pred_cycles_per_iter ();
17523 fractional_cost sve_cycles_per_iter
= ops
->min_cycles_per_iter ();
17525 /* If the scalar version of the loop could issue at least as
17526 quickly as the predicate parts of the SVE loop, make the SVE loop
17527 prohibitively expensive. In this case vectorization is adding an
17528 overhead that the original scalar code didn't have.
17530 This is mostly intended to detect cases in which WHILELOs dominate
17531 for very tight loops, which is something that normal latency-based
17532 costs would not model. Adding this kind of cliff edge would be
17533 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17534 code in the caller handles that case in a more conservative way. */
17535 fractional_cost sve_estimate
= sve_pred_cycles_per_iter
+ 1;
17536 if (scalar_cycles_per_iter
< sve_estimate
)
17538 unsigned int min_cost
17539 = orig_body_cost
* estimated_poly_value (BYTES_PER_SVE_VECTOR
);
17540 if (*body_cost
< min_cost
)
17542 if (dump_enabled_p ())
17543 dump_printf_loc (MSG_NOTE
, vect_location
,
17544 "Increasing body cost to %d because the"
17545 " scalar code could issue within the limit"
17546 " imposed by predicate operations\n",
17548 *body_cost
= min_cost
;
17549 *should_disparage
= true;
17553 return sve_cycles_per_iter
;
17557 aarch64_vector_costs::determine_suggested_unroll_factor ()
17559 bool sve
= m_vec_flags
& VEC_ANY_SVE
;
17560 /* If we are trying to unroll an Advanced SIMD main loop that contains
17561 an averaging operation that we do not support with SVE and we might use a
17562 predicated epilogue, we need to be conservative and block unrolling as
17563 this might lead to a less optimal loop for the first and only epilogue
17564 using the original loop's vectorization factor.
17565 TODO: Remove this constraint when we add support for multiple epilogue
17567 if (!sve
&& !TARGET_SVE2
&& m_has_avg
)
17570 unsigned int max_unroll_factor
= 1;
17571 for (auto vec_ops
: m_ops
)
17573 aarch64_simd_vec_issue_info
const *vec_issue
17574 = vec_ops
.simd_issue_info ();
17577 /* Limit unroll factor to a value adjustable by the user, the default
17579 unsigned int unroll_factor
= aarch64_vect_unroll_limit
;
17580 unsigned int factor
17581 = vec_ops
.reduction_latency
> 1 ? vec_ops
.reduction_latency
: 1;
17584 /* Sanity check, this should never happen. */
17585 if ((vec_ops
.stores
+ vec_ops
.loads
+ vec_ops
.general_ops
) == 0)
17588 /* Check stores. */
17589 if (vec_ops
.stores
> 0)
17591 temp
= CEIL (factor
* vec_issue
->stores_per_cycle
,
17593 unroll_factor
= MIN (unroll_factor
, temp
);
17596 /* Check loads + stores. */
17597 if (vec_ops
.loads
> 0)
17599 temp
= CEIL (factor
* vec_issue
->loads_stores_per_cycle
,
17600 vec_ops
.loads
+ vec_ops
.stores
);
17601 unroll_factor
= MIN (unroll_factor
, temp
);
17604 /* Check general ops. */
17605 if (vec_ops
.general_ops
> 0)
17607 temp
= CEIL (factor
* vec_issue
->general_ops_per_cycle
,
17608 vec_ops
.general_ops
);
17609 unroll_factor
= MIN (unroll_factor
, temp
);
17611 max_unroll_factor
= MAX (max_unroll_factor
, unroll_factor
);
17614 /* Make sure unroll factor is power of 2. */
17615 return 1 << ceil_log2 (max_unroll_factor
);
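/* Worked example for the resource checks above (all figures assumed rather
   than taken from any particular core): with reduction_latency == 2 the
   scaling factor is 2.  If the issue info advertises 2 stores per cycle and
   the loop body has 1 store, the store check allows CEIL (2 * 2, 1) = 4;
   with 3 loads+stores per cycle and 4 such accesses it allows
   CEIL (2 * 3, 4) = 2; with 4 general ops per cycle and 6 general ops it
   allows CEIL (2 * 4, 6) = 2.  The unroll factor for this subtuning is the
   minimum of these values and aarch64_vect_unroll_limit, and the final
   result across all subtunings is rounded to a power of two via
   1 << ceil_log2.  */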
17618 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17619 and return the new cost. */
17621 aarch64_vector_costs::
17622 adjust_body_cost (loop_vec_info loop_vinfo
,
17623 const aarch64_vector_costs
*scalar_costs
,
17624 unsigned int body_cost
)
17626 if (scalar_costs
->m_ops
.is_empty () || m_ops
.is_empty ())
17629 const auto &scalar_ops
= scalar_costs
->m_ops
[0];
17630 const auto &vector_ops
= m_ops
[0];
17631 unsigned int estimated_vf
= vect_vf_for_cost (loop_vinfo
);
17632 unsigned int orig_body_cost
= body_cost
;
17633 bool should_disparage
= false;
17635 if (dump_enabled_p ())
17636 dump_printf_loc (MSG_NOTE
, vect_location
,
17637 "Original vector body cost = %d\n", body_cost
);
17639 /* If we know we have a single partial vector iteration, cap the VF
17640 to the number of scalar iterations for costing purposes. */
17641 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
17643 auto niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
17644 if (niters
< estimated_vf
&& dump_enabled_p ())
17645 dump_printf_loc (MSG_NOTE
, vect_location
,
17646 "Scalar loop iterates at most %wd times. Capping VF "
17647 " from %d to %wd\n", niters
, estimated_vf
, niters
);
17649 estimated_vf
= MIN (estimated_vf
, niters
);
17652 fractional_cost scalar_cycles_per_iter
17653 = scalar_ops
.min_cycles_per_iter () * estimated_vf
;
17655 fractional_cost vector_cycles_per_iter
= vector_ops
.min_cycles_per_iter ();
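/* Illustrative comparison (cycle counts assumed): if the scalar loop can
   issue one iteration every 2 cycles and the estimated VF is 4, the scalar
   equivalent of one vector iteration is 2 * 4 = 8 cycles.  A vector body
   that needs 6 cycles per iteration therefore looks like an issue-rate win,
   whereas one that needs 10 cycles triggers the scaling further down.  */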
17657 if (dump_enabled_p ())
17659 if (IN_RANGE (m_num_vector_iterations
, 0, 65536))
17660 dump_printf_loc (MSG_NOTE
, vect_location
,
17661 "Vector loop iterates at most %wd times\n",
17662 m_num_vector_iterations
);
17663 dump_printf_loc (MSG_NOTE
, vect_location
, "Scalar issue estimate:\n");
17664 scalar_ops
.dump ();
17665 dump_printf_loc (MSG_NOTE
, vect_location
,
17666 " estimated cycles per vector iteration"
17667 " (for VF %d) = %f\n",
17668 estimated_vf
, scalar_cycles_per_iter
.as_double ());
17671 if (vector_ops
.sve_issue_info ())
17673 if (dump_enabled_p ())
17674 dump_printf_loc (MSG_NOTE
, vect_location
, "SVE issue estimate:\n");
17675 vector_cycles_per_iter
17676 = adjust_body_cost_sve (&vector_ops
, scalar_cycles_per_iter
,
17677 orig_body_cost
, &body_cost
, &should_disparage
);
17679 if (aarch64_tune_params
.vec_costs
== &neoverse512tvb_vector_cost
)
17681 /* Also take Neoverse V1 tuning into account, doubling the
17682 scalar and Advanced SIMD estimates to account for the
17683 doubling in SVE vector length. */
17684 if (dump_enabled_p ())
17685 dump_printf_loc (MSG_NOTE
, vect_location
,
17686 "Neoverse V1 estimate:\n");
17687 auto vf_factor
= m_ops
[1].vf_factor ();
17688 adjust_body_cost_sve (&m_ops
[1], scalar_cycles_per_iter
* vf_factor
,
17689 orig_body_cost
, &body_cost
, &should_disparage
);
17694 if (dump_enabled_p ())
17696 dump_printf_loc (MSG_NOTE
, vect_location
,
17697 "Vector issue estimate:\n");
17698 vector_ops
.dump ();
17702 /* Decide whether to stick to latency-based costs or whether to try to
17703 take issue rates into account. */
17704 unsigned int threshold
= aarch64_loop_vect_issue_rate_niters
;
17705 if (m_vec_flags
& VEC_ANY_SVE
)
17706 threshold
= CEIL (threshold
, aarch64_estimated_sve_vq ());
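/* For example (parameter values assumed): with an issue-rate threshold of
   6 iterations and an estimated SVE VQ of 2, the SVE threshold becomes
   CEIL (6, 2) = 3, so only loops known to run fewer than 3 vector
   iterations fall back to pure latency-based costing here.  */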
17708 if (m_num_vector_iterations
>= 1
17709 && m_num_vector_iterations
< threshold
)
17711 if (dump_enabled_p ())
17712 dump_printf_loc (MSG_NOTE
, vect_location
,
17713 "Low iteration count, so using pure latency"
17716 /* Increase the cost of the vector code if it looks like the scalar code
17717 could issue more quickly. These values are only rough estimates,
17718 so minor differences should only result in minor changes. */
17719 else if (scalar_cycles_per_iter
< vector_cycles_per_iter
)
17721 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17722 scalar_cycles_per_iter
);
17723 if (dump_enabled_p ())
17724 dump_printf_loc (MSG_NOTE
, vect_location
,
17725 "Increasing body cost to %d because scalar code"
17726 " would issue more quickly\n", body_cost
);
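/* Sketch of the scaling above (costs assumed): with body_cost == 100,
   vector_cycles_per_iter == 6 and scalar_cycles_per_iter == 4, the body
   cost is scaled to 100 * 6 / 4 = 150, penalising the vector loop in
   proportion to how much more slowly it issues.  */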
17728 /* In general, it's expected that the proposed vector code would be able
17729 to issue more quickly than the original scalar code. This should
17730 already be reflected to some extent in the latency-based costs.
17732 However, the latency-based costs effectively assume that the scalar
17733 code and the vector code execute serially, which tends to underplay
17734 one important case: if the real (non-serialized) execution time of
17735 a scalar iteration is dominated by loop-carried dependencies,
17736 and if the vector code is able to reduce both the length of
17737 the loop-carried dependencies *and* the number of cycles needed
17738 to issue the code in general, we can be more confident that the
17739 vector code is an improvement, even if adding the other (non-loop-carried)
17740 latencies tends to hide this saving. We therefore reduce the cost of the
17741 vector loop body in proportion to the saving. */
17742 else if (scalar_ops
.reduction_latency
> vector_ops
.reduction_latency
17743 && scalar_ops
.reduction_latency
== scalar_cycles_per_iter
17744 && scalar_cycles_per_iter
> vector_cycles_per_iter
17745 && !should_disparage
)
17747 body_cost
= fractional_cost::scale (body_cost
, vector_cycles_per_iter
,
17748 scalar_cycles_per_iter
);
17749 if (dump_enabled_p ())
17750 dump_printf_loc (MSG_NOTE
, vect_location
,
17751 "Decreasing body cost to %d account for smaller"
17752 " reduction latency\n", body_cost
);
17759 aarch64_vector_costs::finish_cost (const vector_costs
*uncast_scalar_costs
)
17761 /* Record the issue information for any SVE WHILE instructions that the
17763 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (m_vinfo
);
17764 if (!m_ops
.is_empty ()
17766 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
17768 unsigned int num_masks
= 0;
17769 rgroup_controls
*rgm
;
17770 unsigned int num_vectors_m1
;
17771 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
17772 num_vectors_m1
, rgm
)
17774 num_masks
+= num_vectors_m1
+ 1;
17775 for (auto &ops
: m_ops
)
17776 if (auto *issue
= ops
.sve_issue_info ())
17777 ops
.pred_ops
+= num_masks
* issue
->while_pred_ops
;
17781 = static_cast<const aarch64_vector_costs
*> (uncast_scalar_costs
);
17784 && aarch64_use_new_vector_costs_p ())
17786 m_costs
[vect_body
] = adjust_body_cost (loop_vinfo
, scalar_costs
,
17787 m_costs
[vect_body
]);
17788 m_suggested_unroll_factor
= determine_suggested_unroll_factor ();
17790 /* For gather and scatters there's an additional overhead for the first
17791 iteration. For low count loops they're not beneficial so model the
17792 overhead as loop prologue costs. */
17793 m_costs
[vect_prologue
] += m_sve_gather_scatter_init_cost
;
17796 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17797 the scalar code in the event of a tie, since there is more chance
17798 of scalar code being optimized with surrounding operations.
17800 In addition, if the vector body is a simple store to a decl that
17801 is elsewhere loaded using vld1, strongly prefer the vector form,
17802 to the extent of giving the prologue a zero cost. See the comment
17803 above m_stores_to_vector_load_decl for details. */
17806 && m_stp_sequence_cost
!= ~0U)
17808 if (m_stores_to_vector_load_decl
)
17809 m_costs
[vect_prologue
] = 0;
17810 else if (m_stp_sequence_cost
>= scalar_costs
->m_stp_sequence_cost
)
17811 m_costs
[vect_body
] = 2 * scalar_costs
->total_cost ();
17814 vector_costs::finish_cost (scalar_costs
);
17818 aarch64_vector_costs::
17819 better_main_loop_than_p (const vector_costs
*uncast_other
) const
17821 auto other
= static_cast<const aarch64_vector_costs
*> (uncast_other
);
17823 auto this_loop_vinfo
= as_a
<loop_vec_info
> (this->m_vinfo
);
17824 auto other_loop_vinfo
= as_a
<loop_vec_info
> (other
->m_vinfo
);
17826 if (dump_enabled_p ())
17827 dump_printf_loc (MSG_NOTE
, vect_location
,
17828 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17829 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17830 vect_vf_for_cost (this_loop_vinfo
),
17831 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17832 vect_vf_for_cost (other_loop_vinfo
));
17834 /* Apply the unrolling heuristic described above
17835 m_unrolled_advsimd_niters. */
17836 if (bool (m_unrolled_advsimd_stmts
)
17837 != bool (other
->m_unrolled_advsimd_stmts
))
17839 bool this_prefer_unrolled
= this->prefer_unrolled_loop ();
17840 bool other_prefer_unrolled
= other
->prefer_unrolled_loop ();
17841 if (this_prefer_unrolled
!= other_prefer_unrolled
)
17843 if (dump_enabled_p ())
17844 dump_printf_loc (MSG_NOTE
, vect_location
,
17845 "Preferring Advanced SIMD loop because"
17846 " it can be unrolled\n");
17847 return other_prefer_unrolled
;
17851 for (unsigned int i
= 0; i
< m_ops
.length (); ++i
)
17853 if (dump_enabled_p ())
17856 dump_printf_loc (MSG_NOTE
, vect_location
,
17857 "Reconsidering with subtuning %d\n", i
);
17858 dump_printf_loc (MSG_NOTE
, vect_location
,
17859 "Issue info for %s loop:\n",
17860 GET_MODE_NAME (this_loop_vinfo
->vector_mode
));
17861 this->m_ops
[i
].dump ();
17862 dump_printf_loc (MSG_NOTE
, vect_location
,
17863 "Issue info for %s loop:\n",
17864 GET_MODE_NAME (other_loop_vinfo
->vector_mode
));
17865 other
->m_ops
[i
].dump ();
17868 auto this_estimated_vf
= (vect_vf_for_cost (this_loop_vinfo
)
17869 * this->m_ops
[i
].vf_factor ());
17870 auto other_estimated_vf
= (vect_vf_for_cost (other_loop_vinfo
)
17871 * other
->m_ops
[i
].vf_factor ());
17873 /* If it appears that one loop could process the same amount of data
17874 in fewer cycles, prefer that loop over the other one. */
17875 fractional_cost this_cost
17876 = this->m_ops
[i
].min_cycles_per_iter () * other_estimated_vf
;
17877 fractional_cost other_cost
17878 = other
->m_ops
[i
].min_cycles_per_iter () * this_estimated_vf
;
17879 if (dump_enabled_p ())
17881 dump_printf_loc (MSG_NOTE
, vect_location
,
17882 "Weighted cycles per iteration of %s loop ~= %f\n",
17883 GET_MODE_NAME (this_loop_vinfo
->vector_mode
),
17884 this_cost
.as_double ());
17885 dump_printf_loc (MSG_NOTE
, vect_location
,
17886 "Weighted cycles per iteration of %s loop ~= %f\n",
17887 GET_MODE_NAME (other_loop_vinfo
->vector_mode
),
17888 other_cost
.as_double ());
17890 if (this_cost
!= other_cost
)
17892 if (dump_enabled_p ())
17893 dump_printf_loc (MSG_NOTE
, vect_location
,
17894 "Preferring loop with lower cycles"
17895 " per iteration\n");
17896 return this_cost
< other_cost
;
17899 /* If the issue rate of SVE code is limited by predicate operations
17900 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17901 and if Advanced SIMD code could issue within the limit imposed
17902 by the predicate operations, the predicate operations are adding an
17903 overhead that the original code didn't have and so we should prefer
17904 the Advanced SIMD version. */
17905 auto better_pred_limit_p
= [](const aarch64_vec_op_count
&a
,
17906 const aarch64_vec_op_count
&b
) -> bool
17908 if (a
.pred_ops
== 0
17909 && (b
.min_pred_cycles_per_iter ()
17910 > b
.min_nonpred_cycles_per_iter ()))
17912 if (dump_enabled_p ())
17913 dump_printf_loc (MSG_NOTE
, vect_location
,
17914 "Preferring Advanced SIMD loop since"
17915 " SVE loop is predicate-limited\n");
17920 if (better_pred_limit_p (this->m_ops
[i
], other
->m_ops
[i
]))
17922 if (better_pred_limit_p (other
->m_ops
[i
], this->m_ops
[i
]))
17926 return vector_costs::better_main_loop_than_p (other
);
17929 static void initialize_aarch64_code_model (struct gcc_options
*);
17931 /* Parse the TO_PARSE string and put the architecture struct that it
17932 selects into RES and the architectural features into ISA_FLAGS.
17933 Return an aarch_parse_opt_result describing the parse result.
17934 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17935 When the TO_PARSE string contains an invalid extension,
17936 a copy of the string is created and stored to INVALID_EXTENSION. */
17938 static enum aarch_parse_opt_result
17939 aarch64_parse_arch (const char *to_parse
, const struct processor
**res
,
17940 aarch64_feature_flags
*isa_flags
,
17941 std::string
*invalid_extension
)
17944 const struct processor
*arch
;
17947 ext
= strchr (to_parse
, '+');
17950 len
= ext
- to_parse
;
17952 len
= strlen (to_parse
);
17955 return AARCH_PARSE_MISSING_ARG
;
17958 /* Loop through the list of supported ARCHes to find a match. */
17959 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
17961 if (strlen (arch
->name
) == len
17962 && strncmp (arch
->name
, to_parse
, len
) == 0)
17964 auto isa_temp
= arch
->flags
;
17968 /* TO_PARSE string contains at least one extension. */
17969 enum aarch_parse_opt_result ext_res
17970 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
17972 if (ext_res
!= AARCH_PARSE_OK
)
17975 /* Extension parsing was successful. Confirm the result
17976 arch and ISA flags. */
17978 *isa_flags
= isa_temp
;
17979 return AARCH_PARSE_OK
;
17983 /* ARCH name not found in list. */
17984 return AARCH_PARSE_INVALID_ARG
;
17987 /* Parse the TO_PARSE string and put the result tuning in RES and the
17988 architecture flags in ISA_FLAGS. Return an aarch_parse_opt_result
17989 describing the parse result. If there is an error parsing, RES and
17990 ISA_FLAGS are left unchanged.
17991 When the TO_PARSE string contains an invalid extension,
17992 a copy of the string is created and stored to INVALID_EXTENSION. */
17994 static enum aarch_parse_opt_result
17995 aarch64_parse_cpu (const char *to_parse
, const struct processor
**res
,
17996 aarch64_feature_flags
*isa_flags
,
17997 std::string
*invalid_extension
)
18000 const struct processor
*cpu
;
18003 ext
= strchr (to_parse
, '+');
18006 len
= ext
- to_parse
;
18008 len
= strlen (to_parse
);
18011 return AARCH_PARSE_MISSING_ARG
;
18014 /* Loop through the list of supported CPUs to find a match. */
18015 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
18017 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, to_parse
, len
) == 0)
18019 auto isa_temp
= cpu
->flags
;
18023 /* TO_PARSE string contains at least one extension. */
18024 enum aarch_parse_opt_result ext_res
18025 = aarch64_parse_extension (ext
, &isa_temp
, invalid_extension
);
18027 if (ext_res
!= AARCH_PARSE_OK
)
18030 /* Extension parsing was successful. Confirm the result
18031 cpu and ISA flags. */
18033 *isa_flags
= isa_temp
;
18034 return AARCH_PARSE_OK
;
18038 /* CPU name not found in list. */
18039 return AARCH_PARSE_INVALID_ARG
;
/* Parse the TO_PARSE string and put the cpu it selects into RES.
   Return an aarch_parse_opt_result describing the parse result.
   If the parsing fails the RES does not change.  */

static enum aarch_parse_opt_result
aarch64_parse_tune (const char *to_parse, const struct processor **res)
{
  const struct processor *cpu;

  /* Loop through the list of supported CPUs to find a match.  */
  for (cpu = all_cores; cpu->name != NULL; cpu++)
    {
      if (strcmp (cpu->name, to_parse) == 0)
        {
          *res = cpu;
          return AARCH_PARSE_OK;
        }
    }

  /* CPU name not found in list.  */
  return AARCH_PARSE_INVALID_ARG;
}
/* Parse TOKEN, which has length LENGTH to see if it is an option
   described in FLAG.  If it is, return the index bit for that fusion type.
   If not, error (printing OPTION_NAME) and return zero.  */

static unsigned int
aarch64_parse_one_option_token (const char *token,
                                size_t length,
                                const struct aarch64_flag_desc *flag,
                                const char *option_name)
{
  for (; flag->name != NULL; flag++)
    {
      if (length == strlen (flag->name)
          && !strncmp (flag->name, token, length))
        return flag->flag;
    }

  error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
  return 0;
}
18086 /* Parse OPTION which is a comma-separated list of flags to enable.
18087 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
18088 default state we inherit from the CPU tuning structures. OPTION_NAME
18089 gives the top-level option we are parsing in the -moverride string,
18090 for use in error messages. */
18092 static unsigned int
18093 aarch64_parse_boolean_options (const char *option
,
18094 const struct aarch64_flag_desc
*flags
,
18095 unsigned int initial_state
,
18096 const char *option_name
)
18098 const char separator
= '.';
18099 const char* specs
= option
;
18100 const char* ntoken
= option
;
18101 unsigned int found_flags
= initial_state
;
18103 while ((ntoken
= strchr (specs
, separator
)))
18105 size_t token_length
= ntoken
- specs
;
18106 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
18110 /* If we find "none" (or, for simplicity's sake, an error) anywhere
18111 in the token stream, reset the supported operations. So:
18113 adrp+add.cmp+branch.none.adrp+add
18115 would have the result of turning on only adrp+add fusion. */
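/* As an illustration (the exact flag names come from aarch64_fusible_pairs):
   a command line such as -moverride=fuse=adrp+add.cmp+branch would accept
   "adrp+add" and "cmp+branch" as two '.'-separated tokens and OR their
   index bits into found_flags.  */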
18119 found_flags
|= token_ops
;
18123 /* We ended with a comma, print something. */
18126 error ("%qs string ill-formed", option_name
);
18130 /* We still have one more token to parse. */
18131 size_t token_length
= strlen (specs
);
18132 unsigned token_ops
= aarch64_parse_one_option_token (specs
,
18139 found_flags
|= token_ops
;
18140 return found_flags
;
18143 /* Support for overriding instruction fusion. */
18146 aarch64_parse_fuse_string (const char *fuse_string
,
18147 struct tune_params
*tune
)
18149 tune
->fusible_ops
= aarch64_parse_boolean_options (fuse_string
,
18150 aarch64_fusible_pairs
,
18155 /* Support for overriding other tuning flags. */
18158 aarch64_parse_tune_string (const char *tune_string
,
18159 struct tune_params
*tune
)
18161 tune
->extra_tuning_flags
18162 = aarch64_parse_boolean_options (tune_string
,
18163 aarch64_tuning_flags
,
18164 tune
->extra_tuning_flags
,
18168 /* Parse the sve_width tuning moverride string in TUNE_STRING.
18169 Accept the valid SVE vector widths allowed by
18170 aarch64_sve_vector_bits_enum and use it to override sve_width
18174 aarch64_parse_sve_width_string (const char *tune_string
,
18175 struct tune_params
*tune
)
18179 int n
= sscanf (tune_string
, "%d", &width
);
18182 error ("invalid format for %<sve_width%>");
18194 error ("invalid %<sve_width%> value: %d", width
);
18196 tune
->sve_width
= (enum aarch64_sve_vector_bits_enum
) width
;
18199 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
18200 we understand. If it is, extract the option string and hand it off to
18201 the appropriate function. */
18204 aarch64_parse_one_override_token (const char* token
,
18206 struct tune_params
*tune
)
18208 const struct aarch64_tuning_override_function
*fn
18209 = aarch64_tuning_override_functions
;
18211 const char *option_part
= strchr (token
, '=');
18214 error ("tuning string missing in option (%s)", token
);
18218 /* Get the length of the option name. */
18219 length
= option_part
- token
;
18220 /* Skip the '=' to get to the option string. */
18223 for (; fn
->name
!= NULL
; fn
++)
18225 if (!strncmp (fn
->name
, token
, length
))
18227 fn
->parse_override (option_part
, tune
);
18232 error ("unknown tuning option (%s)",token
);
18236 /* A checking mechanism for the implementation of the tls size. */
18239 initialize_aarch64_tls_size (struct gcc_options
*opts
)
18241 if (aarch64_tls_size
== 0)
18242 aarch64_tls_size
= 24;
18244 switch (opts
->x_aarch64_cmodel_var
)
18246 case AARCH64_CMODEL_TINY
:
18247 /* Both the default and maximum TLS size allowed under tiny is 1M which
18248 needs two instructions to address, so we clamp the size to 24. */
18249 if (aarch64_tls_size
> 24)
18250 aarch64_tls_size
= 24;
18252 case AARCH64_CMODEL_SMALL
:
18253 /* The maximum TLS size allowed under small is 4G. */
18254 if (aarch64_tls_size
> 32)
18255 aarch64_tls_size
= 32;
18257 case AARCH64_CMODEL_LARGE
:
18258 /* The maximum TLS size allowed under large is 16E.
18259 FIXME: 16E should be 64bit, we only support 48bit offset now. */
18260 if (aarch64_tls_size
> 48)
18261 aarch64_tls_size
= 48;
18264 gcc_unreachable ();
/* Return the CPU corresponding to the enum CPU.  */

static const struct processor *
aarch64_get_tune_cpu (enum aarch64_processor cpu)
{
  gcc_assert (cpu != aarch64_none);

  return &all_cores[cpu];
}

/* Return the architecture corresponding to the enum ARCH.  */

static const struct processor *
aarch64_get_arch (enum aarch64_arch arch)
{
  gcc_assert (arch != aarch64_no_arch);

  return &all_architectures[arch];
}
/* Parse STRING looking for options in the format:
     string    :: option:string
     option    :: name=substring
     name      :: {a-z}
     substring :: defined by option.  */
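/* For example (illustrative; the option names correspond to entries in
   aarch64_tuning_override_functions): -moverride=fuse=adrp+add:sve_width=256
   would be split at ':' into two name=substring options, handed to
   aarch64_parse_fuse_string and aarch64_parse_sve_width_string
   respectively.  */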
18297 aarch64_parse_override_string (const char* input_string
,
18298 struct tune_params
* tune
)
18300 const char separator
= ':';
18301 size_t string_length
= strlen (input_string
) + 1;
18302 char *string_root
= (char *) xmalloc (sizeof (*string_root
) * string_length
);
18303 char *string
= string_root
;
18304 strncpy (string
, input_string
, string_length
);
18305 string
[string_length
- 1] = '\0';
18307 char* ntoken
= string
;
18309 while ((ntoken
= strchr (string
, separator
)))
18311 size_t token_length
= ntoken
- string
;
18312 /* Make this substring look like a string. */
18314 aarch64_parse_one_override_token (string
, token_length
, tune
);
18318 /* One last option to parse. */
18319 aarch64_parse_one_override_token (string
, strlen (string
), tune
);
18320 free (string_root
);
18323 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18324 are best for a generic target with the currently-enabled architecture
18327 aarch64_adjust_generic_arch_tuning (struct tune_params
¤t_tune
)
18329 /* Neoverse V1 is the only core that is known to benefit from
18330 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18331 point enabling it for SVE2 and above. */
18333 current_tune
.extra_tuning_flags
18334 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
;
18338 aarch64_override_options_after_change_1 (struct gcc_options
*opts
)
18340 /* PR 70044: We have to be careful about being called multiple times for the
18341 same function. This means all changes should be repeatable. */
18343 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18344 Disable the frame pointer flag so the mid-end will not use a frame
18345 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18346 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18347 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18348 aarch64_use_frame_pointer
= opts
->x_flag_omit_frame_pointer
!= 1;
18349 if (opts
->x_flag_omit_frame_pointer
== 0)
18350 opts
->x_flag_omit_frame_pointer
= 2;
18352 /* If not optimizing for size, set the default
18353 alignment to what the target wants. */
18354 if (!opts
->x_optimize_size
)
18356 if (opts
->x_flag_align_loops
&& !opts
->x_str_align_loops
)
18357 opts
->x_str_align_loops
= aarch64_tune_params
.loop_align
;
18358 if (opts
->x_flag_align_jumps
&& !opts
->x_str_align_jumps
)
18359 opts
->x_str_align_jumps
= aarch64_tune_params
.jump_align
;
18360 if (opts
->x_flag_align_functions
&& !opts
->x_str_align_functions
)
18361 opts
->x_str_align_functions
= aarch64_tune_params
.function_align
;
18364 /* We default to no pc-relative literal loads. */
18366 aarch64_pcrelative_literal_loads
= false;
18368 /* If -mpc-relative-literal-loads is set on the command line, this
18369 implies that the user asked for PC relative literal loads. */
18370 if (opts
->x_pcrelative_literal_loads
== 1)
18371 aarch64_pcrelative_literal_loads
= true;
18373 /* In the tiny memory model it makes no sense to disallow PC relative
18374 literal pool loads. */
18375 if (aarch64_cmodel
== AARCH64_CMODEL_TINY
18376 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
18377 aarch64_pcrelative_literal_loads
= true;
18379 /* When enabling the lower precision Newton series for the square root, also
18380 enable it for the reciprocal square root, since the latter is an
18381 intermediary step for the former. */
18382 if (flag_mlow_precision_sqrt
)
18383 flag_mrecip_low_precision_sqrt
= true;
18386 /* 'Unpack' up the internal tuning structs and update the options
18387 in OPTS. The caller must have set up selected_tune and selected_arch
18388 as all the other target-specific codegen decisions are
18389 derived from them. */
18392 aarch64_override_options_internal (struct gcc_options
*opts
)
18394 const struct processor
*tune
= aarch64_get_tune_cpu (opts
->x_selected_tune
);
18395 aarch64_tune
= tune
->sched_core
;
18396 /* Make a copy of the tuning parameters attached to the core, which
18397 we may later overwrite. */
18398 aarch64_tune_params
= *(tune
->tune
);
18399 if (tune
->tune
== &generic_tunings
)
18400 aarch64_adjust_generic_arch_tuning (aarch64_tune_params
);
18402 if (opts
->x_aarch64_override_tune_string
)
18403 aarch64_parse_override_string (opts
->x_aarch64_override_tune_string
,
18404 &aarch64_tune_params
);
18406 if (opts
->x_aarch64_ldp_policy_param
)
18407 aarch64_tune_params
.ldp_policy_model
= opts
->x_aarch64_ldp_policy_param
;
18409 if (opts
->x_aarch64_stp_policy_param
)
18410 aarch64_tune_params
.stp_policy_model
= opts
->x_aarch64_stp_policy_param
;
18412 /* This target defaults to strict volatile bitfields. */
18413 if (opts
->x_flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
18414 opts
->x_flag_strict_volatile_bitfields
= 1;
18416 if (aarch64_stack_protector_guard
== SSP_GLOBAL
18417 && opts
->x_aarch64_stack_protector_guard_offset_str
)
18419 error ("incompatible options %<-mstack-protector-guard=global%> and "
18420 "%<-mstack-protector-guard-offset=%s%>",
18421 aarch64_stack_protector_guard_offset_str
);
18424 if (aarch64_stack_protector_guard
== SSP_SYSREG
18425 && !(opts
->x_aarch64_stack_protector_guard_offset_str
18426 && opts
->x_aarch64_stack_protector_guard_reg_str
))
18428 error ("both %<-mstack-protector-guard-offset%> and "
18429 "%<-mstack-protector-guard-reg%> must be used "
18430 "with %<-mstack-protector-guard=sysreg%>");
18433 if (opts
->x_aarch64_stack_protector_guard_reg_str
)
18435 if (strlen (opts
->x_aarch64_stack_protector_guard_reg_str
) > 100)
18436 error ("specify a system register with a small string length");
18439 if (opts
->x_aarch64_stack_protector_guard_offset_str
)
18442 const char *str
= aarch64_stack_protector_guard_offset_str
;
18444 long offs
= strtol (aarch64_stack_protector_guard_offset_str
, &end
, 0);
18445 if (!*str
|| *end
|| errno
)
18446 error ("%qs is not a valid offset in %qs", str
,
18447 "-mstack-protector-guard-offset=");
18448 aarch64_stack_protector_guard_offset
= offs
;
18451 if ((flag_sanitize
& SANITIZE_SHADOW_CALL_STACK
)
18452 && !fixed_regs
[R18_REGNUM
])
18453 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18455 aarch64_feature_flags isa_flags
= aarch64_get_isa_flags (opts
);
18456 if ((isa_flags
& (AARCH64_FL_SM_ON
| AARCH64_FL_ZA_ON
))
18457 && !(isa_flags
& AARCH64_FL_SME
))
18459 if (isa_flags
& AARCH64_FL_SM_ON
)
18460 error ("streaming functions require the ISA extension %qs", "sme");
18462 error ("functions with SME state require the ISA extension %qs",
18464 inform (input_location
, "you can enable %qs using the command-line"
18465 " option %<-march%>, or by using the %<target%>"
18466 " attribute or pragma", "sme");
18467 opts
->x_target_flags
&= ~MASK_GENERAL_REGS_ONLY
;
18468 auto new_flags
= isa_flags
| feature_deps::SME ().enable
;
18469 aarch64_set_asm_isa_flags (opts
, new_flags
);
18472 initialize_aarch64_code_model (opts
);
18473 initialize_aarch64_tls_size (opts
);
18474 aarch64_tpidr_register
= opts
->x_aarch64_tpidr_reg
;
18476 int queue_depth
= 0;
18477 switch (aarch64_tune_params
.autoprefetcher_model
)
18479 case tune_params::AUTOPREFETCHER_OFF
:
18482 case tune_params::AUTOPREFETCHER_WEAK
:
18485 case tune_params::AUTOPREFETCHER_STRONG
:
18486 queue_depth
= max_insn_queue_index
+ 1;
18489 gcc_unreachable ();
18492 /* We don't mind passing in global_options_set here as we don't use
18493 the *options_set structs anyway. */
18494 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18495 param_sched_autopref_queue_depth
, queue_depth
);
18497 /* Set up parameters to be used in prefetching algorithm. Do not
18498 override the defaults unless we are tuning for a core we have
18499 researched values for. */
18500 if (aarch64_tune_params
.prefetch
->num_slots
> 0)
18501 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18502 param_simultaneous_prefetches
,
18503 aarch64_tune_params
.prefetch
->num_slots
);
18504 if (aarch64_tune_params
.prefetch
->l1_cache_size
>= 0)
18505 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18506 param_l1_cache_size
,
18507 aarch64_tune_params
.prefetch
->l1_cache_size
);
18508 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18509 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18510 param_l1_cache_line_size
,
18511 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18513 if (aarch64_tune_params
.prefetch
->l1_cache_line_size
>= 0)
18515 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18516 param_destruct_interfere_size
,
18517 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18518 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18519 param_construct_interfere_size
,
18520 aarch64_tune_params
.prefetch
->l1_cache_line_size
);
18524 /* For a generic AArch64 target, cover the current range of cache line
18526 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18527 param_destruct_interfere_size
,
18529 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18530 param_construct_interfere_size
,
18534 if (aarch64_tune_params
.prefetch
->l2_cache_size
>= 0)
18535 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18536 param_l2_cache_size
,
18537 aarch64_tune_params
.prefetch
->l2_cache_size
);
18538 if (!aarch64_tune_params
.prefetch
->prefetch_dynamic_strides
)
18539 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18540 param_prefetch_dynamic_strides
, 0);
18541 if (aarch64_tune_params
.prefetch
->minimum_stride
>= 0)
18542 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18543 param_prefetch_minimum_stride
,
18544 aarch64_tune_params
.prefetch
->minimum_stride
);
18546 /* Use the alternative scheduling-pressure algorithm by default. */
18547 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18548 param_sched_pressure_algorithm
,
18549 SCHED_PRESSURE_MODEL
);
18551 /* Validate the guard size. */
18552 int guard_size
= param_stack_clash_protection_guard_size
;
18554 if (guard_size
!= 12 && guard_size
!= 16)
18555 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18556 "size. Given value %d (%llu KB) is out of range",
18557 guard_size
, (1ULL << guard_size
) / 1024ULL);
18559 /* Enforce that interval is the same size as size so the mid-end does the
18561 SET_OPTION_IF_UNSET (opts
, &global_options_set
,
18562 param_stack_clash_protection_probe_interval
,
18565 /* The maybe_set calls won't update the value if the user has explicitly set
18566 one. Which means we need to validate that probing interval and guard size
18569 = param_stack_clash_protection_probe_interval
;
18570 if (guard_size
!= probe_interval
)
18571 error ("stack clash guard size %<%d%> must be equal to probing interval "
18572 "%<%d%>", guard_size
, probe_interval
);
18574 /* Enable sw prefetching at specified optimization level for
18575 CPUS that have prefetch. Lower optimization level threshold by 1
18576 when profiling is enabled. */
18577 if (opts
->x_flag_prefetch_loop_arrays
< 0
18578 && !opts
->x_optimize_size
18579 && aarch64_tune_params
.prefetch
->default_opt_level
>= 0
18580 && opts
->x_optimize
>= aarch64_tune_params
.prefetch
->default_opt_level
)
18581 opts
->x_flag_prefetch_loop_arrays
= 1;
18583 /* Avoid loop-dependent FMA chains. */
18584 if (aarch64_tune_params
.extra_tuning_flags
18585 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA
)
18586 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_avoid_fma_max_bits
,
18589 /* Consider fully pipelined FMA in reassociation. */
18590 if (aarch64_tune_params
.extra_tuning_flags
18591 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA
)
18592 SET_OPTION_IF_UNSET (opts
, &global_options_set
, param_fully_pipelined_fma
,
18595 aarch64_override_options_after_change_1 (opts
);
18598 /* Print a hint with a suggestion for a core or architecture name that
18599 most closely resembles what the user passed in STR. ARCH is true if
18600 the user is asking for an architecture name. ARCH is false if the user
18601 is asking for a core name. */
18604 aarch64_print_hint_for_core_or_arch (const char *str
, bool arch
)
18606 auto_vec
<const char *> candidates
;
18607 const struct processor
*entry
= arch
? all_architectures
: all_cores
;
18608 for (; entry
->name
!= NULL
; entry
++)
18609 candidates
.safe_push (entry
->name
);
18611 #ifdef HAVE_LOCAL_CPU_DETECT
18612 /* Add also "native" as possible value. */
18614 candidates
.safe_push ("native");
18618 const char *hint
= candidates_list_and_hint (str
, s
, candidates
);
18620 inform (input_location
, "valid arguments are: %s;"
18621 " did you mean %qs?", s
, hint
);
18623 inform (input_location
, "valid arguments are: %s", s
);
/* Print a hint with a suggestion for a core name that most closely resembles
   what the user passed in STR.  */

inline static void
aarch64_print_hint_for_core (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, false);
}

/* Print a hint with a suggestion for an architecture name that most closely
   resembles what the user passed in STR.  */

inline static void
aarch64_print_hint_for_arch (const char *str)
{
  aarch64_print_hint_for_core_or_arch (str, true);
}
18647 /* Print a hint with a suggestion for an extension name
18648 that most closely resembles what the user passed in STR. */
18651 aarch64_print_hint_for_extensions (const std::string
&str
)
18653 auto_vec
<const char *> candidates
;
18654 aarch64_get_all_extension_candidates (&candidates
);
18656 const char *hint
= candidates_list_and_hint (str
.c_str (), s
, candidates
);
18658 inform (input_location
, "valid arguments are: %s;"
18659 " did you mean %qs?", s
, hint
);
18661 inform (input_location
, "valid arguments are: %s", s
);
18666 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18667 specified in STR and throw errors if appropriate. Put the results if
18668 they are valid in RES and ISA_FLAGS. Return whether the option is
18672 aarch64_validate_mcpu (const char *str
, const struct processor
**res
,
18673 aarch64_feature_flags
*isa_flags
)
18675 std::string invalid_extension
;
18676 enum aarch_parse_opt_result parse_res
18677 = aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18679 if (parse_res
== AARCH_PARSE_OK
)
18684 case AARCH_PARSE_MISSING_ARG
:
18685 error ("missing cpu name in %<-mcpu=%s%>", str
);
18687 case AARCH_PARSE_INVALID_ARG
:
18688 error ("unknown value %qs for %<-mcpu%>", str
);
18689 aarch64_print_hint_for_core (str
);
18690 /* A common user error is confusing -march and -mcpu.
18691 If the -mcpu string matches a known architecture then suggest
18693 parse_res
= aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18694 if (parse_res
== AARCH_PARSE_OK
)
18695 inform (input_location
, "did you mean %<-march=%s%>?", str
);
18697 case AARCH_PARSE_INVALID_FEATURE
:
18698 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18699 invalid_extension
.c_str (), str
);
18700 aarch64_print_hint_for_extensions (invalid_extension
);
18703 gcc_unreachable ();
/* Straight line speculation indicators.  */
enum aarch64_sls_hardening_type
{
  SLS_NONE = 0,
  SLS_RETBR = 1,
  SLS_BLR = 2,
  SLS_ALL = 3,
};
static enum aarch64_sls_hardening_type aarch64_sls_hardening;

/* Return whether we should mitigate Straight Line Speculation for the RET
   and BR instructions.  */
bool
aarch64_harden_sls_retbr_p (void)
{
  return aarch64_sls_hardening & SLS_RETBR;
}

/* Return whether we should mitigate Straight Line Speculation for the BLR
   instruction.  */
bool
aarch64_harden_sls_blr_p (void)
{
  return aarch64_sls_hardening & SLS_BLR;
}
18735 /* As of yet we only allow setting these options globally, in the future we may
18736 allow setting them per function. */
18738 aarch64_validate_sls_mitigation (const char *const_str
)
18740 char *token_save
= NULL
;
18743 if (strcmp (const_str
, "none") == 0)
18745 aarch64_sls_hardening
= SLS_NONE
;
18748 if (strcmp (const_str
, "all") == 0)
18750 aarch64_sls_hardening
= SLS_ALL
;
18754 char *str_root
= xstrdup (const_str
);
18755 str
= strtok_r (str_root
, ",", &token_save
);
18757 error ("invalid argument given to %<-mharden-sls=%>");
18759 int temp
= SLS_NONE
;
18762 if (strcmp (str
, "blr") == 0)
18764 else if (strcmp (str
, "retbr") == 0)
18766 else if (strcmp (str
, "none") == 0 || strcmp (str
, "all") == 0)
18768 error ("%qs must be by itself for %<-mharden-sls=%>", str
);
18773 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str
);
18776 str
= strtok_r (NULL
, ",", &token_save
);
18778 aarch64_sls_hardening
= (aarch64_sls_hardening_type
) temp
;
18782 /* Validate a command-line -march option. Parse the arch and extensions
18783 (if any) specified in STR and throw errors if appropriate. Put the
18784 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18785 option is valid. */
18788 aarch64_validate_march (const char *str
, const struct processor
**res
,
18789 aarch64_feature_flags
*isa_flags
)
18791 std::string invalid_extension
;
18792 enum aarch_parse_opt_result parse_res
18793 = aarch64_parse_arch (str
, res
, isa_flags
, &invalid_extension
);
18795 if (parse_res
== AARCH_PARSE_OK
)
18800 case AARCH_PARSE_MISSING_ARG
:
18801 error ("missing arch name in %<-march=%s%>", str
);
18803 case AARCH_PARSE_INVALID_ARG
:
18804 error ("unknown value %qs for %<-march%>", str
);
18805 aarch64_print_hint_for_arch (str
);
18806 /* A common user error is confusing -march and -mcpu.
18807 If the -march string matches a known CPU suggest -mcpu. */
18808 parse_res
= aarch64_parse_cpu (str
, res
, isa_flags
, &invalid_extension
);
18809 if (parse_res
== AARCH_PARSE_OK
)
18810 inform (input_location
, "did you mean %<-mcpu=%s%>?", str
);
18812 case AARCH_PARSE_INVALID_FEATURE
:
18813 error ("invalid feature modifier %qs in %<-march=%s%>",
18814 invalid_extension
.c_str (), str
);
18815 aarch64_print_hint_for_extensions (invalid_extension
);
18818 gcc_unreachable ();
18824 /* Validate a command-line -mtune option. Parse the cpu
18825 specified in STR and throw errors if appropriate. Put the
18826 result, if it is valid, in RES. Return whether the option is
18830 aarch64_validate_mtune (const char *str
, const struct processor
**res
)
18832 enum aarch_parse_opt_result parse_res
18833 = aarch64_parse_tune (str
, res
);
18835 if (parse_res
== AARCH_PARSE_OK
)
18840 case AARCH_PARSE_MISSING_ARG
:
18841 error ("missing cpu name in %<-mtune=%s%>", str
);
18843 case AARCH_PARSE_INVALID_ARG
:
18844 error ("unknown value %qs for %<-mtune%>", str
);
18845 aarch64_print_hint_for_core (str
);
18848 gcc_unreachable ();
/* Return the VG value associated with -msve-vector-bits= value VALUE.  */

static poly_uint16
aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
{
  /* 128-bit SVE and Advanced SIMD modes use different register layouts
     on big-endian targets, so we would need to forbid subregs that convert
     from one to the other.  By default a reinterpret sequence would then
     involve a store to memory in one mode and a load back in the other.
     Even if we optimize that sequence using reverse instructions,
     it would still be a significant potential overhead.

     For now, it seems better to generate length-agnostic code for that
     case instead.  */
  if (value == SVE_SCALABLE
      || (value == SVE_128 && BYTES_BIG_ENDIAN))
    return poly_uint16 (2, 2);

  return (int) value / 64;
}
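/* For example, -msve-vector-bits=256 gives (int) SVE_256 / 64 = 4, i.e.
   VG = 4, since VG counts the 64-bit granules in an SVE vector;
   SVE_SCALABLE (and 128-bit big-endian) instead yields the
   poly_uint16 (2, 2) used for length-agnostic code.  */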
/* Set the global aarch64_asm_isa_flags to FLAGS and update
   aarch64_isa_flags accordingly.  */

void
aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
{
  aarch64_set_asm_isa_flags (&global_options, flags);
}

static void
aarch64_handle_no_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
  aarch_enable_bti = 0;
}

static void
aarch64_handle_standard_branch_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch_enable_bti = 1;
}

static void
aarch64_handle_pac_ret_protection (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
}

static void
aarch64_handle_pac_ret_leaf (void)
{
  aarch_ra_sign_scope = AARCH_FUNCTION_ALL;
}

static void
aarch64_handle_pac_ret_b_key (void)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
}

static void
aarch64_handle_bti_protection (void)
{
  aarch_enable_bti = 1;
}
static const struct aarch_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", false, aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", false, aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};

static const struct aarch_branch_protect_type aarch64_branch_protect_types[] =
{
  { "none", true, aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", true, aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", false, aarch64_handle_pac_ret_protection,
    aarch64_pac_ret_subtypes, ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", false, aarch64_handle_bti_protection, NULL, 0 },
  { NULL, false, NULL, NULL, 0 }
};
18939 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18940 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18941 tuning structs. In particular it must set selected_tune and
18942 aarch64_asm_isa_flags that define the available ISA features and tuning
18943 decisions. It must also set selected_arch as this will be used to
18944 output the .arch asm tags for each function. */
18947 aarch64_override_options (void)
18949 aarch64_feature_flags cpu_isa
= 0;
18950 aarch64_feature_flags arch_isa
= 0;
18951 aarch64_set_asm_isa_flags (0);
18953 const struct processor
*cpu
= NULL
;
18954 const struct processor
*arch
= NULL
;
18955 const struct processor
*tune
= NULL
;
18957 if (aarch64_harden_sls_string
)
18958 aarch64_validate_sls_mitigation (aarch64_harden_sls_string
);
18960 if (aarch64_branch_protection_string
)
18961 aarch_validate_mbranch_protection (aarch64_branch_protect_types
,
18962 aarch64_branch_protection_string
,
18963 "-mbranch-protection=");
18965 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18966 If either of -march or -mtune is given, they override their
18967 respective component of -mcpu. */
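/* For example (illustrative): -mcpu=cortex-a76 -mtune=cortex-a55 selects
   the architecture and ISA features implied by cortex-a76 but the tuning
   tables of cortex-a55, while -mcpu=cortex-a76 -march=armv8-a keeps the
   cortex-a76 tuning and takes the ISA flags from -march (subject to the
   compatibility warning below).  */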
18968 if (aarch64_cpu_string
)
18969 aarch64_validate_mcpu (aarch64_cpu_string
, &cpu
, &cpu_isa
);
18971 if (aarch64_arch_string
)
18972 aarch64_validate_march (aarch64_arch_string
, &arch
, &arch_isa
);
18974 if (aarch64_tune_string
)
18975 aarch64_validate_mtune (aarch64_tune_string
, &tune
);
18977 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18978 SUBTARGET_OVERRIDE_OPTIONS
;
18983 /* If both -mcpu and -march are specified, warn if they are not
18984 feature compatible. Feature compatible means that the inclusion of the
18985 cpu features would not end up disabling an architecture feature. In
18986 other words the cpu features need to be a strict superset of the arch
18987 features; if they are not, the -march ISA flags are preferred. */
18988 auto full_arch_flags
= arch
->flags
| arch_isa
;
18989 auto full_cpu_flags
= cpu
->flags
| cpu_isa
;
18990 if (~full_cpu_flags
& full_arch_flags
)
18992 std::string ext_diff
18993 = aarch64_get_extension_string_for_isa_flags (full_arch_flags
,
18995 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18996 "and resulted in options %<%s%> being added",
18997 aarch64_cpu_string
,
18998 aarch64_arch_string
,
18999 ext_diff
.c_str ());
19002 selected_arch
= arch
->arch
;
19003 aarch64_set_asm_isa_flags (arch_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19007 selected_arch
= cpu
->arch
;
19008 aarch64_set_asm_isa_flags (cpu_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19012 cpu
= &all_cores
[arch
->ident
];
19013 selected_arch
= arch
->arch
;
19014 aarch64_set_asm_isa_flags (arch_isa
| AARCH64_FL_DEFAULT_ISA_MODE
);
19018 /* No -mcpu or -march specified, so use the default CPU. */
19019 cpu
= &all_cores
[TARGET_CPU_DEFAULT
];
19020 selected_arch
= cpu
->arch
;
19021 aarch64_set_asm_isa_flags (cpu
->flags
| AARCH64_FL_DEFAULT_ISA_MODE
);
19024 selected_tune
= tune
? tune
->ident
: cpu
->ident
;
19026 if (aarch_enable_bti
== 2)
19028 #ifdef TARGET_ENABLE_BTI
19029 aarch_enable_bti
= 1;
19031 aarch_enable_bti
= 0;
19035 /* Return address signing is currently not supported for ILP32 targets. For
19036 LP64 targets use the configured option in the absence of a command-line
19037 option for -mbranch-protection. */
19038 if (!TARGET_ILP32
&& aarch64_branch_protection_string
== NULL
)
19040 #ifdef TARGET_ENABLE_PAC_RET
19041 aarch_ra_sign_scope
= AARCH_FUNCTION_NON_LEAF
;
19043 aarch_ra_sign_scope
= AARCH_FUNCTION_NONE
;
19047 #ifndef HAVE_AS_MABI_OPTION
19048 /* The compiler may have been configured with 2.23.* binutils, which does
19049 not have support for ILP32. */
19051 error ("assembler does not support %<-mabi=ilp32%>");
19054 /* Convert -msve-vector-bits to a VG count. */
19055 aarch64_sve_vg
= aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits
);
19057 if (aarch_ra_sign_scope
!= AARCH_FUNCTION_NONE
&& TARGET_ILP32
)
19058 sorry ("return address signing is only supported for %<-mabi=lp64%>");
19060 /* The pass to insert speculation tracking runs before
19061 shrink-wrapping and the latter does not know how to update the
19062 tracking status. So disable it in this case. */
19063 if (aarch64_track_speculation
)
19064 flag_shrink_wrap
= 0;
19066 aarch64_override_options_internal (&global_options
);
19068 /* Save these options as the default ones in case we push and pop them later
19069 while processing functions with potential target attributes. */
19070 target_option_default_node
= target_option_current_node
19071 = build_target_option_node (&global_options
, &global_options_set
);
19074 /* Implement targetm.override_options_after_change. */
19077 aarch64_override_options_after_change (void)
19079 aarch64_override_options_after_change_1 (&global_options
);
19082 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
19084 aarch64_offload_options (void)
19087 return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32");
19089 return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64");
19092 static struct machine_function
*
19093 aarch64_init_machine_status (void)
19095 struct machine_function
*machine
;
19096 machine
= ggc_cleared_alloc
<machine_function
> ();
19101 aarch64_init_expanders (void)
19103 init_machine_status
= aarch64_init_machine_status
;
19106 /* A checking mechanism for the implementation of the various code models. */
19108 initialize_aarch64_code_model (struct gcc_options
*opts
)
19110 aarch64_cmodel
= opts
->x_aarch64_cmodel_var
;
19111 switch (opts
->x_aarch64_cmodel_var
)
19113 case AARCH64_CMODEL_TINY
:
19114 if (opts
->x_flag_pic
)
19115 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
19117 case AARCH64_CMODEL_SMALL
:
19118 if (opts
->x_flag_pic
)
19120 #ifdef HAVE_AS_SMALL_PIC_RELOCS
19121 aarch64_cmodel
= (flag_pic
== 2
19122 ? AARCH64_CMODEL_SMALL_PIC
19123 : AARCH64_CMODEL_SMALL_SPIC
);
19125 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
19129 case AARCH64_CMODEL_LARGE
:
19130 if (opts
->x_flag_pic
)
19131 sorry ("code model %qs with %<-f%s%>", "large",
19132 opts
->x_flag_pic
> 1 ? "PIC" : "pic");
19133 if (opts
->x_aarch64_abi
== AARCH64_ABI_ILP32
)
19134 sorry ("code model %qs not supported in ilp32 mode", "large");
19136 case AARCH64_CMODEL_TINY_PIC
:
19137 case AARCH64_CMODEL_SMALL_PIC
:
19138 case AARCH64_CMODEL_SMALL_SPIC
:
19139 gcc_unreachable ();
19143 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
19144 using the information saved in PTR. */
19147 aarch64_option_restore (struct gcc_options
*opts
,
19148 struct gcc_options
* /* opts_set */,
19149 struct cl_target_option
* /* ptr */)
19151 aarch64_override_options_internal (opts
);
19154 /* Implement TARGET_OPTION_PRINT. */
19157 aarch64_option_print (FILE *file
, int indent
, struct cl_target_option
*ptr
)
19159 const struct processor
*cpu
19160 = aarch64_get_tune_cpu (ptr
->x_selected_tune
);
19161 const struct processor
*arch
= aarch64_get_arch (ptr
->x_selected_arch
);
19162 aarch64_feature_flags isa_flags
= aarch64_get_asm_isa_flags(ptr
);
19163 std::string extension
19164 = aarch64_get_extension_string_for_isa_flags (isa_flags
, arch
->flags
);
19166 fprintf (file
, "%*sselected tune = %s\n", indent
, "", cpu
->name
);
19167 fprintf (file
, "%*sselected arch = %s%s\n", indent
, "",
19168 arch
->name
, extension
.c_str ());
19171 static GTY(()) tree aarch64_previous_fndecl
;
19174 aarch64_reset_previous_fndecl (void)
19176 aarch64_previous_fndecl
= NULL
;
19179 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
19180 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
19181 make sure optab availability predicates are recomputed when necessary. */
19184 aarch64_save_restore_target_globals (tree new_tree
)
19186 if (TREE_TARGET_GLOBALS (new_tree
))
19187 restore_target_globals (TREE_TARGET_GLOBALS (new_tree
));
19188 else if (new_tree
== target_option_default_node
)
19189 restore_target_globals (&default_target_globals
);
19191 TREE_TARGET_GLOBALS (new_tree
) = save_target_globals_default_opts ();
19194 /* Return the target_option_node for FNDECL, or the current options
19195 if FNDECL is null. */
19198 aarch64_fndecl_options (tree fndecl
)
19201 return target_option_current_node
;
19203 if (tree options
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
))
19206 return target_option_default_node
;
/* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
   like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
   of the function, if such exists.  This function may be called multiple
   times on a single function so use aarch64_previous_fndecl to avoid
   setting up identical state.  */

static void
aarch64_set_current_function (tree fndecl)
{
  tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
  tree new_tree = aarch64_fndecl_options (fndecl);

  auto new_isa_mode = (fndecl
		       ? aarch64_fndecl_isa_mode (fndecl)
		       : AARCH64_DEFAULT_ISA_MODE);
  auto isa_flags = aarch64_get_isa_flags (TREE_TARGET_OPTION (new_tree));

  static bool reported_zt0_p;
  if (!reported_zt0_p
      && !(isa_flags & AARCH64_FL_SME2)
      && fndecl
      && aarch64_fndecl_has_state (fndecl, "zt0"))
    {
      error ("functions with %qs state require the ISA extension %qs",
	     "zt0", "sme2");
      inform (input_location, "you can enable %qs using the command-line"
	      " option %<-march%>, or by using the %<target%>"
	      " attribute or pragma", "sme2");
      reported_zt0_p = true;
    }

  /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
     the default have been handled by aarch64_save_restore_target_globals from
     aarch64_pragma_target_parse.  */
  if (old_tree == new_tree
      && (!fndecl || aarch64_previous_fndecl)
      && (isa_flags & AARCH64_FL_ISA_MODES).val[0] == new_isa_mode)
    {
      gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
      return;
    }

  aarch64_previous_fndecl = fndecl;

  /* First set the target options.  */
  cl_target_option_restore (&global_options, &global_options_set,
			    TREE_TARGET_OPTION (new_tree));

  /* The ISA mode can vary based on function type attributes and
     function declaration attributes.  Make sure that the target
     options correctly reflect these attributes.  */
  if ((isa_flags & AARCH64_FL_ISA_MODES).val[0] != new_isa_mode)
    {
      auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
      aarch64_set_asm_isa_flags (base_flags
				 | aarch64_feature_flags (new_isa_mode));

      aarch64_override_options_internal (&global_options);
      new_tree = build_target_option_node (&global_options,
					   &global_options_set);
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;

      tree new_optimize = build_optimization_node (&global_options,
						    &global_options_set);
      if (new_optimize != optimization_default_node)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  aarch64_save_restore_target_globals (new_tree);

  gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
}
/* Enum describing the various ways we can handle attributes.
   In many cases we can reuse the generic option handling machinery.  */

enum aarch64_attr_opt_type
{
  aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
  aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
  aarch64_attr_enum,	/* Attribute sets an enum variable.  */
  aarch64_attr_custom	/* Attribute requires a custom handling function.  */
};

/* All the information needed to handle a target attribute.
   NAME is the name of the attribute.
   ATTR_TYPE specifies the type of behavior of the attribute as described
   in the definition of enum aarch64_attr_opt_type.
   ALLOW_NEG is true if the attribute supports a "no-" form.
   HANDLER is the function that takes the attribute string as an argument.
   It is needed only when the ATTR_TYPE is aarch64_attr_custom.
   OPT_NUM is the enum specifying the option that the attribute modifies.
   This is needed for attributes that mirror the behavior of a command-line
   option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
   aarch64_attr_enum.  */

struct aarch64_attribute_info
{
  const char *name;
  enum aarch64_attr_opt_type attr_type;
  bool allow_neg;
  bool (*handler) (const char *);
  enum opt_code opt_num;
};
19314 /* Handle the ARCH_STR argument to the arch= target attribute. */
19317 aarch64_handle_attr_arch (const char *str
)
19319 const struct processor
*tmp_arch
= NULL
;
19320 std::string invalid_extension
;
19321 aarch64_feature_flags tmp_flags
;
19322 enum aarch_parse_opt_result parse_res
19323 = aarch64_parse_arch (str
, &tmp_arch
, &tmp_flags
, &invalid_extension
);
19325 if (parse_res
== AARCH_PARSE_OK
)
19327 gcc_assert (tmp_arch
);
19328 selected_arch
= tmp_arch
->arch
;
19329 aarch64_set_asm_isa_flags (tmp_flags
| (aarch64_asm_isa_flags
19330 & AARCH64_FL_ISA_MODES
));
19336 case AARCH_PARSE_MISSING_ARG
:
19337 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19339 case AARCH_PARSE_INVALID_ARG
:
19340 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str
);
19341 aarch64_print_hint_for_arch (str
);
19343 case AARCH_PARSE_INVALID_FEATURE
:
19344 error ("invalid feature modifier %s of value %qs in "
19345 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19346 aarch64_print_hint_for_extensions (invalid_extension
);
19349 gcc_unreachable ();
19355 /* Handle the argument CPU_STR to the cpu= target attribute. */
19358 aarch64_handle_attr_cpu (const char *str
)
19360 const struct processor
*tmp_cpu
= NULL
;
19361 std::string invalid_extension
;
19362 aarch64_feature_flags tmp_flags
;
19363 enum aarch_parse_opt_result parse_res
19364 = aarch64_parse_cpu (str
, &tmp_cpu
, &tmp_flags
, &invalid_extension
);
19366 if (parse_res
== AARCH_PARSE_OK
)
19368 gcc_assert (tmp_cpu
);
19369 selected_tune
= tmp_cpu
->ident
;
19370 selected_arch
= tmp_cpu
->arch
;
19371 aarch64_set_asm_isa_flags (tmp_flags
| (aarch64_asm_isa_flags
19372 & AARCH64_FL_ISA_MODES
));
19378 case AARCH_PARSE_MISSING_ARG
:
19379 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19381 case AARCH_PARSE_INVALID_ARG
:
19382 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str
);
19383 aarch64_print_hint_for_core (str
);
19385 case AARCH_PARSE_INVALID_FEATURE
:
19386 error ("invalid feature modifier %qs of value %qs in "
19387 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19388 aarch64_print_hint_for_extensions (invalid_extension
);
19391 gcc_unreachable ();
19397 /* Handle the argument STR to the branch-protection= attribute. */
19400 aarch64_handle_attr_branch_protection (const char* str
)
19402 return aarch_validate_mbranch_protection (aarch64_branch_protect_types
, str
,
19403 "target(\"branch-protection=\")");
19406 /* Handle the argument STR to the tune= target attribute. */
19409 aarch64_handle_attr_tune (const char *str
)
19411 const struct processor
*tmp_tune
= NULL
;
19412 enum aarch_parse_opt_result parse_res
19413 = aarch64_parse_tune (str
, &tmp_tune
);
19415 if (parse_res
== AARCH_PARSE_OK
)
19417 gcc_assert (tmp_tune
);
19418 selected_tune
= tmp_tune
->ident
;
19424 case AARCH_PARSE_INVALID_ARG
:
19425 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str
);
19426 aarch64_print_hint_for_core (str
);
19429 gcc_unreachable ();
/* Parse an architecture extensions target attribute string specified in STR.
   For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
   if successful.  Update aarch64_isa_flags to reflect the ISA features
   that were requested.  */

static bool
aarch64_handle_attr_isa_flags (char *str)
{
19443 enum aarch_parse_opt_result parse_res
;
19444 auto isa_flags
= aarch64_asm_isa_flags
;
19446 /* We allow "+nothing" in the beginning to clear out all architectural
19447 features if the user wants to handpick specific features. */
19448 if (strncmp ("+nothing", str
, 8) == 0)
19450 isa_flags
&= AARCH64_FL_ISA_MODES
;
19454 std::string invalid_extension
;
19455 parse_res
= aarch64_parse_extension (str
, &isa_flags
, &invalid_extension
);
19457 if (parse_res
== AARCH_PARSE_OK
)
19459 aarch64_set_asm_isa_flags (isa_flags
);
19465 case AARCH_PARSE_MISSING_ARG
:
19466 error ("missing value in %<target()%> pragma or attribute");
19469 case AARCH_PARSE_INVALID_FEATURE
:
19470 error ("invalid feature modifier %qs of value %qs in "
19471 "%<target()%> pragma or attribute", invalid_extension
.c_str (), str
);
19475 gcc_unreachable ();
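/* Illustrative sketch only, not part of the implementation: the strings
   accepted by the function above are the bare extension form of the target
   attribute, for example (the function names below are hypothetical)

     __attribute__ ((target ("+crc+crypto")))
     int with_crypto (int x);

     __attribute__ ((target ("+nothing+simd")))
     int simd_only (int x);

   where "+nothing" first clears the architectural features and the later
   modifiers add back exactly the ones the user wants.  */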
/* The target attributes that we support.  On top of these we also support just
   ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
   handled explicitly in aarch64_process_one_target_attr.  */

static const struct aarch64_attribute_info aarch64_attributes[] =
{
  { "general-regs-only", aarch64_attr_mask, false, NULL,
     OPT_mgeneral_regs_only },
  { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_835769 },
  { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
     OPT_mfix_cortex_a53_843419 },
  { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
  { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
  { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
     OPT_momit_leaf_frame_pointer },
  { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
  { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
     OPT_march_ },
  { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
  { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
     OPT_mtune_ },
  { "branch-protection", aarch64_attr_custom, false,
     aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
  { "sign-return-address", aarch64_attr_enum, false, NULL,
     OPT_msign_return_address_ },
  { "outline-atomics", aarch64_attr_bool, true, NULL,
     OPT_moutline_atomics },
  { NULL, aarch64_attr_custom, false, NULL, OPT____ }
};
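/* A hedged example of how the table entries above surface in user code
   (the function names are hypothetical, not taken from this file):

     __attribute__ ((target ("arch=armv8.2-a+sve")))          // aarch64_attr_custom
     void for_sve (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  // uses the "no-" form
     void keep_frame_pointer (void);

     __attribute__ ((target ("cmodel=small")))                // aarch64_attr_enum
     void small_model (void);
*/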
19512 /* Parse ARG_STR which contains the definition of one target attribute.
19513 Show appropriate errors if any or return true if the attribute is valid. */
19516 aarch64_process_one_target_attr (char *arg_str
)
19518 bool invert
= false;
19520 size_t len
= strlen (arg_str
);
19524 error ("malformed %<target()%> pragma or attribute");
19528 auto_vec
<char, 32> buffer
;
19529 buffer
.safe_grow (len
+ 1);
19530 char *str_to_check
= buffer
.address ();
19531 memcpy (str_to_check
, arg_str
, len
+ 1);
  /* We have something like __attribute__ ((target ("+fp+nosimd"))).
     It is easier to detect and handle it explicitly here rather than going
     through the machinery for the rest of the target attributes in this
     function.  */
19537 if (*str_to_check
== '+')
19538 return aarch64_handle_attr_isa_flags (str_to_check
);
19540 if (len
> 3 && startswith (str_to_check
, "no-"))
19545 char *arg
= strchr (str_to_check
, '=');
19547 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19548 and point ARG to "foo". */
19554 const struct aarch64_attribute_info
*p_attr
;
19555 bool found
= false;
19556 for (p_attr
= aarch64_attributes
; p_attr
->name
; p_attr
++)
19558 /* If the names don't match up, or the user has given an argument
19559 to an attribute that doesn't accept one, or didn't give an argument
19560 to an attribute that expects one, fail to match. */
19561 if (strcmp (str_to_check
, p_attr
->name
) != 0)
19565 bool attr_need_arg_p
= p_attr
->attr_type
== aarch64_attr_custom
19566 || p_attr
->attr_type
== aarch64_attr_enum
;
19568 if (attr_need_arg_p
^ (arg
!= NULL
))
19570 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check
);
19574 /* If the name matches but the attribute does not allow "no-" versions
19575 then we can't match. */
19576 if (invert
&& !p_attr
->allow_neg
)
19578 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check
);
19582 switch (p_attr
->attr_type
)
19584 /* Has a custom handler registered.
19585 For example, cpu=, arch=, tune=. */
19586 case aarch64_attr_custom
:
19587 gcc_assert (p_attr
->handler
);
19588 if (!p_attr
->handler (arg
))
19592 /* Either set or unset a boolean option. */
19593 case aarch64_attr_bool
:
19595 struct cl_decoded_option decoded
;
19597 generate_option (p_attr
->opt_num
, NULL
, !invert
,
19598 CL_TARGET
, &decoded
);
19599 aarch64_handle_option (&global_options
, &global_options_set
,
19600 &decoded
, input_location
);
19603 /* Set or unset a bit in the target_flags. aarch64_handle_option
19604 should know what mask to apply given the option number. */
19605 case aarch64_attr_mask
:
19607 struct cl_decoded_option decoded
;
19608 /* We only need to specify the option number.
19609 aarch64_handle_option will know which mask to apply. */
19610 decoded
.opt_index
= p_attr
->opt_num
;
19611 decoded
.value
= !invert
;
19612 aarch64_handle_option (&global_options
, &global_options_set
,
19613 &decoded
, input_location
);
19616 /* Use the option setting machinery to set an option to an enum. */
19617 case aarch64_attr_enum
:
19622 valid
= opt_enum_arg_to_value (p_attr
->opt_num
, arg
,
19623 &value
, CL_TARGET
);
19626 set_option (&global_options
, NULL
, p_attr
->opt_num
, value
,
19627 NULL
, DK_UNSPECIFIED
, input_location
,
19632 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check
, arg
);
19637 gcc_unreachable ();
19641 /* If we reached here we either have found an attribute and validated
19642 it or didn't match any. If we matched an attribute but its arguments
19643 were malformed we will have returned false already. */
19647 /* Count how many times the character C appears in
19648 NULL-terminated string STR. */
19650 static unsigned int
19651 num_occurences_in_str (char c
, char *str
)
19653 unsigned int res
= 0;
19654 while (*str
!= '\0')
19665 /* Parse the tree in ARGS that contains the target attribute information
19666 and update the global target options space. */
19669 aarch64_process_target_attr (tree args
)
19671 if (TREE_CODE (args
) == TREE_LIST
)
19675 tree head
= TREE_VALUE (args
);
19678 if (!aarch64_process_target_attr (head
))
19681 args
= TREE_CHAIN (args
);
19687 if (TREE_CODE (args
) != STRING_CST
)
19689 error ("attribute %<target%> argument not a string");
19693 size_t len
= strlen (TREE_STRING_POINTER (args
));
19694 auto_vec
<char, 32> buffer
;
19695 buffer
.safe_grow (len
+ 1);
19696 char *str_to_check
= buffer
.address ();
19697 memcpy (str_to_check
, TREE_STRING_POINTER (args
), len
+ 1);
19701 error ("malformed %<target()%> pragma or attribute");
19705 /* Used to catch empty spaces between commas i.e.
19706 attribute ((target ("attr1,,attr2"))). */
19707 unsigned int num_commas
= num_occurences_in_str (',', str_to_check
);
19709 /* Handle multiple target attributes separated by ','. */
19710 char *token
= strtok_r (str_to_check
, ",", &str_to_check
);
19712 unsigned int num_attrs
= 0;
19716 if (!aarch64_process_one_target_attr (token
))
19718 /* Check if token is possibly an arch extension without
19720 aarch64_feature_flags isa_temp
= 0;
19721 auto with_plus
= std::string ("+") + token
;
19722 enum aarch_parse_opt_result ext_res
19723 = aarch64_parse_extension (with_plus
.c_str (), &isa_temp
, nullptr);
19725 if (ext_res
== AARCH_PARSE_OK
)
19726 error ("arch extension %<%s%> should be prefixed by %<+%>",
19729 error ("pragma or attribute %<target(\"%s\")%> is not valid", token
);
19733 token
= strtok_r (NULL
, ",", &str_to_check
);
19736 if (num_attrs
!= num_commas
+ 1)
19738 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args
));
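/* For illustration only (hypothetical declarations, not part of this file):
   several attributes can be combined in one string, separated by commas, and
   a bare extension name triggers the "should be prefixed by +" hint emitted
   above:

     __attribute__ ((target ("arch=armv8-a,strict-align")))
     void two_attrs (void);          // OK: two comma-separated attributes

     __attribute__ ((target ("sve")))
     void wrong (void);              // rejected, with a hint to write "+sve"
*/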
19745 static bool aarch64_process_target_version_attr (tree args
);
19747 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19748 process attribute ((target ("..."))). */
19751 aarch64_option_valid_attribute_p (tree fndecl
, tree
, tree args
, int)
19753 struct cl_target_option cur_target
;
19756 tree new_target
, new_optimize
;
19757 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
19759 /* If what we're processing is the current pragma string then the
19760 target option node is already stored in target_option_current_node
19761 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19762 having to re-parse the string. This is especially useful to keep
19763 arm_neon.h compile times down since that header contains a lot
19764 of intrinsics enclosed in pragmas. */
19765 if (!existing_target
&& args
== current_target_pragma
)
19767 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = target_option_current_node
;
19770 tree func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19773 = build_optimization_node (&global_options
, &global_options_set
);
19774 func_optimize
= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
);
19776 /* If the function changed the optimization levels as well as setting
19777 target options, start with the optimizations specified. */
19778 if (func_optimize
&& func_optimize
!= old_optimize
)
19779 cl_optimization_restore (&global_options
, &global_options_set
,
19780 TREE_OPTIMIZATION (func_optimize
));
19782 /* Save the current target options to restore at the end. */
19783 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
19785 /* If fndecl already has some target attributes applied to it, unpack
19786 them so that we add this attribute on top of them, rather than
19787 overwriting them. */
19788 if (existing_target
)
19790 struct cl_target_option
*existing_options
19791 = TREE_TARGET_OPTION (existing_target
);
19793 if (existing_options
)
19794 cl_target_option_restore (&global_options
, &global_options_set
,
19798 cl_target_option_restore (&global_options
, &global_options_set
,
19799 TREE_TARGET_OPTION (target_option_current_node
));
19801 ret
= aarch64_process_target_attr (args
);
19804 tree version_attr
= lookup_attribute ("target_version",
19805 DECL_ATTRIBUTES (fndecl
));
19806 if (version_attr
!= NULL_TREE
)
19808 /* Reapply any target_version attribute after target attribute.
19809 This should be equivalent to applying the target_version once
19810 after processing all target attributes. */
19811 tree version_args
= TREE_VALUE (version_attr
);
19812 ret
= aarch64_process_target_version_attr (version_args
);
19816 /* Set up any additional state. */
19819 aarch64_override_options_internal (&global_options
);
19820 new_target
= build_target_option_node (&global_options
,
19821 &global_options_set
);
19826 new_optimize
= build_optimization_node (&global_options
,
19827 &global_options_set
);
19831 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
19833 if (old_optimize
!= new_optimize
)
19834 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl
) = new_optimize
;
19837 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
19839 if (old_optimize
!= new_optimize
)
19840 cl_optimization_restore (&global_options
, &global_options_set
,
19841 TREE_OPTIMIZATION (old_optimize
));
19845 typedef unsigned long long aarch64_fmv_feature_mask
;
19850 aarch64_fmv_feature_mask feature_mask
;
19851 aarch64_feature_flags opt_flags
;
19852 } aarch64_fmv_feature_datum
;
19854 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19855 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19857 /* The "rdma" alias uses a different FEAT_NAME to avoid a duplicate
19858 feature_deps name. */
19859 #define FEAT_RDMA FEAT_RDM
/* FMV features are listed in priority order, to make it easier to sort target
   strings.  */
19863 static aarch64_fmv_feature_datum aarch64_fmv_feature_data
[] = {
19864 #include "config/aarch64/aarch64-option-extensions.def"
19867 /* Parse a function multiversioning feature string STR, as found in a
19868 target_version or target_clones attribute.
19870 If ISA_FLAGS is nonnull, then update it with the specified architecture
19871 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19872 representing the set of features explicitly specified in the feature string.
19873 Return an aarch_parse_opt_result describing the result.
19875 When the STR string contains an invalid or duplicate extension, a copy of
19876 the extension string is created and stored to INVALID_EXTENSION. */
19878 static enum aarch_parse_opt_result
19879 aarch64_parse_fmv_features (const char *str
, aarch64_feature_flags
*isa_flags
,
19880 aarch64_fmv_feature_mask
*feature_mask
,
19881 std::string
*invalid_extension
)
19884 *feature_mask
= 0ULL;
19886 if (strcmp (str
, "default") == 0)
19887 return AARCH_PARSE_OK
;
19889 while (str
!= NULL
&& *str
!= 0)
19894 ext
= strchr (str
, '+');
19899 len
= strlen (str
);
19902 return AARCH_PARSE_MISSING_ARG
;
19904 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
19906 for (i
= 0; i
< num_features
; i
++)
19908 if (strlen (aarch64_fmv_feature_data
[i
].name
) == len
19909 && strncmp (aarch64_fmv_feature_data
[i
].name
, str
, len
) == 0)
19912 *isa_flags
|= aarch64_fmv_feature_data
[i
].opt_flags
;
19915 auto old_feature_mask
= *feature_mask
;
19916 *feature_mask
|= aarch64_fmv_feature_data
[i
].feature_mask
;
19917 if (*feature_mask
== old_feature_mask
)
19919 /* Duplicate feature. */
19920 if (invalid_extension
)
19921 *invalid_extension
= std::string (str
, len
);
19922 return AARCH_PARSE_DUPLICATE_FEATURE
;
19929 if (i
== num_features
)
19931 /* Feature not found in list. */
19932 if (invalid_extension
)
19933 *invalid_extension
= std::string (str
, len
);
19934 return AARCH_PARSE_INVALID_FEATURE
;
19939 /* Skip over the next '+'. */
19943 return AARCH_PARSE_OK
;
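/* Illustrative sketch of the feature strings parsed above, as they appear in
   user code (the function name "f" is hypothetical):

     __attribute__ ((target_version ("default")))        int f (void);
     __attribute__ ((target_version ("sve2")))           int f (void);
     __attribute__ ((target_version ("dotprod+fp16")))   int f (void);

   "default" is accepted as-is, while "dotprod+fp16" is split on '+' and each
   name is looked up in aarch64_fmv_feature_data.  */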
19946 /* Parse the tree in ARGS that contains the target_version attribute
19947 information and update the global target options space. */
19950 aarch64_process_target_version_attr (tree args
)
19952 if (TREE_CODE (args
) == TREE_LIST
)
19954 if (TREE_CHAIN (args
))
19956 error ("attribute %<target_version%> has multiple values");
19959 args
= TREE_VALUE (args
);
19962 if (!args
|| TREE_CODE (args
) != STRING_CST
)
19964 error ("attribute %<target_version%> argument not a string");
19968 const char *str
= TREE_STRING_POINTER (args
);
19970 enum aarch_parse_opt_result parse_res
;
19971 auto isa_flags
= aarch64_asm_isa_flags
;
19973 std::string invalid_extension
;
19974 parse_res
= aarch64_parse_fmv_features (str
, &isa_flags
, NULL
,
19975 &invalid_extension
);
19977 if (parse_res
== AARCH_PARSE_OK
)
19979 aarch64_set_asm_isa_flags (isa_flags
);
19985 case AARCH_PARSE_MISSING_ARG
:
19986 error ("missing value in %<target_version%> attribute");
19989 case AARCH_PARSE_INVALID_FEATURE
:
19990 error ("invalid feature modifier %qs of value %qs in "
19991 "%<target_version%> attribute", invalid_extension
.c_str (),
19995 case AARCH_PARSE_DUPLICATE_FEATURE
:
19996 error ("duplicate feature modifier %qs of value %qs in "
19997 "%<target_version%> attribute", invalid_extension
.c_str (),
20002 gcc_unreachable ();
20008 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
20009 process attribute ((target_version ("..."))). */
20012 aarch64_option_valid_version_attribute_p (tree fndecl
, tree
, tree args
, int)
20014 struct cl_target_option cur_target
;
20017 tree existing_target
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
20019 /* Save the current target options to restore at the end. */
20020 cl_target_option_save (&cur_target
, &global_options
, &global_options_set
);
20022 /* If fndecl already has some target attributes applied to it, unpack
20023 them so that we add this attribute on top of them, rather than
20024 overwriting them. */
20025 if (existing_target
)
20027 struct cl_target_option
*existing_options
20028 = TREE_TARGET_OPTION (existing_target
);
20030 if (existing_options
)
20031 cl_target_option_restore (&global_options
, &global_options_set
,
20035 cl_target_option_restore (&global_options
, &global_options_set
,
20036 TREE_TARGET_OPTION (target_option_current_node
));
20038 ret
= aarch64_process_target_version_attr (args
);
20040 /* Set up any additional state. */
20043 aarch64_override_options_internal (&global_options
);
20044 new_target
= build_target_option_node (&global_options
,
20045 &global_options_set
);
20051 DECL_FUNCTION_SPECIFIC_TARGET (fndecl
) = new_target
;
20053 cl_target_option_restore (&global_options
, &global_options_set
, &cur_target
);
20058 /* This parses the attribute arguments to target_version in DECL and the
20059 feature mask required to select those targets. No adjustments are made to
20060 add or remove redundant feature requirements. */
20062 static aarch64_fmv_feature_mask
20063 get_feature_mask_for_version (tree decl
)
20065 tree version_attr
= lookup_attribute ("target_version",
20066 DECL_ATTRIBUTES (decl
));
20067 if (version_attr
== NULL
)
20070 const char *version_string
= TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
20072 enum aarch_parse_opt_result parse_res
;
20073 aarch64_fmv_feature_mask feature_mask
;
20075 parse_res
= aarch64_parse_fmv_features (version_string
, NULL
, &feature_mask
,
20078 /* We should have detected any errors before getting here. */
20079 gcc_assert (parse_res
== AARCH_PARSE_OK
);
20081 return feature_mask
;
20084 /* Compare priorities of two feature masks. Return:
20085 1: mask1 is higher priority
20086 -1: mask2 is higher priority
20087 0: masks are equal. */
20090 compare_feature_masks (aarch64_fmv_feature_mask mask1
,
20091 aarch64_fmv_feature_mask mask2
)
20093 int pop1
= popcount_hwi (mask1
);
20094 int pop2
= popcount_hwi (mask2
);
20100 auto diff_mask
= mask1
^ mask2
;
20101 if (diff_mask
== 0ULL)
20103 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20104 for (int i
= num_features
- 1; i
>= 0; i
--)
20106 auto bit_mask
= aarch64_fmv_feature_data
[i
].feature_mask
;
20107 if (diff_mask
& bit_mask
)
20108 return (mask1
& bit_mask
) ? 1 : -1;
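/* A small worked example of the rule implemented above (the masks are
   hypothetical): the version with more feature bits set wins, and ties are
   broken by the highest-priority feature in which the two masks differ.

     mask1 = 0b0110, mask2 = 0b0001  -> mask1 wins (two features versus one)
     mask1 = 0b0101, mask2 = 0b0011  -> same popcount; diff_mask = 0b0110, so
					the higher-priority differing bit
					decides which mask is preferred.  */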
20113 /* Compare priorities of two version decls. */
20116 aarch64_compare_version_priority (tree decl1
, tree decl2
)
20118 auto mask1
= get_feature_mask_for_version (decl1
);
20119 auto mask2
= get_feature_mask_for_version (decl2
);
20121 return compare_feature_masks (mask1
, mask2
);
20124 /* Build the struct __ifunc_arg_t type:
20126 struct __ifunc_arg_t
20128 unsigned long _size; // Size of the struct, so it can grow.
20129 unsigned long _hwcap;
20130 unsigned long _hwcap2;
20135 build_ifunc_arg_type ()
20137 tree ifunc_arg_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20138 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20139 get_identifier ("_size"),
20140 long_unsigned_type_node
);
20141 tree field2
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20142 get_identifier ("_hwcap"),
20143 long_unsigned_type_node
);
20144 tree field3
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20145 get_identifier ("_hwcap2"),
20146 long_unsigned_type_node
);
20148 DECL_FIELD_CONTEXT (field1
) = ifunc_arg_type
;
20149 DECL_FIELD_CONTEXT (field2
) = ifunc_arg_type
;
20150 DECL_FIELD_CONTEXT (field3
) = ifunc_arg_type
;
20152 TYPE_FIELDS (ifunc_arg_type
) = field1
;
20153 DECL_CHAIN (field1
) = field2
;
20154 DECL_CHAIN (field2
) = field3
;
20156 layout_type (ifunc_arg_type
);
20158 tree const_type
= build_qualified_type (ifunc_arg_type
, TYPE_QUAL_CONST
);
20159 tree pointer_type
= build_pointer_type (const_type
);
20161 return pointer_type
;
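/* For illustration only: seen from user-level C, the type built above and the
   resolver signature that consumes it (see the resolver creation code further
   down) look roughly like this; the resolver name is hypothetical:

     typedef struct __ifunc_arg_t
     {
       unsigned long _size;    // size of the struct, so it can grow
       unsigned long _hwcap;
       unsigned long _hwcap2;
     } __ifunc_arg_t;

     void *foo_resolver (uint64_t hwcap, const __ifunc_arg_t *arg);
*/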
20164 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20168 aarch64_mangle_decl_assembler_name (tree decl
, tree id
)
20170 /* For function version, add the target suffix to the assembler name. */
20171 if (TREE_CODE (decl
) == FUNCTION_DECL
20172 && DECL_FUNCTION_VERSIONED (decl
))
20174 aarch64_fmv_feature_mask feature_mask
= get_feature_mask_for_version (decl
);
20176 std::string name
= IDENTIFIER_POINTER (id
);
20178 /* For the default version, append ".default". */
20179 if (feature_mask
== 0ULL)
20181 name
+= ".default";
20182 return get_identifier (name
.c_str());
20187 int num_features
= ARRAY_SIZE (aarch64_fmv_feature_data
);
20188 for (int i
= 0; i
< num_features
; i
++)
20190 if (feature_mask
& aarch64_fmv_feature_data
[i
].feature_mask
)
20193 name
+= aarch64_fmv_feature_data
[i
].name
;
20197 if (DECL_ASSEMBLER_NAME_SET_P (decl
))
20198 SET_DECL_RTL (decl
, NULL
);
20200 id
= get_identifier (name
.c_str());
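/* Hedged example of the resulting assembler names ("foo" is hypothetical and
   the exact suffix spelling follows the ACLE FMV mangling rules rather than
   anything stated here):

     __attribute__ ((target_version ("default")))  int foo (void);  // foo.default
     __attribute__ ((target_version ("sve2")))     int foo (void);  // roughly foo._Msve2

   i.e. the default version gets a ".default" suffix and every other version
   gets a suffix built from the names of its selected features.  */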
20205 /* Return an identifier for the base assembler name of a versioned function.
20206 This is computed by taking the default version's assembler name, and
20207 stripping off the ".default" suffix if it's already been appended. */
20210 get_suffixed_assembler_name (tree default_decl
, const char *suffix
)
20212 std::string name
= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (default_decl
));
20214 auto size
= name
.size ();
20215 if (size
>= 8 && name
.compare (size
- 8, 8, ".default") == 0)
20216 name
.resize (size
- 8);
20218 return get_identifier (name
.c_str());
20221 /* Make the resolver function decl to dispatch the versions of
20222 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
20223 ifunc alias that will point to the created resolver. Create an
20224 empty basic block in the resolver and store the pointer in
20225 EMPTY_BB. Return the decl of the resolver function. */
20228 make_resolver_func (const tree default_decl
,
20229 const tree ifunc_alias_decl
,
20230 basic_block
*empty_bb
)
20232 tree decl
, type
, t
;
20234 /* Create resolver function name based on default_decl. We need to remove an
20235 existing ".default" suffix if this has already been appended. */
20236 tree decl_name
= get_suffixed_assembler_name (default_decl
, ".resolver");
20237 const char *resolver_name
= IDENTIFIER_POINTER (decl_name
);
20239 /* The resolver function should have signature
20240 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
20241 type
= build_function_type_list (ptr_type_node
,
20243 build_ifunc_arg_type (),
20246 decl
= build_fn_decl (resolver_name
, type
);
20247 SET_DECL_ASSEMBLER_NAME (decl
, decl_name
);
20249 DECL_NAME (decl
) = decl_name
;
20250 TREE_USED (decl
) = 1;
20251 DECL_ARTIFICIAL (decl
) = 1;
20252 DECL_IGNORED_P (decl
) = 1;
20253 TREE_PUBLIC (decl
) = 0;
20254 DECL_UNINLINABLE (decl
) = 1;
20256 /* Resolver is not external, body is generated. */
20257 DECL_EXTERNAL (decl
) = 0;
20258 DECL_EXTERNAL (ifunc_alias_decl
) = 0;
20260 DECL_CONTEXT (decl
) = NULL_TREE
;
20261 DECL_INITIAL (decl
) = make_node (BLOCK
);
20262 DECL_STATIC_CONSTRUCTOR (decl
) = 0;
20264 if (DECL_COMDAT_GROUP (default_decl
)
20265 || TREE_PUBLIC (default_decl
))
20267 /* In this case, each translation unit with a call to this
20268 versioned function will put out a resolver. Ensure it
20269 is comdat to keep just one copy. */
20270 DECL_COMDAT (decl
) = 1;
20271 make_decl_one_only (decl
, DECL_ASSEMBLER_NAME (decl
));
20274 TREE_PUBLIC (ifunc_alias_decl
) = 0;
20276 /* Build result decl and add to function_decl. */
20277 t
= build_decl (UNKNOWN_LOCATION
, RESULT_DECL
, NULL_TREE
, ptr_type_node
);
20278 DECL_CONTEXT (t
) = decl
;
20279 DECL_ARTIFICIAL (t
) = 1;
20280 DECL_IGNORED_P (t
) = 1;
20281 DECL_RESULT (decl
) = t
;
20283 /* Build parameter decls and add to function_decl. */
20284 tree arg1
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20285 get_identifier ("hwcap"),
20287 tree arg2
= build_decl (UNKNOWN_LOCATION
, PARM_DECL
,
20288 get_identifier ("arg"),
20289 build_ifunc_arg_type());
20290 DECL_CONTEXT (arg1
) = decl
;
20291 DECL_CONTEXT (arg2
) = decl
;
20292 DECL_ARTIFICIAL (arg1
) = 1;
20293 DECL_ARTIFICIAL (arg2
) = 1;
20294 DECL_IGNORED_P (arg1
) = 1;
20295 DECL_IGNORED_P (arg2
) = 1;
20296 DECL_ARG_TYPE (arg1
) = uint64_type_node
;
20297 DECL_ARG_TYPE (arg2
) = build_ifunc_arg_type ();
20298 DECL_ARGUMENTS (decl
) = arg1
;
20299 TREE_CHAIN (arg1
) = arg2
;
20301 gimplify_function_tree (decl
);
20302 push_cfun (DECL_STRUCT_FUNCTION (decl
));
20303 *empty_bb
= init_lowered_empty_function (decl
, false,
20304 profile_count::uninitialized ());
20306 cgraph_node::add_new_function (decl
, true);
20307 symtab
->call_cgraph_insertion_hooks (cgraph_node::get_create (decl
));
20311 gcc_assert (ifunc_alias_decl
!= NULL
);
20312 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
20313 DECL_ATTRIBUTES (ifunc_alias_decl
)
20314 = make_attribute ("ifunc", resolver_name
,
20315 DECL_ATTRIBUTES (ifunc_alias_decl
));
20317 /* Create the alias for dispatch to resolver here. */
20318 cgraph_node::create_same_body_alias (ifunc_alias_decl
, decl
);
20322 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
20323 to return a pointer to VERSION_DECL if all feature bits specified in
20324 FEATURE_MASK are not set in MASK_VAR. This function will be called during
20325 version dispatch to decide which function version to execute. It returns
20326 the basic block at the end, to which more conditions can be added. */
20328 add_condition_to_bb (tree function_decl
, tree version_decl
,
20329 aarch64_fmv_feature_mask feature_mask
,
20330 tree mask_var
, basic_block new_bb
)
20332 gimple
*return_stmt
;
20333 tree convert_expr
, result_var
;
20334 gimple
*convert_stmt
;
20335 gimple
*if_else_stmt
;
20337 basic_block bb1
, bb2
, bb3
;
20342 push_cfun (DECL_STRUCT_FUNCTION (function_decl
));
20344 gcc_assert (new_bb
!= NULL
);
20345 gseq
= bb_seq (new_bb
);
20347 convert_expr
= build1 (CONVERT_EXPR
, ptr_type_node
,
20348 build_fold_addr_expr (version_decl
));
20349 result_var
= create_tmp_var (ptr_type_node
);
20350 convert_stmt
= gimple_build_assign (result_var
, convert_expr
);
20351 return_stmt
= gimple_build_return (result_var
);
20353 if (feature_mask
== 0ULL)
20355 /* Default version. */
20356 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20357 gimple_seq_add_stmt (&gseq
, return_stmt
);
20358 set_bb_seq (new_bb
, gseq
);
20359 gimple_set_bb (convert_stmt
, new_bb
);
20360 gimple_set_bb (return_stmt
, new_bb
);
20365 tree and_expr_var
= create_tmp_var (long_long_unsigned_type_node
);
20366 tree and_expr
= build2 (BIT_AND_EXPR
,
20367 long_long_unsigned_type_node
,
20369 build_int_cst (long_long_unsigned_type_node
,
20371 gimple
*and_stmt
= gimple_build_assign (and_expr_var
, and_expr
);
20372 gimple_set_block (and_stmt
, DECL_INITIAL (function_decl
));
20373 gimple_set_bb (and_stmt
, new_bb
);
20374 gimple_seq_add_stmt (&gseq
, and_stmt
);
20376 tree zero_llu
= build_int_cst (long_long_unsigned_type_node
, 0);
20377 if_else_stmt
= gimple_build_cond (EQ_EXPR
, and_expr_var
, zero_llu
,
20378 NULL_TREE
, NULL_TREE
);
20379 gimple_set_block (if_else_stmt
, DECL_INITIAL (function_decl
));
20380 gimple_set_bb (if_else_stmt
, new_bb
);
20381 gimple_seq_add_stmt (&gseq
, if_else_stmt
);
20383 gimple_seq_add_stmt (&gseq
, convert_stmt
);
20384 gimple_seq_add_stmt (&gseq
, return_stmt
);
20385 set_bb_seq (new_bb
, gseq
);
20388 e12
= split_block (bb1
, if_else_stmt
);
20390 e12
->flags
&= ~EDGE_FALLTHRU
;
20391 e12
->flags
|= EDGE_TRUE_VALUE
;
20393 e23
= split_block (bb2
, return_stmt
);
20395 gimple_set_bb (convert_stmt
, bb2
);
20396 gimple_set_bb (return_stmt
, bb2
);
20399 make_edge (bb1
, bb3
, EDGE_FALSE_VALUE
);
20402 make_edge (bb2
, EXIT_BLOCK_PTR_FOR_FN (cfun
), 0);
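/* Roughly, each call to the function above appends a test of this shape to
   the resolver (illustrative C rather than the GIMPLE actually built; note
   that MASK_VAR already holds the complement of the CPU feature word):

     if ((mask_var & FEATURE_MASK) == 0)   // every required bit is present
       return (void *) &version_decl;

   falling through to the next, lower-priority version otherwise.  */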
20409 /* This function generates the dispatch function for
20410 multi-versioned functions. DISPATCH_DECL is the function which will
20411 contain the dispatch logic. FNDECLS are the function choices for
20412 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20413 in DISPATCH_DECL in which the dispatch code is generated. */
20416 dispatch_function_versions (tree dispatch_decl
,
20418 basic_block
*empty_bb
)
20420 gimple
*ifunc_cpu_init_stmt
;
20422 vec
<tree
> *fndecls
;
20424 gcc_assert (dispatch_decl
!= NULL
20425 && fndecls_p
!= NULL
20426 && empty_bb
!= NULL
);
20428 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl
));
20430 gseq
= bb_seq (*empty_bb
);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __init_cpu_features_resolver here.  */
20433 tree init_fn_type
= build_function_type_list (void_type_node
,
20434 long_unsigned_type_node
,
20435 build_ifunc_arg_type(),
20437 tree init_fn_id
= get_identifier ("__init_cpu_features_resolver");
20438 tree init_fn_decl
= build_decl (UNKNOWN_LOCATION
, FUNCTION_DECL
,
20439 init_fn_id
, init_fn_type
);
20440 tree arg1
= DECL_ARGUMENTS (dispatch_decl
);
20441 tree arg2
= TREE_CHAIN (arg1
);
20442 ifunc_cpu_init_stmt
= gimple_build_call (init_fn_decl
, 2, arg1
, arg2
);
20443 gimple_seq_add_stmt (&gseq
, ifunc_cpu_init_stmt
);
20444 gimple_set_bb (ifunc_cpu_init_stmt
, *empty_bb
);
20446 /* Build the struct type for __aarch64_cpu_features. */
20447 tree global_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
20448 tree field1
= build_decl (UNKNOWN_LOCATION
, FIELD_DECL
,
20449 get_identifier ("features"),
20450 long_long_unsigned_type_node
);
20451 DECL_FIELD_CONTEXT (field1
) = global_type
;
20452 TYPE_FIELDS (global_type
) = field1
;
20453 layout_type (global_type
);
20455 tree global_var
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
20456 get_identifier ("__aarch64_cpu_features"),
20458 DECL_EXTERNAL (global_var
) = 1;
20459 tree mask_var
= create_tmp_var (long_long_unsigned_type_node
);
20461 tree component_expr
= build3 (COMPONENT_REF
, long_long_unsigned_type_node
,
20462 global_var
, field1
, NULL_TREE
);
20463 gimple
*component_stmt
= gimple_build_assign (mask_var
, component_expr
);
20464 gimple_set_block (component_stmt
, DECL_INITIAL (dispatch_decl
));
20465 gimple_set_bb (component_stmt
, *empty_bb
);
20466 gimple_seq_add_stmt (&gseq
, component_stmt
);
20468 tree not_expr
= build1 (BIT_NOT_EXPR
, long_long_unsigned_type_node
, mask_var
);
20469 gimple
*not_stmt
= gimple_build_assign (mask_var
, not_expr
);
20470 gimple_set_block (not_stmt
, DECL_INITIAL (dispatch_decl
));
20471 gimple_set_bb (not_stmt
, *empty_bb
);
20472 gimple_seq_add_stmt (&gseq
, not_stmt
);
20474 set_bb_seq (*empty_bb
, gseq
);
20478 /* fndecls_p is actually a vector. */
20479 fndecls
= static_cast<vec
<tree
> *> (fndecls_p
);
20481 /* At least one more version other than the default. */
20482 unsigned int num_versions
= fndecls
->length ();
20483 gcc_assert (num_versions
>= 2);
20485 struct function_version_info
20488 aarch64_fmv_feature_mask feature_mask
;
20489 } *function_versions
;
20491 function_versions
= (struct function_version_info
*)
20492 XNEWVEC (struct function_version_info
, (num_versions
));
20494 unsigned int actual_versions
= 0;
20496 for (tree version_decl
: *fndecls
)
20498 aarch64_fmv_feature_mask feature_mask
;
20499 /* Get attribute string, parse it and find the right features. */
20500 feature_mask
= get_feature_mask_for_version (version_decl
);
20501 function_versions
[actual_versions
].version_decl
= version_decl
;
20502 function_versions
[actual_versions
].feature_mask
= feature_mask
;
20506 auto compare_feature_version_info
= [](const void *p1
, const void *p2
) {
20507 const function_version_info v1
= *(const function_version_info
*)p1
;
20508 const function_version_info v2
= *(const function_version_info
*)p2
;
20509 return - compare_feature_masks (v1
.feature_mask
, v2
.feature_mask
);
20512 /* Sort the versions according to descending order of dispatch priority. */
20513 qsort (function_versions
, actual_versions
,
20514 sizeof (struct function_version_info
), compare_feature_version_info
);
20516 for (unsigned int i
= 0; i
< actual_versions
; ++i
)
20517 *empty_bb
= add_condition_to_bb (dispatch_decl
,
20518 function_versions
[i
].version_decl
,
20519 function_versions
[i
].feature_mask
,
20523 free (function_versions
);
20527 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20530 aarch64_generate_version_dispatcher_body (void *node_p
)
20532 tree resolver_decl
;
20533 basic_block empty_bb
;
20534 tree default_ver_decl
;
20535 struct cgraph_node
*versn
;
20536 struct cgraph_node
*node
;
20538 struct cgraph_function_version_info
*node_version_info
= NULL
;
20539 struct cgraph_function_version_info
*versn_info
= NULL
;
20541 node
= (cgraph_node
*)node_p
;
20543 node_version_info
= node
->function_version ();
20544 gcc_assert (node
->dispatcher_function
20545 && node_version_info
!= NULL
);
20547 if (node_version_info
->dispatcher_resolver
)
20548 return node_version_info
->dispatcher_resolver
;
20550 /* The first version in the chain corresponds to the default version. */
20551 default_ver_decl
= node_version_info
->next
->this_node
->decl
;
20553 /* node is going to be an alias, so remove the finalized bit. */
20554 node
->definition
= false;
20556 resolver_decl
= make_resolver_func (default_ver_decl
,
20557 node
->decl
, &empty_bb
);
20559 node_version_info
->dispatcher_resolver
= resolver_decl
;
20561 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl
));
20563 auto_vec
<tree
, 2> fn_ver_vec
;
20565 for (versn_info
= node_version_info
->next
; versn_info
;
20566 versn_info
= versn_info
->next
)
20568 versn
= versn_info
->this_node
;
20569 /* Check for virtual functions here again, as by this time it should
20570 have been determined if this function needs a vtable index or
20571 not. This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
20574 if (DECL_VINDEX (versn
->decl
))
20575 sorry ("virtual function multiversioning not supported");
20577 fn_ver_vec
.safe_push (versn
->decl
);
20580 dispatch_function_versions (resolver_decl
, &fn_ver_vec
, &empty_bb
);
20581 cgraph_edge::rebuild_edges ();
20584 /* Fix up symbol names. First we need to obtain the base name, which may
20585 have already been mangled. */
20586 tree base_name
= get_suffixed_assembler_name (default_ver_decl
, "");
20588 /* We need to redo the version mangling on the non-default versions for the
20589 target_clones case. Redoing the mangling for the target_version case is
20590 redundant but does no harm. We need to skip the default version, because
20591 expand_clones will append ".default" later; fortunately that suffix is the
20592 one we want anyway. */
20593 for (versn_info
= node_version_info
->next
->next
; versn_info
;
20594 versn_info
= versn_info
->next
)
20596 tree version_decl
= versn_info
->this_node
->decl
;
20597 tree name
= aarch64_mangle_decl_assembler_name (version_decl
,
20599 symtab
->change_decl_assembler_name (version_decl
, name
);
20602 /* We also need to use the base name for the ifunc declaration. */
20603 symtab
->change_decl_assembler_name (node
->decl
, base_name
);
20605 return resolver_decl
;
20608 /* Make a dispatcher declaration for the multi-versioned function DECL.
20609 Calls to DECL function will be replaced with calls to the dispatcher
20610 by the front-end. Returns the decl of the dispatcher function. */
20613 aarch64_get_function_versions_dispatcher (void *decl
)
20615 tree fn
= (tree
) decl
;
20616 struct cgraph_node
*node
= NULL
;
20617 struct cgraph_node
*default_node
= NULL
;
20618 struct cgraph_function_version_info
*node_v
= NULL
;
20619 struct cgraph_function_version_info
*first_v
= NULL
;
20621 tree dispatch_decl
= NULL
;
20623 struct cgraph_function_version_info
*default_version_info
= NULL
;
20625 gcc_assert (fn
!= NULL
&& DECL_FUNCTION_VERSIONED (fn
));
20627 node
= cgraph_node::get (fn
);
20628 gcc_assert (node
!= NULL
);
20630 node_v
= node
->function_version ();
20631 gcc_assert (node_v
!= NULL
);
20633 if (node_v
->dispatcher_resolver
!= NULL
)
20634 return node_v
->dispatcher_resolver
;
20636 /* Find the default version and make it the first node. */
20638 /* Go to the beginning of the chain. */
20639 while (first_v
->prev
!= NULL
)
20640 first_v
= first_v
->prev
;
20641 default_version_info
= first_v
;
20642 while (default_version_info
!= NULL
)
20644 if (get_feature_mask_for_version
20645 (default_version_info
->this_node
->decl
) == 0ULL)
20647 default_version_info
= default_version_info
->next
;
20650 /* If there is no default node, just return NULL. */
20651 if (default_version_info
== NULL
)
20654 /* Make default info the first node. */
20655 if (first_v
!= default_version_info
)
20657 default_version_info
->prev
->next
= default_version_info
->next
;
20658 if (default_version_info
->next
)
20659 default_version_info
->next
->prev
= default_version_info
->prev
;
20660 first_v
->prev
= default_version_info
;
20661 default_version_info
->next
= first_v
;
20662 default_version_info
->prev
= NULL
;
20665 default_node
= default_version_info
->this_node
;
20667 if (targetm
.has_ifunc_p ())
20669 struct cgraph_function_version_info
*it_v
= NULL
;
20670 struct cgraph_node
*dispatcher_node
= NULL
;
20671 struct cgraph_function_version_info
*dispatcher_version_info
= NULL
;
20673 /* Right now, the dispatching is done via ifunc. */
20674 dispatch_decl
= make_dispatcher_decl (default_node
->decl
);
20675 TREE_NOTHROW (dispatch_decl
) = TREE_NOTHROW (fn
);
20677 dispatcher_node
= cgraph_node::get_create (dispatch_decl
);
20678 gcc_assert (dispatcher_node
!= NULL
);
20679 dispatcher_node
->dispatcher_function
= 1;
20680 dispatcher_version_info
20681 = dispatcher_node
->insert_new_function_version ();
20682 dispatcher_version_info
->next
= default_version_info
;
20683 dispatcher_node
->definition
= 1;
20685 /* Set the dispatcher for all the versions. */
20686 it_v
= default_version_info
;
20687 while (it_v
!= NULL
)
20689 it_v
->dispatcher_resolver
= dispatch_decl
;
20695 error_at (DECL_SOURCE_LOCATION (default_node
->decl
),
20696 "multiversioning needs %<ifunc%> which is not supported "
20700 return dispatch_decl
;
20703 /* This function returns true if FN1 and FN2 are versions of the same function,
20704 that is, the target_version attributes of the function decls are different.
20705 This assumes that FN1 and FN2 have the same signature. */
20708 aarch64_common_function_versions (tree fn1
, tree fn2
)
20710 if (TREE_CODE (fn1
) != FUNCTION_DECL
20711 || TREE_CODE (fn2
) != FUNCTION_DECL
)
20714 return (aarch64_compare_version_priority (fn1
, fn2
) != 0);
20717 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20718 rather than an opt-in list. */
20721 aarch64_function_attribute_inlinable_p (const_tree fndecl
)
20723 /* A function that has local SME state cannot be inlined into its caller,
20724 since we only support managing PSTATE.ZA switches at function scope. */
20725 return (!aarch64_fndecl_has_new_state (fndecl
, "za")
20726 && !aarch64_fndecl_has_new_state (fndecl
, "zt0"));
20729 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20730 tri-bool options (yes, no, don't care) and the default value is
20731 DEF, determine whether to reject inlining. */
20734 aarch64_tribools_ok_for_inlining_p (int caller
, int callee
,
20735 int dont_care
, int def
)
20737 /* If the callee doesn't care, always allow inlining. */
20738 if (callee
== dont_care
)
20741 /* If the caller doesn't care, always allow inlining. */
20742 if (caller
== dont_care
)
20745 /* Otherwise, allow inlining if either the callee and caller values
20746 agree, or if the callee is using the default value. */
20747 return (callee
== caller
|| callee
== def
);
20750 /* Bit allocations for ipa_fn_summary::target_info. */
20752 /* Set if the function contains a stmt that relies on the function's
20753 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20754 Not meaningful for streaming-compatible functions. */
20755 constexpr auto AARCH64_IPA_SM_FIXED
= 1U << 0;
20757 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20759 constexpr auto AARCH64_IPA_CLOBBERS_ZA
= 1U << 1;
20760 constexpr auto AARCH64_IPA_CLOBBERS_ZT0
= 1U << 2;
20762 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20765 aarch64_need_ipa_fn_target_info (const_tree
, unsigned int &)
20767 /* We could in principle skip this for streaming-compatible functions
20768 that have ZA state, but that's a rare combination. */
20772 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20775 aarch64_update_ipa_fn_target_info (unsigned int &info
, const gimple
*stmt
)
20777 if (auto *ga
= dyn_cast
<const gasm
*> (stmt
))
20779 /* We don't know what the asm does, so conservatively assume that
20780 it requires the function's current SM mode. */
20781 info
|= AARCH64_IPA_SM_FIXED
;
20782 for (unsigned int i
= 0; i
< gimple_asm_nclobbers (ga
); ++i
)
20784 tree op
= gimple_asm_clobber_op (ga
, i
);
20785 const char *clobber
= TREE_STRING_POINTER (TREE_VALUE (op
));
20786 if (strcmp (clobber
, "za") == 0)
20787 info
|= AARCH64_IPA_CLOBBERS_ZA
;
20788 if (strcmp (clobber
, "zt0") == 0)
20789 info
|= AARCH64_IPA_CLOBBERS_ZT0
;
20792 if (auto *call
= dyn_cast
<const gcall
*> (stmt
))
20794 if (gimple_call_builtin_p (call
, BUILT_IN_MD
))
20796 /* The attributes on AArch64 builtins are supposed to be accurate.
20797 If the function isn't marked streaming-compatible then it
20798 needs whichever SM mode it selects. */
20799 tree decl
= gimple_call_fndecl (call
);
20800 if (aarch64_fndecl_pstate_sm (decl
) != 0)
20801 info
|= AARCH64_IPA_SM_FIXED
;
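/* For example (illustrative user code), an asm statement such as

     asm volatile ("" ::: "za");

   makes the loop above record AARCH64_IPA_CLOBBERS_ZA for the containing
   function, in addition to AARCH64_IPA_SM_FIXED, which is set for any asm.  */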
20807 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20808 to inline CALLEE into CALLER based on target-specific info.
20809 Make sure that the caller and callee have compatible architectural
20810 features. Then go through the other possible target attributes
20811 and see if they can block inlining. Try not to reject always_inline
20812 callees unless they are incompatible architecturally. */
20815 aarch64_can_inline_p (tree caller
, tree callee
)
20817 tree caller_tree
= DECL_FUNCTION_SPECIFIC_TARGET (caller
);
20818 tree callee_tree
= DECL_FUNCTION_SPECIFIC_TARGET (callee
);
20820 struct cl_target_option
*caller_opts
20821 = TREE_TARGET_OPTION (caller_tree
? caller_tree
20822 : target_option_default_node
);
20824 struct cl_target_option
*callee_opts
20825 = TREE_TARGET_OPTION (callee_tree
? callee_tree
20826 : target_option_default_node
);
20828 /* Callee's ISA flags should be a subset of the caller's. */
20829 auto caller_asm_isa
= (aarch64_get_asm_isa_flags (caller_opts
)
20830 & ~AARCH64_FL_ISA_MODES
);
20831 auto callee_asm_isa
= (aarch64_get_asm_isa_flags (callee_opts
)
20832 & ~AARCH64_FL_ISA_MODES
);
20833 if (callee_asm_isa
& ~caller_asm_isa
)
20836 auto caller_isa
= (aarch64_get_isa_flags (caller_opts
)
20837 & ~AARCH64_FL_ISA_MODES
);
20838 auto callee_isa
= (aarch64_get_isa_flags (callee_opts
)
20839 & ~AARCH64_FL_ISA_MODES
);
20840 if (callee_isa
& ~caller_isa
)
20843 /* Return true if the callee might have target_info property PROPERTY.
20844 The answer must be true unless we have positive proof to the contrary. */
20845 auto callee_has_property
= [&](unsigned int property
)
20847 if (ipa_fn_summaries
)
20848 if (auto *summary
= ipa_fn_summaries
->get (cgraph_node::get (callee
)))
20849 if (!(summary
->target_info
& property
))
20854 /* Streaming-compatible code can be inlined into functions with any
20855 PSTATE.SM mode. Otherwise the caller and callee must agree on
20856 PSTATE.SM mode, unless we can prove that the callee is naturally
20857 streaming-compatible. */
20858 auto caller_sm
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_SM_STATE
);
20859 auto callee_sm
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_SM_STATE
);
20861 && caller_sm
!= callee_sm
20862 && callee_has_property (AARCH64_IPA_SM_FIXED
))
20865 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20866 functions from being inlined into others. We also need to prevent
20867 inlining of shared-ZA functions into functions without ZA state,
20868 since this is an error condition.
20870 The only other problematic case for ZA is inlining a function that
20871 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20872 auto caller_za
= (aarch64_get_isa_flags (caller_opts
) & AARCH64_FL_ZA_ON
);
20873 auto callee_za
= (aarch64_get_isa_flags (callee_opts
) & AARCH64_FL_ZA_ON
);
20874 if (!caller_za
&& callee_za
)
20877 && aarch64_fndecl_has_state (caller
, "za")
20878 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA
))
20881 && aarch64_fndecl_has_state (caller
, "zt0")
20882 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0
))
  /* Allow non-strict aligned functions inlining into strict
     aligned ones.  */
20887 if ((TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)
20888 != TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
))
20889 && !(!TARGET_STRICT_ALIGN_P (callee_opts
->x_target_flags
)
20890 && TARGET_STRICT_ALIGN_P (caller_opts
->x_target_flags
)))
20893 bool always_inline
= lookup_attribute ("always_inline",
20894 DECL_ATTRIBUTES (callee
));
20896 /* If the architectural features match up and the callee is always_inline
20897 then the other attributes don't matter. */
20901 if (caller_opts
->x_aarch64_cmodel_var
20902 != callee_opts
->x_aarch64_cmodel_var
)
20905 if (caller_opts
->x_aarch64_tls_dialect
20906 != callee_opts
->x_aarch64_tls_dialect
)
20909 /* Honour explicit requests to workaround errata. */
20910 if (!aarch64_tribools_ok_for_inlining_p (
20911 caller_opts
->x_aarch64_fix_a53_err835769
,
20912 callee_opts
->x_aarch64_fix_a53_err835769
,
20913 2, TARGET_FIX_ERR_A53_835769_DEFAULT
))
20916 if (!aarch64_tribools_ok_for_inlining_p (
20917 caller_opts
->x_aarch64_fix_a53_err843419
,
20918 callee_opts
->x_aarch64_fix_a53_err843419
,
20919 2, TARGET_FIX_ERR_A53_843419
))
  /* If the user explicitly specified -momit-leaf-frame-pointer for the
     caller and callee and they don't match up, reject inlining.  */
20924 if (!aarch64_tribools_ok_for_inlining_p (
20925 caller_opts
->x_flag_omit_leaf_frame_pointer
,
20926 callee_opts
->x_flag_omit_leaf_frame_pointer
,
20930 /* If the callee has specific tuning overrides, respect them. */
20931 if (callee_opts
->x_aarch64_override_tune_string
!= NULL
20932 && caller_opts
->x_aarch64_override_tune_string
== NULL
)
20935 /* If the user specified tuning override strings for the
20936 caller and callee and they don't match up, reject inlining.
20937 We just do a string compare here, we don't analyze the meaning
20938 of the string, as it would be too costly for little gain. */
20939 if (callee_opts
->x_aarch64_override_tune_string
20940 && caller_opts
->x_aarch64_override_tune_string
20941 && (strcmp (callee_opts
->x_aarch64_override_tune_string
,
20942 caller_opts
->x_aarch64_override_tune_string
) != 0))
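/* An illustrative example of the ISA-subset rule near the top of the function
   above (hypothetical functions, not part of this file):

     __attribute__ ((target ("+sve"))) static int callee_sve (int x);
     int plain_caller (int x) { return callee_sve (x); }   // not inlinable:
							    // callee needs SVE

     static int plain_callee (int x);
     __attribute__ ((target ("+sve"))) int sve_caller (int x)
     { return plain_callee (x); }                           // may be inlined
*/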
/* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
   been used yet.  */

unsigned int
aarch64_tlsdesc_abi_id ()
20954 predefined_function_abi
&tlsdesc_abi
= function_abis
[ARM_PCS_TLSDESC
];
20955 if (!tlsdesc_abi
.initialized_p ())
20957 HARD_REG_SET full_reg_clobbers
;
20958 CLEAR_HARD_REG_SET (full_reg_clobbers
);
20959 SET_HARD_REG_BIT (full_reg_clobbers
, R0_REGNUM
);
20960 SET_HARD_REG_BIT (full_reg_clobbers
, CC_REGNUM
);
20961 for (int regno
= P0_REGNUM
; regno
<= P15_REGNUM
; ++regno
)
20962 SET_HARD_REG_BIT (full_reg_clobbers
, regno
);
20963 tlsdesc_abi
.initialize (ARM_PCS_TLSDESC
, full_reg_clobbers
);
20965 return ARM_PCS_TLSDESC
;
20968 /* Return true if SYMBOL_REF X binds locally. */
20971 aarch64_symbol_binds_local_p (const_rtx x
)
20973 return (SYMBOL_REF_DECL (x
)
20974 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
20975 : SYMBOL_REF_LOCAL_P (x
));
/* Return true if SYMBOL_REF X is thread-local.  */
20980 aarch64_tls_symbol_p (rtx x
)
20982 if (! TARGET_HAVE_TLS
)
20985 x
= strip_salt (x
);
20986 if (!SYMBOL_REF_P (x
))
20989 return SYMBOL_REF_TLS_MODEL (x
) != 0;
20992 /* Classify a TLS symbol into one of the TLS kinds. */
20993 enum aarch64_symbol_type
20994 aarch64_classify_tls_symbol (rtx x
)
20996 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
21000 case TLS_MODEL_GLOBAL_DYNAMIC
:
21001 case TLS_MODEL_LOCAL_DYNAMIC
:
21002 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
21004 case TLS_MODEL_INITIAL_EXEC
:
21005 switch (aarch64_cmodel
)
21007 case AARCH64_CMODEL_TINY
:
21008 case AARCH64_CMODEL_TINY_PIC
:
21009 return SYMBOL_TINY_TLSIE
;
21011 return SYMBOL_SMALL_TLSIE
;
21014 case TLS_MODEL_LOCAL_EXEC
:
21015 if (aarch64_tls_size
== 12)
21016 return SYMBOL_TLSLE12
;
21017 else if (aarch64_tls_size
== 24)
21018 return SYMBOL_TLSLE24
;
21019 else if (aarch64_tls_size
== 32)
21020 return SYMBOL_TLSLE32
;
21021 else if (aarch64_tls_size
== 48)
21022 return SYMBOL_TLSLE48
;
21024 gcc_unreachable ();
21026 case TLS_MODEL_EMULATED
:
21027 case TLS_MODEL_NONE
:
21028 return SYMBOL_FORCE_TO_MEM
;
21031 gcc_unreachable ();
21035 /* Return the correct method for accessing X + OFFSET, where X is either
21036 a SYMBOL_REF or LABEL_REF. */
21038 enum aarch64_symbol_type
21039 aarch64_classify_symbol (rtx x
, HOST_WIDE_INT offset
)
21041 x
= strip_salt (x
);
21043 if (LABEL_REF_P (x
))
21045 switch (aarch64_cmodel
)
21047 case AARCH64_CMODEL_LARGE
:
21048 return SYMBOL_FORCE_TO_MEM
;
21050 case AARCH64_CMODEL_TINY_PIC
:
21051 case AARCH64_CMODEL_TINY
:
21052 return SYMBOL_TINY_ABSOLUTE
;
21054 case AARCH64_CMODEL_SMALL_SPIC
:
21055 case AARCH64_CMODEL_SMALL_PIC
:
21056 case AARCH64_CMODEL_SMALL
:
21057 return SYMBOL_SMALL_ABSOLUTE
;
21060 gcc_unreachable ();
21064 if (SYMBOL_REF_P (x
))
21066 if (aarch64_tls_symbol_p (x
))
21067 return aarch64_classify_tls_symbol (x
);
21069 switch (aarch64_cmodel
)
21071 case AARCH64_CMODEL_TINY_PIC
:
21072 case AARCH64_CMODEL_TINY
:
21073 /* With -fPIC non-local symbols use the GOT. For orthogonality
21074 always use the GOT for extern weak symbols. */
21075 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
21076 && !aarch64_symbol_binds_local_p (x
))
21077 return SYMBOL_TINY_GOT
;
	  /* When we retrieve symbol + offset address, we have to make sure
	     the offset does not cause overflow of the final address.  But
	     we have no way of knowing the address of symbol at compile time
	     so we can't accurately say if the distance between the PC and
	     symbol + offset is outside the addressable range of +/-1MB in the
	     TINY code model.  So we limit the maximum offset to +/-64KB and
	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
	     If offset_within_block_p is true we allow larger offsets.  */
21087 if (!(IN_RANGE (offset
, -0x10000, 0x10000)
21088 || offset_within_block_p (x
, offset
)))
21089 return SYMBOL_FORCE_TO_MEM
;
21091 return SYMBOL_TINY_ABSOLUTE
;
21094 case AARCH64_CMODEL_SMALL_SPIC
:
21095 case AARCH64_CMODEL_SMALL_PIC
:
21096 case AARCH64_CMODEL_SMALL
:
21097 if ((flag_pic
|| SYMBOL_REF_WEAK (x
))
21098 && !aarch64_symbol_binds_local_p (x
))
21099 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
21100 ? SYMBOL_SMALL_GOT_28K
: SYMBOL_SMALL_GOT_4G
;
21102 /* Same reasoning as the tiny code model, but the offset cap here is
21103 1MB, allowing +/-3.9GB for the offset to the symbol. */
21104 if (!(IN_RANGE (offset
, -0x100000, 0x100000)
21105 || offset_within_block_p (x
, offset
)))
21106 return SYMBOL_FORCE_TO_MEM
;
21108 return SYMBOL_SMALL_ABSOLUTE
;
21110 case AARCH64_CMODEL_LARGE
:
21111 /* This is alright even in PIC code as the constant
21112 pool reference is always PC relative and within
21113 the same translation unit. */
21114 if (!aarch64_pcrelative_literal_loads
&& CONSTANT_POOL_ADDRESS_P (x
))
21115 return SYMBOL_SMALL_ABSOLUTE
;
21117 return SYMBOL_FORCE_TO_MEM
;
21120 gcc_unreachable ();
21124 /* By default push everything into the constant pool. */
21125 return SYMBOL_FORCE_TO_MEM
;
21129 aarch64_constant_address_p (rtx x
)
21131 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
21135 aarch64_legitimate_pic_operand_p (rtx x
)
21138 x
= strip_offset_and_salt (x
, &offset
);
21139 if (SYMBOL_REF_P (x
))
/* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
   that should be rematerialized rather than spilled.  */

static bool
aarch64_legitimate_constant_p (machine_mode mode, rtx x)
{
21151 /* Support CSE and rematerialization of common constants. */
21152 if (CONST_INT_P (x
)
21153 || CONST_DOUBLE_P (x
))
21156 /* Only accept variable-length vector constants if they can be
21159 ??? It would be possible (but complex) to handle rematerialization
21160 of other constants via secondary reloads. */
21161 if (!GET_MODE_SIZE (mode
).is_constant ())
21162 return aarch64_simd_valid_immediate (x
, NULL
);
21164 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
21165 least be forced to memory and loaded from there. */
21166 if (CONST_VECTOR_P (x
))
21167 return !targetm
.cannot_force_const_mem (mode
, x
);
21169 /* Do not allow vector struct mode constants for Advanced SIMD.
21170 We could support 0 and -1 easily, but they need support in
21171 aarch64-simd.md. */
21172 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
21173 if (vec_flags
== (VEC_ADVSIMD
| VEC_STRUCT
))
21176 if (GET_CODE (x
) == HIGH
)
21179 /* Accept polynomial constants that can be calculated by using the
21180 destination of a move as the sole temporary. Constants that
21181 require a second temporary cannot be rematerialized (they can't be
21182 forced to memory and also aren't legitimate constants). */
21184 if (poly_int_rtx_p (x
, &offset
))
21185 return aarch64_offset_temporaries (false, offset
) <= 1;
21187 /* If an offset is being added to something else, we need to allow the
21188 base to be moved into the destination register, meaning that there
21189 are no free temporaries for the offset. */
21190 x
= strip_offset_and_salt (x
, &offset
);
21191 if (!offset
.is_constant () && aarch64_offset_temporaries (true, offset
) > 0)
21194 /* Do not allow const (plus (anchor_symbol, const_int)). */
21195 if (maybe_ne (offset
, 0) && SYMBOL_REF_P (x
) && SYMBOL_REF_ANCHOR_P (x
))
21198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
21199 so spilling them is better than rematerialization. */
21200 if (SYMBOL_REF_P (x
) && !SYMBOL_REF_TLS_MODEL (x
))
21203 /* Label references are always constant. */
21204 if (LABEL_REF_P (x
))
21211 aarch64_load_tp (rtx target
)
21214 || GET_MODE (target
) != Pmode
21215 || !register_operand (target
, Pmode
))
21216 target
= gen_reg_rtx (Pmode
);
21218 /* Can return in any reg. */
21219 emit_insn (gen_aarch64_load_tp_hard (target
));
21223 /* On AAPCS systems, this is the "struct __va_list". */
21224 static GTY(()) tree va_list_type
;
/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
   Return the type to use as __builtin_va_list.

   AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:

   struct __va_list
   {
     void *__stack;
     void *__gr_top;
     void *__vr_top;
     int   __gr_offs;
     int   __vr_offs;
   };  */

static tree
aarch64_build_builtin_va_list (void)
{
  tree va_list_name;
  tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21246 /* Create the type. */
21247 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
21248 /* Give it the required name. */
21249 va_list_name
= build_decl (BUILTINS_LOCATION
,
21251 get_identifier ("__va_list"),
21253 DECL_ARTIFICIAL (va_list_name
) = 1;
21254 TYPE_NAME (va_list_type
) = va_list_name
;
21255 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
21257 /* Create the fields. */
21258 f_stack
= build_decl (BUILTINS_LOCATION
,
21259 FIELD_DECL
, get_identifier ("__stack"),
21261 f_grtop
= build_decl (BUILTINS_LOCATION
,
21262 FIELD_DECL
, get_identifier ("__gr_top"),
21264 f_vrtop
= build_decl (BUILTINS_LOCATION
,
21265 FIELD_DECL
, get_identifier ("__vr_top"),
21267 f_groff
= build_decl (BUILTINS_LOCATION
,
21268 FIELD_DECL
, get_identifier ("__gr_offs"),
21269 integer_type_node
);
21270 f_vroff
= build_decl (BUILTINS_LOCATION
,
21271 FIELD_DECL
, get_identifier ("__vr_offs"),
21272 integer_type_node
);
  /* Tell the tree-stdarg pass about our internal offset fields.
     NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
     purposes, to identify whether the code is updating the va_list internal
     offset fields in an irregular way.  */
21278 va_list_gpr_counter_field
= f_groff
;
21279 va_list_fpr_counter_field
= f_vroff
;
21281 DECL_ARTIFICIAL (f_stack
) = 1;
21282 DECL_ARTIFICIAL (f_grtop
) = 1;
21283 DECL_ARTIFICIAL (f_vrtop
) = 1;
21284 DECL_ARTIFICIAL (f_groff
) = 1;
21285 DECL_ARTIFICIAL (f_vroff
) = 1;
21287 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
21288 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
21289 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
21290 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
21291 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
21293 TYPE_FIELDS (va_list_type
) = f_stack
;
21294 DECL_CHAIN (f_stack
) = f_grtop
;
21295 DECL_CHAIN (f_grtop
) = f_vrtop
;
21296 DECL_CHAIN (f_vrtop
) = f_groff
;
21297 DECL_CHAIN (f_groff
) = f_vroff
;
21299 /* Compute its layout. */
21300 layout_type (va_list_type
);
21302 return va_list_type
;
21305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
21307 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
21309 const CUMULATIVE_ARGS
*cum
;
21310 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21311 tree stack
, grtop
, vrtop
, groff
, vroff
;
21313 int gr_save_area_size
= cfun
->va_list_gpr_size
;
21314 int vr_save_area_size
= cfun
->va_list_fpr_size
;
21317 cum
= &crtl
->args
.info
;
21318 if (cfun
->va_list_gpr_size
)
21319 gr_save_area_size
= MIN ((NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
,
21320 cfun
->va_list_gpr_size
);
21321 if (cfun
->va_list_fpr_size
)
21322 vr_save_area_size
= MIN ((NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
)
21323 * UNITS_PER_VREG
, cfun
->va_list_fpr_size
);
21327 gcc_assert (cum
->aapcs_nvrn
== 0);
21328 vr_save_area_size
= 0;
21331 f_stack
= TYPE_FIELDS (va_list_type_node
);
21332 f_grtop
= DECL_CHAIN (f_stack
);
21333 f_vrtop
= DECL_CHAIN (f_grtop
);
21334 f_groff
= DECL_CHAIN (f_vrtop
);
21335 f_vroff
= DECL_CHAIN (f_groff
);
21337 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
21339 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
21341 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
21343 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
21345 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
21348 /* Emit code to initialize STACK, which points to the next varargs stack
21349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
21350 by named arguments. STACK is 8-byte aligned. */
21351 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
21352 if (cum
->aapcs_stack_size
> 0)
21353 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
21354 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
21355 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21357 /* Emit code to initialize GRTOP, the top of the GR save area.
21358 virtual_incoming_args_rtx should have been 16 byte aligned. */
21359 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
21360 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
21361 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21363 /* Emit code to initialize VRTOP, the top of the VR save area.
21364 This address is gr_save_area_bytes below GRTOP, rounded
21365 down to the next 16-byte boundary. */
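  /* Illustrative sketch of the resulting layout (not part of the
     implementation); higher addresses are at the top:

	 ...anonymous stack arguments	<-- __stack
	 named stack arguments
					<-- __gr_top (virtual_incoming_args_rtx)
	 GP register save area
	 padding to a 16-byte boundary
					<-- __vr_top
	 FP/SIMD register save area

     with __gr_offs = -gr_save_area_size and __vr_offs = -vr_save_area_size,
     both counting up towards zero as anonymous arguments are consumed.  */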
21366 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
21367 vr_offset
= ROUND_UP (gr_save_area_size
,
21368 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21371 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
21372 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
21373 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21375 /* Emit code to initialize GROFF, the offset from GRTOP of the
21376 next GPR argument. */
21377 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
21378 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
21379 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21381 /* Likewise emit code to initialize VROFF, the offset from FTOP
21382 of the next VR argument. */
21383 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
21384 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
21385 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
21388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21391 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
21392 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
21396 bool is_ha
; /* is HFA or HVA. */
21397 bool dw_align
; /* double-word align. */
21398 machine_mode ag_mode
= VOIDmode
;
21402 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
21403 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
21404 HOST_WIDE_INT size
, rsize
, adjust
, align
;
21405 tree t
, u
, cond1
, cond2
;
21407 indirect_p
= pass_va_arg_by_reference (type
);
21409 type
= build_pointer_type (type
);
21411 mode
= TYPE_MODE (type
);
21413 f_stack
= TYPE_FIELDS (va_list_type_node
);
21414 f_grtop
= DECL_CHAIN (f_stack
);
21415 f_vrtop
= DECL_CHAIN (f_grtop
);
21416 f_groff
= DECL_CHAIN (f_vrtop
);
21417 f_vroff
= DECL_CHAIN (f_groff
);
21419 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
21420 f_stack
, NULL_TREE
);
21421 size
= int_size_in_bytes (type
);
21423 unsigned int abi_break_gcc_9
;
21424 unsigned int abi_break_gcc_13
;
21425 unsigned int abi_break_gcc_14
;
21427 = aarch64_function_arg_alignment (mode
, type
, &abi_break_gcc_9
,
21428 &abi_break_gcc_13
, &abi_break_gcc_14
)
21433 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
, &ag_mode
, &nregs
,
21436 /* No frontends can create types with variable-sized modes, so we
21437 shouldn't be asked to pass or return them. */
21438 unsigned int ag_size
= GET_MODE_SIZE (ag_mode
).to_constant ();
21440 /* TYPE passed in fp/simd registers. */
21442 aarch64_err_no_fpadvsimd (mode
);
21444 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
21445 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
21446 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
21447 unshare_expr (valist
), f_vroff
, NULL_TREE
);
21449 rsize
= nregs
* UNITS_PER_VREG
;
21453 if (BYTES_BIG_ENDIAN
&& ag_size
< UNITS_PER_VREG
)
21454 adjust
= UNITS_PER_VREG
- ag_size
;
21456 else if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21457 && size
< UNITS_PER_VREG
)
21459 adjust
= UNITS_PER_VREG
- size
;
21464 /* TYPE passed in general registers. */
21465 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
21466 unshare_expr (valist
), f_grtop
, NULL_TREE
);
21467 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
21468 unshare_expr (valist
), f_groff
, NULL_TREE
);
21469 rsize
= ROUND_UP (size
, UNITS_PER_WORD
);
21470 nregs
= rsize
/ UNITS_PER_WORD
;
21473 && abi_break_gcc_13
21475 && !bitint_or_aggr_of_bitint_p (type
))
21476 inform (input_location
, "parameter passing for argument of type "
21477 "%qT changed in GCC 13.1", type
);
21480 && abi_break_gcc_14
21481 && (abi_break_gcc_14
> 8 * BITS_PER_UNIT
) != (align
> 8)
21482 && !bitint_or_aggr_of_bitint_p (type
))
21483 inform (input_location
, "parameter passing for argument of type "
21484 "%qT changed in GCC 14.1", type
);
21488 if (abi_break_gcc_9
21490 && !bitint_or_aggr_of_bitint_p (type
))
21491 inform (input_location
, "parameter passing for argument of type "
21492 "%qT changed in GCC 9.1", type
);
21496 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21497 && size
< UNITS_PER_WORD
)
21499 adjust
= UNITS_PER_WORD
- size
;
21503 /* Get a local temporary for the field value. */
21504 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
21506 /* Emit code to branch if off >= 0. */
21507 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
21508 build_int_cst (TREE_TYPE (off
), 0));
21509 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
21513 /* Emit: offs = (offs + 15) & -16. */
21514 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21515 build_int_cst (TREE_TYPE (off
), 15));
21516 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
21517 build_int_cst (TREE_TYPE (off
), -16));
21518 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
21523 /* Update ap.__[g|v]r_offs */
21524 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
21525 build_int_cst (TREE_TYPE (off
), rsize
));
21526 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
21530 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21532 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21533 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
21534 build_int_cst (TREE_TYPE (f_off
), 0));
21535 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
21537 /* String up: make sure the assignment happens before the use. */
21538 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
21539 COND_EXPR_ELSE (cond1
) = t
;
21541 /* Prepare the trees handling the argument that is passed on the stack;
21542 the top level node will store in ON_STACK. */
21543 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
21546 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21547 t
= fold_build_pointer_plus_hwi (arg
, 15);
21548 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21549 build_int_cst (TREE_TYPE (t
), -16));
21550 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
21554 /* Advance ap.__stack */
21555 t
= fold_build_pointer_plus_hwi (arg
, size
+ 7);
21556 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
21557 build_int_cst (TREE_TYPE (t
), -8));
21558 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
21559 /* String up roundup and advance. */
21561 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
21562 /* String up with arg */
21563 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
21564 /* Big-endianness related address adjustment. */
21565 if (BLOCK_REG_PADDING (mode
, type
, 1) == PAD_DOWNWARD
21566 && size
< UNITS_PER_WORD
)
21568 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
21569 size_int (UNITS_PER_WORD
- size
));
21570 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
21573 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
21574 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
21576 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21579 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
21580 build_int_cst (TREE_TYPE (off
), adjust
));
21582 t
= fold_convert (sizetype
, t
);
21583 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
      /* type ha; // treat as "struct {ftype field[n];}"
	 ... [computing offs]
	 for (i = 0; i < nregs; ++i, offs += 16)
	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
	 return ha;  */
21593 tree tmp_ha
, field_t
, field_ptr_t
;
21595 /* Declare a local variable. */
21596 tmp_ha
= create_tmp_var_raw (type
, "ha");
21597 gimple_add_tmp_var (tmp_ha
);
21599 /* Establish the base type. */
21603 field_t
= float_type_node
;
21604 field_ptr_t
= float_ptr_type_node
;
21607 field_t
= double_type_node
;
21608 field_ptr_t
= double_ptr_type_node
;
21611 field_t
= long_double_type_node
;
21612 field_ptr_t
= long_double_ptr_type_node
;
21615 field_t
= dfloat32_type_node
;
21616 field_ptr_t
= build_pointer_type (dfloat32_type_node
);
21619 field_t
= dfloat64_type_node
;
21620 field_ptr_t
= build_pointer_type (dfloat64_type_node
);
21623 field_t
= dfloat128_type_node
;
21624 field_ptr_t
= build_pointer_type (dfloat128_type_node
);
21627 field_t
= aarch64_fp16_type_node
;
21628 field_ptr_t
= aarch64_fp16_ptr_type_node
;
21631 field_t
= bfloat16_type_node
;
21632 field_ptr_t
= aarch64_bf16_ptr_type_node
;
21637 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
21638 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
21639 field_ptr_t
= build_pointer_type (field_t
);
21646 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
21647 TREE_ADDRESSABLE (tmp_ha
) = 1;
21648 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
21650 t
= fold_convert (field_ptr_t
, addr
);
21651 t
= build2 (MODIFY_EXPR
, field_t
,
21652 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
21653 build1 (INDIRECT_REF
, field_t
, t
));
21655 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21656 for (i
= 1; i
< nregs
; ++i
)
21658 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
21659 u
= fold_convert (field_ptr_t
, addr
);
21660 u
= build2 (MODIFY_EXPR
, field_t
,
21661 build2 (MEM_REF
, field_t
, tmp_ha
,
21662 build_int_cst (field_ptr_t
,
21664 int_size_in_bytes (field_t
)))),
21665 build1 (INDIRECT_REF
, field_t
, u
));
21666 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
21669 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
21670 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
21673 COND_EXPR_ELSE (cond2
) = t
;
21674 addr
= fold_convert (build_pointer_type (type
), cond1
);
21675 addr
= build_va_arg_indirect_ref (addr
);
21678 addr
= build_va_arg_indirect_ref (addr
);
21683 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21686 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
,
21687 const function_arg_info
&arg
,
21688 int *pretend_size ATTRIBUTE_UNUSED
, int no_rtl
)
21690 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
21691 CUMULATIVE_ARGS local_cum
;
21692 int gr_saved
= cfun
->va_list_gpr_size
;
21693 int vr_saved
= cfun
->va_list_fpr_size
;
21695 /* The caller has advanced CUM up to, but not beyond, the last named
21696 argument. Advance a local copy of CUM past the last "real" named
21697 argument, to find out how many registers are left over. */
21699 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl
)))
21700 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), arg
);
  /* Find out how many registers we need to save.
     Honor the tree-stdarg analysis results.  */
21704 if (cfun
->va_list_gpr_size
)
21705 gr_saved
= MIN (NUM_ARG_REGS
- local_cum
.aapcs_ncrn
,
21706 cfun
->va_list_gpr_size
/ UNITS_PER_WORD
);
21707 if (cfun
->va_list_fpr_size
)
21708 vr_saved
= MIN (NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
,
21709 cfun
->va_list_fpr_size
/ UNITS_PER_VREG
);
21713 gcc_assert (local_cum
.aapcs_nvrn
== 0);
21723 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21724 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
21725 - gr_saved
* UNITS_PER_WORD
);
21726 mem
= gen_frame_mem (BLKmode
, ptr
);
21727 set_mem_alias_set (mem
, get_varargs_alias_set ());
21729 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
21734 /* We can't use move_block_from_reg, because it will use
21735 the wrong mode, storing D regs only. */
21736 machine_mode mode
= TImode
;
21737 int off
, i
, vr_start
;
21739 /* Set OFF to the offset from virtual_incoming_args_rtx of
21740 the first vector register. The VR save area lies below
21741 the GR one, and is aligned to 16 bytes. */
21742 off
= -ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21743 STACK_BOUNDARY
/ BITS_PER_UNIT
);
21744 off
-= vr_saved
* UNITS_PER_VREG
;
21746 vr_start
= V0_REGNUM
+ local_cum
.aapcs_nvrn
;
21747 for (i
= 0; i
< vr_saved
; ++i
)
21751 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
21752 mem
= gen_frame_mem (mode
, ptr
);
21753 set_mem_alias_set (mem
, get_varargs_alias_set ());
21754 aarch64_emit_move (mem
, gen_rtx_REG (mode
, vr_start
+ i
));
21755 off
+= UNITS_PER_VREG
;
21760 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21761 any complication of having crtl->args.pretend_args_size changed. */
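  /* Worked example (illustrative only): for a hypothetical prototype such
     as "void f (int a, int b, int c, double d, ...)" we have
     aapcs_ncrn == 3 and aapcs_nvrn == 1, so gr_saved == 5 and
     vr_saved == 7, giving

       saved_varargs_size = ROUND_UP (5 * 8, 16) + 7 * 16 = 48 + 112 = 160

     bytes, assuming the tree-stdarg limits do not shrink the areas.  */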
21762 cfun
->machine
->frame
.saved_varargs_size
21763 = (ROUND_UP (gr_saved
* UNITS_PER_WORD
,
21764 STACK_BOUNDARY
/ BITS_PER_UNIT
)
21765 + vr_saved
* UNITS_PER_VREG
);
21769 aarch64_conditional_register_usage (void)
21774 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
21777 call_used_regs
[i
] = 1;
21778 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21782 for (i
= P0_REGNUM
; i
<= P15_REGNUM
; i
++)
21785 call_used_regs
[i
] = 1;
21788 /* Only allow these registers to be accessed via special patterns. */
21789 CLEAR_HARD_REG_BIT (operand_reg_set
, VG_REGNUM
);
21790 CLEAR_HARD_REG_BIT (operand_reg_set
, FFR_REGNUM
);
21791 CLEAR_HARD_REG_BIT (operand_reg_set
, FFRT_REGNUM
);
21792 for (int i
= FIRST_FAKE_REGNUM
; i
<= LAST_FAKE_REGNUM
; ++i
)
21793 CLEAR_HARD_REG_BIT (operand_reg_set
, i
);
21795 /* When tracking speculation, we need a couple of call-clobbered registers
21796 to track the speculation state. It would be nice to just use
21797 IP0 and IP1, but currently there are numerous places that just
21798 assume these registers are free for other uses (eg pointer
21799 authentication). */
21800 if (aarch64_track_speculation
)
21802 fixed_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21803 call_used_regs
[SPECULATION_TRACKER_REGNUM
] = 1;
21804 fixed_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21805 call_used_regs
[SPECULATION_SCRATCH_REGNUM
] = 1;
21809 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21812 aarch64_member_type_forces_blk (const_tree field_or_array
, machine_mode mode
)
21814 /* For records we're passed a FIELD_DECL, for arrays we're passed
21815 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21816 const_tree type
= TREE_TYPE (field_or_array
);
  /* Assign BLKmode to anything that contains more than 2 SVE predicates.
     For structures, the "multiple" case is indicated by MODE being
     VOIDmode.  */
21821 unsigned int num_zr
, num_pr
;
21822 if (aarch64_sve::builtin_type_p (type
, &num_zr
, &num_pr
) && num_pr
> 2)
21824 if (TREE_CODE (field_or_array
) == ARRAY_TYPE
)
21825 return !simple_cst_equal (TYPE_SIZE (field_or_array
),
21827 return mode
== VOIDmode
;
21830 return default_member_type_forces_blk (field_or_array
, mode
);
21833 /* Bitmasks that indicate whether earlier versions of GCC would have
21834 taken a different path through the ABI logic. This should result in
21835 a -Wpsabi warning if the earlier path led to a different ABI decision.
21837 WARN_PSABI_EMPTY_CXX17_BASE
21838 Indicates that the type includes an artificial empty C++17 base field
21839 that, prior to GCC 10.1, would prevent the type from being treated as
21840 a HFA or HVA. See PR94383 for details.
   WARN_PSABI_NO_UNIQUE_ADDRESS
      Indicates that the type includes an empty [[no_unique_address]] field
      that, prior to GCC 10.1, would prevent the type from being treated as
      a HFA or HVA.  */
21846 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE
= 1U << 0;
21847 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS
= 1U << 1;
21848 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD
= 1U << 2;
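/* For example (an illustrative sketch, not from the sources): given

     struct empty {};
     struct hfa { [[no_unique_address]] empty e; float x; float y; };

   GCC releases before 10.1 did not treat "hfa" as a homogeneous
   floating-point aggregate, so aapcs_vfp_sub_candidate records
   WARN_PSABI_NO_UNIQUE_ADDRESS for it and a -Wpsabi note is emitted.  */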
21850 /* Walk down the type tree of TYPE counting consecutive base elements.
21851 If *MODEP is VOIDmode, then set it to the first valid floating point
21852 type. If a non-floating point type is found, or if a floating point
21853 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21854 otherwise return the count in the sub-tree.
21856 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21857 function has changed its behavior relative to earlier versions of GCC.
21858 Normally the argument should be nonnull and point to a zero-initialized
21859 variable. The function then records whether the ABI decision might
21860 be affected by a known fix to the ABI logic, setting the associated
21861 WARN_PSABI_* bits if so.
21863 When the argument is instead a null pointer, the function tries to
21864 simulate the behavior of GCC before all such ABI fixes were made.
21865 This is useful to check whether the function returns something
21866 different after the ABI fixes. */
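/* For instance (illustrative only), for

     struct pt { double x; double y; };

   the walk sets *MODEP to DFmode and returns 2, so "pt" is a homogeneous
   floating-point aggregate passed in two D registers; adding an "int"
   member would make the walk return -1 instead.  */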
21868 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
,
21869 unsigned int *warn_psabi_flags
)
21872 HOST_WIDE_INT size
;
21874 if (aarch64_sve::builtin_type_p (type
))
21877 switch (TREE_CODE (type
))
21880 mode
= TYPE_MODE (type
);
21881 if (mode
!= DFmode
&& mode
!= SFmode
21882 && mode
!= TFmode
&& mode
!= HFmode
21883 && mode
!= SDmode
&& mode
!= DDmode
&& mode
!= TDmode
)
21886 if (*modep
== VOIDmode
)
21889 if (*modep
== mode
)
21895 mode
= TYPE_MODE (TREE_TYPE (type
));
21896 if (mode
!= DFmode
&& mode
!= SFmode
21897 && mode
!= TFmode
&& mode
!= HFmode
)
21900 if (*modep
== VOIDmode
)
21903 if (*modep
== mode
)
21909 /* Use V2SImode and V4SImode as representatives of all 64-bit
21910 and 128-bit vector types. */
21911 size
= int_size_in_bytes (type
);
21924 if (*modep
== VOIDmode
)
21927 /* Vector modes are considered to be opaque: two vectors are
21928 equivalent for the purposes of being homogeneous aggregates
21929 if they are the same size. */
21930 if (*modep
== mode
)
21938 tree index
= TYPE_DOMAIN (type
);
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
21942 if (!COMPLETE_TYPE_P (type
)
21943 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21946 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
,
21950 || !TYPE_MAX_VALUE (index
)
21951 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
21952 || !TYPE_MIN_VALUE (index
)
21953 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
21957 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
21958 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
21960 /* There must be no padding. */
21961 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
21962 count
* GET_MODE_BITSIZE (*modep
)))
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
21976 if (!COMPLETE_TYPE_P (type
)
21977 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
21980 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
21982 if (TREE_CODE (field
) != FIELD_DECL
)
21985 if (DECL_FIELD_ABI_IGNORED (field
))
21987 /* See whether this is something that earlier versions of
21988 GCC failed to ignore. */
21990 if (lookup_attribute ("no_unique_address",
21991 DECL_ATTRIBUTES (field
)))
21992 flag
= WARN_PSABI_NO_UNIQUE_ADDRESS
;
21993 else if (cxx17_empty_base_field_p (field
))
21994 flag
= WARN_PSABI_EMPTY_CXX17_BASE
;
21996 /* No compatibility problem. */
21999 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
22000 if (warn_psabi_flags
)
22002 *warn_psabi_flags
|= flag
;
22006 /* A zero-width bitfield may affect layout in some
22007 circumstances, but adds no members. The determination
22008 of whether or not a type is an HFA is performed after
22009 layout is complete, so if the type still looks like an
22010 HFA afterwards, it is still classed as one. This is
22011 potentially an ABI break for the hard-float ABI. */
22012 else if (DECL_BIT_FIELD (field
)
22013 && integer_zerop (DECL_SIZE (field
)))
		/* Prior to GCC 12 these fields were stripped early,
		   hiding them from the back end entirely and
		   resulting in the correct behaviour for argument
		   passing.  Simulate that old behaviour without
		   generating a warning.  */
22020 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field
))
22022 if (warn_psabi_flags
)
22024 *warn_psabi_flags
|= WARN_PSABI_ZERO_WIDTH_BITFIELD
;
22029 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22033 count
+= sub_count
;
22036 /* There must be no padding. */
22037 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22038 count
* GET_MODE_BITSIZE (*modep
)))
22045 case QUAL_UNION_TYPE
:
22047 /* These aren't very interesting except in a degenerate case. */
	/* Can't handle incomplete types nor sizes that are not
	   fixed size.  */
22054 if (!COMPLETE_TYPE_P (type
)
22055 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
22058 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
22060 if (TREE_CODE (field
) != FIELD_DECL
)
22063 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
,
22067 count
= count
> sub_count
? count
: sub_count
;
22070 /* There must be no padding. */
22071 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type
)),
22072 count
* GET_MODE_BITSIZE (*modep
)))
22085 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
22086 type as described in AAPCS64 \S 4.1.2.
22088 See the comment above aarch64_composite_type_p for the notes on MODE. */
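/* For example (illustrative only), a 64-bit int32x2_t (V2SImode) and a
   128-bit int32x4_t (V4SImode) are short vectors in this sense, whereas
   an SVE svint32_t is not, even if the runtime vector length happens to
   be 128 bits.  */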
22091 aarch64_short_vector_p (const_tree type
,
22094 poly_int64 size
= -1;
22096 if (type
&& VECTOR_TYPE_P (type
))
22098 if (aarch64_sve::builtin_type_p (type
))
22100 size
= int_size_in_bytes (type
);
22102 else if (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
22103 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
22105 /* The containing "else if" is too loose: it means that we look at TYPE
22106 if the type is a vector type (good), but that we otherwise ignore TYPE
22107 and look only at the mode. This is wrong because the type describes
22108 the language-level information whereas the mode is purely an internal
22109 GCC concept. We can therefore reach here for types that are not
22110 vectors in the AAPCS64 sense.
22112 We can't "fix" that for the traditional Advanced SIMD vector modes
22113 without breaking backwards compatibility. However, there's no such
22114 baggage for the structure modes, which were introduced in GCC 12. */
22115 if (aarch64_advsimd_struct_mode_p (mode
))
22118 /* For similar reasons, rely only on the type, not the mode, when
22119 processing SVE types. */
22120 if (type
&& aarch64_some_values_include_pst_objects_p (type
))
22121 /* Leave later code to report an error if SVE is disabled. */
22122 gcc_assert (!TARGET_SVE
|| aarch64_sve_mode_p (mode
));
22124 size
= GET_MODE_SIZE (mode
);
22126 if (known_eq (size
, 8) || known_eq (size
, 16))
22128 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
22129 they are being treated as scalable AAPCS64 types. */
22130 gcc_assert (!aarch64_sve_mode_p (mode
)
22131 && !aarch64_advsimd_struct_mode_p (mode
));
22137 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
22138 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
22139 array types. The C99 floating-point complex types are also considered
22140 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
22141 types, which are GCC extensions and out of the scope of AAPCS64, are
22142 treated as composite types here as well.
22144 Note that MODE itself is not sufficient in determining whether a type
22145 is such a composite type or not. This is because
22146 stor-layout.cc:compute_record_mode may have already changed the MODE
22147 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
22148 structure with only one field may have its MODE set to the mode of the
22149 field. Also an integer mode whose size matches the size of the
22150 RECORD_TYPE type may be used to substitute the original mode
22151 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
22152 solely relied on. */
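/* For example (illustrative only), "struct s { double d; };" may end up
   with DFmode after compute_record_mode, but it is still a composite type
   for the AAPCS64 and must therefore be classified from TYPE, not MODE.  */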
22155 aarch64_composite_type_p (const_tree type
,
22158 if (aarch64_short_vector_p (type
, mode
))
22161 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
22165 && TREE_CODE (type
) == BITINT_TYPE
22166 && int_size_in_bytes (type
) > 16)
22169 if (mode
== BLKmode
22170 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
22171 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
22177 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
22178 shall be passed or returned in simd/fp register(s) (providing these
22179 parameter passing registers are available).
22181 Upon successful return, *COUNT returns the number of needed registers,
22182 *BASE_MODE returns the mode of the individual register and when IS_HA
22183 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
22184 floating-point aggregate or a homogeneous short-vector aggregate.
22186 SILENT_P is true if the function should refrain from reporting any
22187 diagnostics. This should only be used if the caller is certain that
22188 any ABI decisions would eventually come through this function with
22189 SILENT_P set to false. */
22192 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
22194 machine_mode
*base_mode
,
22199 if (is_ha
!= NULL
) *is_ha
= false;
22201 machine_mode new_mode
= VOIDmode
;
22202 bool composite_p
= aarch64_composite_type_p (type
, mode
);
22205 && (GET_MODE_CLASS (mode
) == MODE_FLOAT
22206 || GET_MODE_CLASS (mode
) == MODE_DECIMAL_FLOAT
))
22207 || aarch64_short_vector_p (type
, mode
))
22212 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
22214 if (is_ha
!= NULL
) *is_ha
= true;
22216 new_mode
= GET_MODE_INNER (mode
);
22218 else if (type
&& composite_p
)
22220 unsigned int warn_psabi_flags
= 0;
22221 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
,
22222 &warn_psabi_flags
);
22223 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
22225 static unsigned last_reported_type_uid
;
22226 unsigned uid
= TYPE_UID (TYPE_MAIN_VARIANT (type
));
22230 && warn_psabi_flags
22231 && uid
!= last_reported_type_uid
22232 && ((alt
= aapcs_vfp_sub_candidate (type
, &new_mode
, NULL
))
22236 = CHANGES_ROOT_URL
"gcc-10/changes.html#empty_base";
22238 = CHANGES_ROOT_URL
"gcc-12/changes.html#zero_width_bitfields";
22239 gcc_assert (alt
== -1);
22240 last_reported_type_uid
= uid
;
22241 /* Use TYPE_MAIN_VARIANT to strip any redundant const
22243 if (warn_psabi_flags
& WARN_PSABI_NO_UNIQUE_ADDRESS
)
22244 inform (input_location
, "parameter passing for argument of "
22245 "type %qT with %<[[no_unique_address]]%> members "
22246 "changed %{in GCC 10.1%}",
22247 TYPE_MAIN_VARIANT (type
), url10
);
22248 else if (warn_psabi_flags
& WARN_PSABI_EMPTY_CXX17_BASE
)
22249 inform (input_location
, "parameter passing for argument of "
22250 "type %qT when C++17 is enabled changed to match "
22251 "C++14 %{in GCC 10.1%}",
22252 TYPE_MAIN_VARIANT (type
), url10
);
22253 else if (warn_psabi_flags
& WARN_PSABI_ZERO_WIDTH_BITFIELD
)
22254 inform (input_location
, "parameter passing for argument of "
22255 "type %qT changed %{in GCC 12.1%}",
22256 TYPE_MAIN_VARIANT (type
), url12
);
22259 if (is_ha
!= NULL
) *is_ha
= true;
22268 gcc_assert (!aarch64_sve_mode_p (new_mode
));
22269 *base_mode
= new_mode
;
22273 /* Implement TARGET_STRUCT_VALUE_RTX. */
22276 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
22277 int incoming ATTRIBUTE_UNUSED
)
22279 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
22282 /* Implements target hook vector_mode_supported_p. */
22284 aarch64_vector_mode_supported_p (machine_mode mode
)
22286 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
);
22287 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
22290 /* Implements target hook vector_mode_supported_any_target_p. */
22292 aarch64_vector_mode_supported_any_target_p (machine_mode mode
)
22294 unsigned int vec_flags
= aarch64_classify_vector_mode (mode
, true);
22295 return vec_flags
!= 0 && (vec_flags
& VEC_STRUCT
) == 0;
/* Return the full-width SVE vector mode for element mode MODE, if one
   exists.  */
22301 aarch64_full_sve_mode (scalar_mode mode
)
22320 return VNx16QImode
;
22322 return opt_machine_mode ();
/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
   if it exists.  */
22329 aarch64_vq_mode (scalar_mode mode
)
22350 return opt_machine_mode ();
22354 /* Return appropriate SIMD container
22355 for MODE within a vector of WIDTH bits. */
22356 static machine_mode
22357 aarch64_simd_container_mode (scalar_mode mode
, poly_int64 width
)
22360 && maybe_ne (width
, 128)
22361 && known_eq (width
, BITS_PER_SVE_VECTOR
))
22362 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22364 gcc_assert (known_eq (width
, 64) || known_eq (width
, 128));
22365 if (TARGET_BASE_SIMD
)
22367 if (known_eq (width
, 128))
22368 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22391 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22392 and return whether the SVE mode should be preferred over the
22393 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22395 aarch64_cmp_autovec_modes (machine_mode sve_m
, machine_mode asimd_m
)
22397 /* Take into account the aarch64-autovec-preference param if non-zero. */
22398 bool only_asimd_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_ASIMD_ONLY
;
22399 bool only_sve_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
;
22406 /* The preference in case of a tie in costs. */
22407 bool prefer_asimd
= aarch64_autovec_preference
== AARCH64_AUTOVEC_PREFER_ASIMD
;
22408 bool prefer_sve
= aarch64_autovec_preference
== AARCH64_AUTOVEC_PREFER_SVE
;
22410 poly_int64 nunits_sve
= GET_MODE_NUNITS (sve_m
);
22411 poly_int64 nunits_asimd
= GET_MODE_NUNITS (asimd_m
);
22412 /* If the CPU information does not have an SVE width registered use the
22413 generic poly_int comparison that prefers SVE. If a preference is
22414 explicitly requested avoid this path. */
22415 if (aarch64_tune_params
.sve_width
== SVE_SCALABLE
22418 return maybe_gt (nunits_sve
, nunits_asimd
);
22420 /* Otherwise estimate the runtime width of the modes involved. */
22421 HOST_WIDE_INT est_sve
= estimated_poly_value (nunits_sve
);
22422 HOST_WIDE_INT est_asimd
= estimated_poly_value (nunits_asimd
);
22424 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22425 is clearly wider. */
22427 return est_sve
>= est_asimd
;
22428 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22429 is clearly wider. */
22431 return est_sve
> est_asimd
;
22433 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22434 return est_sve
> est_asimd
;
22437 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22438 static machine_mode
22439 aarch64_preferred_simd_mode (scalar_mode mode
)
22441 /* Take into account explicit auto-vectorization ISA preferences through
22442 aarch64_cmp_autovec_modes. */
22443 if (TARGET_SVE
&& aarch64_cmp_autovec_modes (VNx16QImode
, V16QImode
))
22444 return aarch64_full_sve_mode (mode
).else_mode (word_mode
);
22446 return aarch64_vq_mode (mode
).else_mode (word_mode
);
22450 /* Return a list of possible vector sizes for the vectorizer
22451 to iterate over. */
22452 static unsigned int
22453 aarch64_autovectorize_vector_modes (vector_modes
*modes
, bool)
22455 static const machine_mode sve_modes
[] = {
22456 /* Try using full vectors for all element types. */
22459 /* Try using 16-bit containers for 8-bit elements and full vectors
22460 for wider elements. */
22463 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22464 full vectors for wider elements. */
22467 /* Try using 64-bit containers for all element types. */
22471 static const machine_mode advsimd_modes
[] = {
22472 /* Try using 128-bit vectors for all element types. */
22475 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22476 for wider elements. */
22479 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22480 for wider elements.
22482 TODO: We could support a limited form of V4QImode too, so that
22483 we use 32-bit vectors for 8-bit elements. */
22486 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22487 for 64-bit elements.
22489 TODO: We could similarly support limited forms of V2QImode and V2HImode
22494 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22497 - If we can't use N-byte Advanced SIMD vectors then the placement
22498 doesn't matter; we'll just continue as though the Advanced SIMD
22499 entry didn't exist.
22501 - If an SVE main loop with N bytes ends up being cheaper than an
22502 Advanced SIMD main loop with N bytes then by default we'll replace
22503 the Advanced SIMD version with the SVE one.
22505 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22506 than an SVE main loop with N bytes then by default we'll try to
22507 use the SVE loop to vectorize the epilogue instead. */
22509 bool only_asimd_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_ASIMD_ONLY
;
22510 bool only_sve_p
= aarch64_autovec_preference
== AARCH64_AUTOVEC_SVE_ONLY
;
22512 unsigned int sve_i
= (TARGET_SVE
&& !only_asimd_p
) ? 0 : ARRAY_SIZE (sve_modes
);
22513 unsigned int advsimd_i
= 0;
22515 while (!only_sve_p
&& advsimd_i
< ARRAY_SIZE (advsimd_modes
))
22517 if (sve_i
< ARRAY_SIZE (sve_modes
)
22518 && aarch64_cmp_autovec_modes (sve_modes
[sve_i
],
22519 advsimd_modes
[advsimd_i
]))
22520 modes
->safe_push (sve_modes
[sve_i
++]);
22522 modes
->safe_push (advsimd_modes
[advsimd_i
++]);
22524 while (sve_i
< ARRAY_SIZE (sve_modes
))
22525 modes
->safe_push (sve_modes
[sve_i
++]);
22527 unsigned int flags
= 0;
22528 if (aarch64_vect_compare_costs
)
22529 flags
|= VECT_COMPARE_COSTS
;
22533 /* Implement TARGET_MANGLE_TYPE. */
22535 static const char *
22536 aarch64_mangle_type (const_tree type
)
22538 /* The AArch64 ABI documents say that "__va_list" has to be
22539 mangled as if it is in the "std" namespace. */
22540 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
22541 return "St9__va_list";
22543 /* Half-precision floating point types. */
22544 if (SCALAR_FLOAT_TYPE_P (type
) && TYPE_PRECISION (type
) == 16)
22546 if (TYPE_MAIN_VARIANT (type
) == float16_type_node
)
22548 if (TYPE_MODE (type
) == BFmode
)
22554 /* Modal 8 bit floating point types. */
22555 if (TYPE_MAIN_VARIANT (type
) == aarch64_mfp8_type_node
)
  /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
     builtin types.  */
22560 if (TYPE_NAME (type
) != NULL
)
22563 if ((res
= aarch64_general_mangle_builtin_type (type
))
22564 || (res
= aarch64_sve::mangle_builtin_type (type
)))
22568 /* Use the default mangling. */
22572 /* Implement TARGET_INVALID_CONVERSION. */
22574 static const char *
22575 aarch64_invalid_conversion (const_tree fromtype
, const_tree totype
)
22577 /* Do not allow conversions to/from FP8. But do allow conversions between
22578 volatile and const variants of __mfp8. */
22579 bool fromtype_is_fp8
22580 = (TYPE_MAIN_VARIANT (fromtype
) == aarch64_mfp8_type_node
);
22581 bool totype_is_fp8
= (TYPE_MAIN_VARIANT (totype
) == aarch64_mfp8_type_node
);
22583 if (fromtype_is_fp8
&& totype_is_fp8
)
22586 if (fromtype_is_fp8
)
22587 return N_ ("invalid conversion from type %<mfloat8_t%>");
22589 return N_ ("invalid conversion to type %<mfloat8_t%>");
22591 /* Conversion allowed. */
22595 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22598 aarch64_verify_type_context (location_t loc
, type_context_kind context
,
22599 const_tree type
, bool silent_p
)
22601 return aarch64_sve::verify_type_context (loc
, context
, type
, silent_p
);
/* Find the first rtx_insn before insn that will generate an assembly
   instruction.  */
22608 aarch64_prev_real_insn (rtx_insn
*insn
)
22615 insn
= prev_real_insn (insn
);
22617 while (insn
&& recog_memoized (insn
) < 0);
22623 is_madd_op (enum attr_type t1
)
22626 /* A number of these may be AArch32 only. */
22627 enum attr_type mlatypes
[] = {
22628 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
22629 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
22630 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
22633 for (i
= 0; i
< ARRAY_SIZE (mlatypes
); i
++)
22635 if (t1
== mlatypes
[i
])
22642 /* Check if there is a register dependency between a load and the insn
22643 for which we hold recog_data. */
22646 dep_between_memop_and_curr (rtx memop
)
22651 gcc_assert (GET_CODE (memop
) == SET
);
22653 if (!REG_P (SET_DEST (memop
)))
22656 load_reg
= SET_DEST (memop
);
22657 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
22659 rtx operand
= recog_data
.operand
[opno
];
22660 if (REG_P (operand
)
22661 && reg_overlap_mentioned_p (load_reg
, operand
))
22669 /* When working around the Cortex-A53 erratum 835769,
22670 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22671 instruction and has a preceding memory instruction such that a NOP
22672 should be inserted between them. */
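/* Illustrative only: for a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x0

   the workaround makes the final output read

     ldr  x1, [x2]
     nop		// between mem op and mult-accumulate
     madd x0, x3, x4, x0

   so the load and the 64-bit multiply-accumulate never appear
   back-to-back in the issue stream.  */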
22675 aarch64_madd_needs_nop (rtx_insn
* insn
)
22677 enum attr_type attr_type
;
22681 if (!TARGET_FIX_ERR_A53_835769
)
22684 if (!INSN_P (insn
) || recog_memoized (insn
) < 0)
22687 attr_type
= get_attr_type (insn
);
22688 if (!is_madd_op (attr_type
))
22691 prev
= aarch64_prev_real_insn (insn
);
22692 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22693 Restore recog state to INSN to avoid state corruption. */
22694 extract_constrain_insn_cached (insn
);
22696 if (!prev
|| !contains_mem_rtx_p (PATTERN (prev
)))
22699 body
= single_set (prev
);
22701 /* If the previous insn is a memory op and there is no dependency between
22702 it and the DImode madd, emit a NOP between them. If body is NULL then we
22703 have a complex memory operation, probably a load/store pair.
22704 Be conservative for now and emit a NOP. */
22705 if (GET_MODE (recog_data
.operand
[0]) == DImode
22706 && (!body
|| !dep_between_memop_and_curr (body
)))
22714 /* Implement FINAL_PRESCAN_INSN. */
22717 aarch64_final_prescan_insn (rtx_insn
*insn
)
22719 if (aarch64_madd_needs_nop (insn
))
22720 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
   instruction.  */
22728 aarch64_sve_index_immediate_p (rtx base_or_step
)
22730 return (CONST_INT_P (base_or_step
)
22731 && IN_RANGE (INTVAL (base_or_step
), -16, 15));
22734 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22735 when applied to mode MODE. Negate X first if NEGATE_P is true. */
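/* Illustrative only: the immediate is an unsigned 8-bit value, optionally
   shifted left by 8.  So 0xff and 0x1100 (0x11, LSL #8) are accepted for
   .h or wider elements, while 0x101 is rejected because it needs nonzero
   bits in both bytes.  */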
22738 aarch64_sve_arith_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22740 rtx elt
= unwrap_const_vec_duplicate (x
);
22741 if (!CONST_INT_P (elt
))
22744 HOST_WIDE_INT val
= INTVAL (elt
);
22747 val
&= GET_MODE_MASK (GET_MODE_INNER (mode
));
22750 return IN_RANGE (val
, 0, 0xff);
22751 return IN_RANGE (val
, 0, 0xff00);
/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
   instructions when applied to mode MODE.  Negate X first if NEGATE_P
   is true.  */
22759 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode
, rtx x
, bool negate_p
)
22761 if (!aarch64_sve_arith_immediate_p (mode
, x
, negate_p
))
22764 /* After the optional negation, the immediate must be nonnegative.
22765 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22766 instead of SQADD Zn.B, Zn.B, #129. */
22767 rtx elt
= unwrap_const_vec_duplicate (x
);
22768 return negate_p
== (INTVAL (elt
) < 0);
22771 /* Return true if X is a valid immediate operand for an SVE logical
22772 instruction such as AND. */
22775 aarch64_sve_bitmask_immediate_p (rtx x
)
22779 return (const_vec_duplicate_p (x
, &elt
)
22780 && CONST_INT_P (elt
)
22781 && aarch64_bitmask_imm (INTVAL (elt
),
22782 GET_MODE_INNER (GET_MODE (x
))));
/* Return true if X is a valid immediate for the SVE DUP and CPY
   instructions.  */
22789 aarch64_sve_dup_immediate_p (rtx x
)
22791 x
= aarch64_bit_representation (unwrap_const_vec_duplicate (x
));
22792 if (!CONST_INT_P (x
))
22795 HOST_WIDE_INT val
= INTVAL (x
);
22797 return IN_RANGE (val
, -0x80, 0x7f);
22798 return IN_RANGE (val
, -0x8000, 0x7f00);
22801 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22802 SIGNED_P says whether the operand is signed rather than unsigned. */
22805 aarch64_sve_cmp_immediate_p (rtx x
, bool signed_p
)
22807 x
= unwrap_const_vec_duplicate (x
);
22808 return (CONST_INT_P (x
)
22810 ? IN_RANGE (INTVAL (x
), -16, 15)
22811 : IN_RANGE (INTVAL (x
), 0, 127)));
22814 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22815 instruction. Negate X first if NEGATE_P is true. */
22818 aarch64_sve_float_arith_immediate_p (rtx x
, bool negate_p
)
22823 if (!const_vec_duplicate_p (x
, &elt
)
22824 || !CONST_DOUBLE_P (elt
))
22827 r
= *CONST_DOUBLE_REAL_VALUE (elt
);
22830 r
= real_value_negate (&r
);
22832 if (real_equal (&r
, &dconst1
))
22834 if (real_equal (&r
, &dconsthalf
))
/* Return true if X is a valid immediate operand for an SVE FMUL
   instruction.  */
22843 aarch64_sve_float_mul_immediate_p (rtx x
)
22847 return (const_vec_duplicate_p (x
, &elt
)
22848 && CONST_DOUBLE_P (elt
)
22849 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconsthalf
)
22850 || real_equal (CONST_DOUBLE_REAL_VALUE (elt
), &dconst2
)));
22853 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22854 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22855 is nonnull, use it to describe valid immediates. */
22857 aarch64_advsimd_valid_immediate_hs (unsigned int val32
,
22858 simd_immediate_info
*info
,
22859 enum simd_immediate_check which
,
22860 simd_immediate_info::insn_type insn
)
22862 /* Try a 4-byte immediate with LSL. */
22863 for (unsigned int shift
= 0; shift
< 32; shift
+= 8)
22864 if ((val32
& (0xff << shift
)) == val32
)
22867 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22868 simd_immediate_info::LSL
, shift
);
22872 /* Try a 2-byte immediate with LSL. */
22873 unsigned int imm16
= val32
& 0xffff;
22874 if (imm16
== (val32
>> 16))
22875 for (unsigned int shift
= 0; shift
< 16; shift
+= 8)
22876 if ((imm16
& (0xff << shift
)) == imm16
)
22879 *info
= simd_immediate_info (HImode
, imm16
>> shift
, insn
,
22880 simd_immediate_info::LSL
, shift
);
  /* Try a 4-byte immediate with MSL, except for cases that MVN
     can handle.  */
22886 if (which
== AARCH64_CHECK_MOV
)
22887 for (unsigned int shift
= 8; shift
< 24; shift
+= 8)
22889 unsigned int low
= (1 << shift
) - 1;
22890 if (((val32
& (0xff << shift
)) | low
) == val32
)
22893 *info
= simd_immediate_info (SImode
, val32
>> shift
, insn
,
22894 simd_immediate_info::MSL
, shift
);
22902 /* Return true if replicating VAL64 is a valid immediate for the
22903 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22904 use it to describe valid immediates. */
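/* Worked examples (illustrative only, not from the sources):

     0x0001000100010001  ->  2-byte pattern, e.g. "movi vD.8h, #1" for a
			     128-bit vector.
     0x4242424242424242  ->  replicated byte, "movi vD.16b, #0x42".
     0xff0000ff00ffff00  ->  every byte is 0x00 or 0xff, so the 64-bit
			     bit-to-bytemask form of MOVI is used.

   A value such as 0x0102030405060708 matches none of the cases and is
   not a valid Advanced SIMD immediate.  */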
22906 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64
,
22907 simd_immediate_info
*info
,
22908 enum simd_immediate_check which
)
22910 unsigned int val32
= val64
& 0xffffffff;
22911 unsigned int val16
= val64
& 0xffff;
22912 unsigned int val8
= val64
& 0xff;
22914 if (val32
== (val64
>> 32))
22916 if ((which
& AARCH64_CHECK_ORR
) != 0
22917 && aarch64_advsimd_valid_immediate_hs (val32
, info
, which
,
22918 simd_immediate_info::MOV
))
22921 if ((which
& AARCH64_CHECK_BIC
) != 0
22922 && aarch64_advsimd_valid_immediate_hs (~val32
, info
, which
,
22923 simd_immediate_info::MVN
))
22926 /* Try using a replicated byte. */
22927 if (which
== AARCH64_CHECK_MOV
22928 && val16
== (val32
>> 16)
22929 && val8
== (val16
>> 8))
22932 *info
= simd_immediate_info (QImode
, val8
);
22937 /* Try using a bit-to-bytemask. */
22938 if (which
== AARCH64_CHECK_MOV
)
22941 for (i
= 0; i
< 64; i
+= 8)
22943 unsigned char byte
= (val64
>> i
) & 0xff;
22944 if (byte
!= 0 && byte
!= 0xff)
22950 *info
= simd_immediate_info (DImode
, val64
);
22957 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22958 instruction. If INFO is nonnull, use it to describe valid immediates. */
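/* Worked example (illustrative only): VAL64 = 0x0101010101010101 narrows
   to the byte 0x01, which lies in [-0x80, 0x7f], so it is matched by the
   plain DUP case ("mov zD.b, #1"); VAL64 = 0xff00ff00ff00ff00 narrows to
   HImode -256 and is matched by the DUP-with-LSL-#8 case instead.  */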
22961 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64
,
22962 simd_immediate_info
*info
)
22964 scalar_int_mode mode
= DImode
;
22965 unsigned int val32
= val64
& 0xffffffff;
22966 if (val32
== (val64
>> 32))
22969 unsigned int val16
= val32
& 0xffff;
22970 if (val16
== (val32
>> 16))
22973 unsigned int val8
= val16
& 0xff;
22974 if (val8
== (val16
>> 8))
22978 HOST_WIDE_INT val
= trunc_int_for_mode (val64
, mode
);
22979 if (IN_RANGE (val
, -0x80, 0x7f))
22981 /* DUP with no shift. */
22983 *info
= simd_immediate_info (mode
, val
);
22986 if ((val
& 0xff) == 0 && IN_RANGE (val
, -0x8000, 0x7f00))
22988 /* DUP with LSL #8. */
22990 *info
= simd_immediate_info (mode
, val
);
22993 if (aarch64_bitmask_imm (val64
, mode
))
22997 *info
= simd_immediate_info (mode
, val
);
23003 /* Return true if X is an UNSPEC_PTRUE constant of the form:
23005 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
23007 where PATTERN is the svpattern as a CONST_INT and where ZERO
23008 is a zero constant of the required PTRUE mode (which can have
23009 fewer elements than X's mode, if zero bits are significant).
23011 If so, and if INFO is nonnull, describe the immediate in INFO. */
23013 aarch64_sve_ptrue_svpattern_p (rtx x
, struct simd_immediate_info
*info
)
23015 if (GET_CODE (x
) != CONST
)
23019 if (GET_CODE (x
) != UNSPEC
|| XINT (x
, 1) != UNSPEC_PTRUE
)
23024 aarch64_svpattern pattern
23025 = (aarch64_svpattern
) INTVAL (XVECEXP (x
, 0, 0));
23026 machine_mode pred_mode
= GET_MODE (XVECEXP (x
, 0, 1));
23027 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (pred_mode
);
23028 *info
= simd_immediate_info (int_mode
, pattern
);
23033 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
23034 it to describe valid immediates. */
23037 aarch64_sve_pred_valid_immediate (rtx x
, simd_immediate_info
*info
)
23039 if (aarch64_sve_ptrue_svpattern_p (x
, info
))
23042 if (x
== CONST0_RTX (GET_MODE (x
)))
23045 *info
= simd_immediate_info (DImode
, 0);
23049 /* Analyze the value as a VNx16BImode. This should be relatively
23050 efficient, since rtx_vector_builder has enough built-in capacity
23051 to store all VLA predicate constants without needing the heap. */
23052 rtx_vector_builder builder
;
23053 if (!aarch64_get_sve_pred_bits (builder
, x
))
23056 unsigned int elt_size
= aarch64_widest_sve_pred_elt_size (builder
);
23057 if (int vl
= aarch64_partial_ptrue_length (builder
, elt_size
))
23059 machine_mode mode
= aarch64_sve_pred_mode (elt_size
).require ();
23060 aarch64_svpattern pattern
= aarch64_svpattern_for_vl (mode
, vl
);
23061 if (pattern
!= AARCH64_NUM_SVPATTERNS
)
23065 scalar_int_mode int_mode
= aarch64_sve_element_int_mode (mode
);
23066 *info
= simd_immediate_info (int_mode
, pattern
);
/* Return true if OP is a valid SIMD immediate for the operation
   described by WHICH.  If INFO is nonnull, use it to describe valid
   immediates.  */
bool
aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
                              enum simd_immediate_check which)
{
  machine_mode mode = GET_MODE (op);
  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
  if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
    return false;

  if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
    return false;

  if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
    return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);

  if (vec_flags & VEC_SVE_PRED)
    return aarch64_sve_pred_valid_immediate (op, info);

  scalar_mode elt_mode = GET_MODE_INNER (mode);
  rtx base, step;
  unsigned int n_elts;
  if (CONST_VECTOR_P (op)
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);
  else if (which == AARCH64_CHECK_MOV
           && TARGET_SVE
           && const_vec_series_p (op, &base, &step))
    {
      gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
      if (!aarch64_sve_index_immediate_p (base)
          || !aarch64_sve_index_immediate_p (step))
        return false;

      if (info)
        {
          /* Get the corresponding container mode.  E.g. an INDEX on V2SI
             should yield two integer values per 128-bit block, meaning
             that we need to treat it in the same way as V2DI and then
             ignore the upper 32 bits of each element.  */
          elt_mode = aarch64_sve_container_int_mode (mode);
          *info = simd_immediate_info (elt_mode, base, step);
        }
      return true;
    }
  else if (CONST_VECTOR_P (op)
           && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
    /* N_ELTS set above.  */;
  else
    return false;

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
          || aarch64_float_const_representable_p (elt))
        {
          if (info)
            *info = simd_immediate_info (elt_float_mode, elt);
          return true;
        }
    }

  /* If all elements in an SVE vector have the same value, we have a free
     choice between using the element mode and using the container mode.
     Using the element mode means that unused parts of the vector are
     duplicates of the used elements, while using the container mode means
     that the unused parts are an extension of the used elements.  Using the
     element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
     for its container mode VNx4SI while 0x00000101 isn't.

     If not all elements in an SVE vector have the same value, we need the
     transition from one element to the next to occur at container boundaries.
     E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
     in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
  scalar_int_mode elt_int_mode;
  if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
    elt_int_mode = aarch64_sve_container_int_mode (mode);
  else
    elt_int_mode = int_mode_for_mode (elt_mode).require ();

  unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);

  /* Expand the vector constant out into a byte vector, with the least
     significant byte of the register first.  */
  auto_vec<unsigned char, 16> bytes;
  bytes.reserve (n_elts * elt_size);
  for (unsigned int i = 0; i < n_elts; i++)
    {
      /* The vector is provided in gcc endian-neutral fashion.
         For aarch64_be Advanced SIMD, it must be laid out in the vector
         register in reverse order.  */
      bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
      rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);

      if (elt_mode != elt_int_mode)
        elt = gen_lowpart (elt_int_mode, elt);

      if (!CONST_INT_P (elt))
        return false;

      unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
      for (unsigned int byte = 0; byte < elt_size; byte++)
        {
          bytes.quick_push (elt_val & 0xff);
          elt_val >>= BITS_PER_UNIT;
        }
    }

  /* The immediate must repeat every eight bytes.  */
  unsigned int nbytes = bytes.length ();
  for (unsigned i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return false;

  /* Get the repeating 8-byte value as an integer.  No endian correction
     is needed here because bytes is already in lsb-first order.  */
  unsigned HOST_WIDE_INT val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
              << (i * BITS_PER_UNIT));

  if (vec_flags & VEC_SVE_DATA)
    return aarch64_sve_valid_immediate (val64, info);
  else
    return aarch64_advsimd_valid_immediate (val64, info, which);
}
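/* Worked example (illustrative): a V8QI duplicate of the byte 0x2a has
   n_elts == 1 and elt_size == 1, so "bytes" holds the single value 0x2a,
   the repeat-every-eight-bytes check passes trivially, and val64 becomes
   0x2a2a2a2a2a2a2a2a, which the Advanced SIMD check accepts as a MOVI
   byte immediate.  */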
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
   has a step in the range of INDEX.  Return the index expression if so,
   otherwise return null.  */
rtx
aarch64_check_zero_based_sve_index_immediate (rtx x)
{
  rtx base, step;
  if (const_vec_series_p (x, &base, &step)
      && base == const0_rtx
      && aarch64_sve_index_immediate_p (step))
    return step;
  return NULL_RTX;
}
/* Check if immediate shift constants are within range.  */
bool
aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
{
  x = unwrap_const_vec_duplicate (x);
  if (!CONST_INT_P (x))
    return false;

  int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
  if (left)
    return IN_RANGE (INTVAL (x), 0, bit_width - 1);
  else
    return IN_RANGE (INTVAL (x), 1, bit_width);
}
/* Return the bitmask CONST_INT to select the bits required by a zero extract
   operation of width WIDTH at bit position POS.  */
rtx
aarch64_mask_from_zextract_ops (rtx width, rtx pos)
{
  gcc_assert (CONST_INT_P (width));
  gcc_assert (CONST_INT_P (pos));

  unsigned HOST_WIDE_INT mask
    = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
  return GEN_INT (mask << UINTVAL (pos));
}
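/* Worked example (illustrative): WIDTH = 8 and POS = 16 give
   mask = (1 << 8) - 1 = 0xff, so the returned CONST_INT is
   0xff << 16 = 0xff0000, i.e. it selects bits [23:16].  */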
bool
aarch64_mov_operand_p (rtx x, machine_mode mode)
{
  if (GET_CODE (x) == HIGH
      && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
    return true;

  if (CONST_INT_P (x))
    return true;

  if (VECTOR_MODE_P (GET_MODE (x)))
    {
      /* Require predicate constants to be VNx16BI before RA, so that we
         force everything to have a canonical form.  */
      if (!lra_in_progress
          && !reload_completed
          && aarch64_sve_pred_mode_p (GET_MODE (x))
          && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
          && GET_MODE (x) != VNx16BImode)
        return false;

      return aarch64_simd_valid_immediate (x, NULL);
    }

  /* Remove UNSPEC_SALT_ADDR before checking symbol reference.  */
  x = strip_salt (x);

  /* GOT accesses are valid moves.  */
  if (SYMBOL_REF_P (x)
      && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
    return true;

  if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
    return true;

  if (TARGET_SVE
      && (aarch64_sve_cnt_immediate_p (x)
          || aarch64_sve_rdvl_immediate_p (x)))
    return true;

  if (aarch64_rdsvl_immediate_p (x))
    return true;

  return aarch64_classify_symbolic_expression (x)
    == SYMBOL_TINY_ABSOLUTE;
}
/* Return a function-invariant register that contains VALUE.  *CACHED_INSN
   caches instructions that set up such registers, so that they can be
   reused by future calls.  */
static rtx
aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
{
  rtx_insn *insn = *cached_insn;
  if (insn && INSN_P (insn) && !insn->deleted ())
    {
      rtx pat = PATTERN (insn);
      if (GET_CODE (pat) == SET)
        {
          rtx dest = SET_DEST (pat);
          if (REG_P (dest)
              && !HARD_REGISTER_P (dest)
              && rtx_equal_p (SET_SRC (pat), value))
            return dest;
        }
    }
  rtx reg = gen_reg_rtx (GET_MODE (value));
  *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
                                   function_beg_insn);
  return reg;
}
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
   the constant creation.  */
rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
  rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
                                       CONST0_RTX (V4SImode));
  return lowpart_subreg (mode, reg, GET_MODE (reg));
}
/* INSN is some form of extension or shift that can be split into a
   permutation involving a shared zero.  Return true if we should
   perform such a split.

   ??? For now, make sure that the split instruction executes more
   frequently than the zero that feeds it.  In future it would be good
   to split without that restriction and instead recombine shared zeros
   if they turn out not to be worthwhile.  This would allow splits in
   single-block functions and would also cope more naturally with
   rematerialization.  The downside of not doing this is that we lose the
   optimizations for vector epilogues as well.  */
bool
aarch64_split_simd_shift_p (rtx_insn *insn)
{
  return (can_create_pseudo_p ()
          && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
          && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
              < BLOCK_FOR_INSN (insn)->count));
}
/* Return a const_int vector of VAL.  */
rtx
aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
{
  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
  return gen_const_vec_duplicate (mode, c);
}
/* Check OP is a legal scalar immediate for the MOVI instruction.  */
bool
aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
{
  machine_mode vmode;

  vmode = aarch64_simd_container_mode (mode, 64);
  rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
  return aarch64_simd_valid_immediate (op_v, NULL);
}
/* Construct and return a PARALLEL RTX vector with elements numbering the
   lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
   the vector - from the perspective of the architecture.  This does not
   line up with GCC's perspective on lane numbers, so we end up with
   different masks depending on our target endian-ness.  The diagram
   below may help.  We must draw the distinction when building masks
   which select one half of the vector.  An instruction selecting
   architectural low-lanes for a big-endian target, must be described using
   a mask selecting GCC high-lanes.

                    Big-Endian              Little-Endian

   GCC              0   1   2   3           3   2   1   0
                  | x | x | x | x |       | x | x | x | x |
   Architecture     3   2   1   0           3   2   1   0

   Low Mask:          { 2, 3 }                { 0, 1 }
   High Mask:         { 0, 1 }                { 2, 3 }

   MODE Is the mode of the vector and NUNITS is the number of units in it.  */
rtx
aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
{
  rtvec v = rtvec_alloc (nunits / 2);
  int high_base = nunits / 2;
  int low_base = 0;
  int base;
  rtx t1;
  int i;

  if (BYTES_BIG_ENDIAN)
    base = high ? low_base : high_base;
  else
    base = high ? high_base : low_base;

  for (i = 0; i < nunits / 2; i++)
    RTVEC_ELT (v, i) = GEN_INT (base + i);

  t1 = gen_rtx_PARALLEL (mode, v);
  return t1;
}
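/* Worked example (illustrative): for V4SI with NUNITS == 4 and HIGH == true,
   this returns the PARALLEL { 2, 3 } on little-endian targets but { 0, 1 }
   on big-endian targets, matching the High Mask row in the table above.  */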
/* Check OP for validity as a PARALLEL RTX vector with elements
   numbering the lanes of either the high (HIGH == TRUE) or low lanes,
   from the perspective of the architecture.  See the diagram above
   aarch64_simd_vect_par_cnst_half for more details.  */
bool
aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
                                       bool high)
{
  int nelts;
  if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
    return false;

  rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
  HOST_WIDE_INT count_op = XVECLEN (op, 0);
  HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
  int i;

  if (count_op != count_ideal)
    return false;

  for (i = 0; i < count_ideal; i++)
    {
      rtx elt_op = XVECEXP (op, 0, i);
      rtx elt_ideal = XVECEXP (ideal, 0, i);

      if (!CONST_INT_P (elt_op)
          || INTVAL (elt_ideal) != INTVAL (elt_op))
        return false;
    }
  return true;
}
/* Return a PARALLEL containing NELTS elements, with element I equal
   to BASE + I * STEP.  */
rtx
aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
{
  rtvec vec = rtvec_alloc (nelts);
  for (unsigned int i = 0; i < nelts; ++i)
    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
  return gen_rtx_PARALLEL (VOIDmode, vec);
}
/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP.  */
bool
aarch64_stepped_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
  for (int i = 1; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
        || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
/* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
   sequence of strided registers, with the stride being equal to STRIDE.
   The operands are already known to be FPRs.  */
bool
aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
                             unsigned int stride)
{
  for (unsigned int i = 1; i < num_operands; ++i)
    if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
      return false;
  return true;
}
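/* Worked example (illustrative): with STRIDE == 4, operands whose register
   numbers are { V0, V4, V8, V12 } pass the check, whereas
   { V0, V4, V9, V12 } fail at i == 2 because V9 != V0 + 2 * 4.  */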
/* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
   HIGH (exclusive).  */
void
aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
                          const_tree exp)
{
  HOST_WIDE_INT lane;
  gcc_assert (CONST_INT_P (operand));
  lane = INTVAL (operand);

  if (lane < low || lane >= high)
    {
      if (exp)
        error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
                  lane, low, high - 1);
      else
        error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
    }
}
/* Perform endian correction on lane number N, which indexes a vector
   of mode MODE, and return the result as an SImode rtx.  */
rtx
aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
{
  return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
}
/* Return TRUE if OP is a valid vector addressing mode.  */
bool
aarch64_simd_mem_operand_p (rtx op)
{
  return (MEM_P (op)
          && (GET_CODE (XEXP (op, 0)) == POST_INC || REG_P (XEXP (op, 0)))
          && memory_operand (op, VOIDmode));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
bool
aarch64_sve_ld1r_operand_p (rtx op)
{
  struct aarch64_address_info addr;
  scalar_mode mode;

  return (MEM_P (op)
          && is_a <scalar_mode> (GET_MODE (op), &mode)
          && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
          && addr.type == ADDRESS_REG_IMM
          && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction,
   where the size of the read data is specified by `mode` and the size of
   the vector elements is specified by `elem_mode`.  */
static bool
aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
                                   scalar_mode elem_mode)
{
  struct aarch64_address_info addr;
  if (!MEM_P (op)
      || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return offset_4bit_signed_scaled_p (mode, addr.const_offset);

  if (addr.type == ADDRESS_REG_REG)
    return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);

  return false;
}
/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
bool
aarch64_sve_ld1rq_operand_p (rtx op)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
                                            GET_MODE_INNER (GET_MODE (op)));
}

/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
   accessing a vector where the element size is specified by `elem_mode`.  */
bool
aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
{
  return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
}

/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
bool
aarch64_sve_ldff1_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return known_eq (addr.const_offset, 0);

  return addr.type == ADDRESS_REG_REG;
}

/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
bool
aarch64_sve_ldnf1_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
          && aarch64_classify_address (&addr, XEXP (op, 0),
                                       GET_MODE (op), false)
          && addr.type == ADDRESS_REG_IMM);
}

/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
   The conditions for STR are the same.  */
bool
aarch64_sve_ldr_operand_p (rtx op)
{
  struct aarch64_address_info addr;

  return (MEM_P (op)
          && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
                                       false, ADDR_QUERY_ANY)
          && addr.type == ADDRESS_REG_IMM);
}

/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
   addressing memory of mode MODE.  */
bool
aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
{
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
    return false;

  if (addr.type == ADDRESS_REG_IMM)
    return offset_6bit_signed_scaled_p (mode, addr.const_offset);

  return addr.type == ADDRESS_REG_REG;
}
/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
   We need to be able to access the individual pieces, so the range
   is different from LD[234] and ST[234].  */
bool
aarch64_sve_struct_memory_operand_p (rtx op)
{
  if (!MEM_P (op))
    return false;

  machine_mode mode = GET_MODE (op);
  struct aarch64_address_info addr;
  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
                                 ADDR_QUERY_ANY)
      || addr.type != ADDRESS_REG_IMM)
    return false;

  poly_int64 first = addr.const_offset;
  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
          && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
}
/* Return true if OFFSET is a constant integer and if VNUM is
   OFFSET * the number of bytes in an SVE vector.  This is the requirement
   that exists in SME LDR and STR instructions, where the VL offset must
   equal the ZA slice offset.  */
bool
aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
{
  if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
    return false;

  if (TARGET_STREAMING)
    {
      poly_int64 const_vnum;
      return (poly_int_rtx_p (vnum, &const_vnum)
              && known_eq (const_vnum,
                           INTVAL (offset) * BYTES_PER_SVE_VECTOR));
    }
  else
    {
      HOST_WIDE_INT factor;
      return (aarch64_sme_vq_unspec_p (vnum, &factor)
              && factor == INTVAL (offset) * 16);
    }
}
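/* Worked example (illustrative): in streaming mode, OFFSET = 3 requires
   VNUM to be the poly_int 3 * BYTES_PER_SVE_VECTOR; in the non-streaming
   case VNUM must be an SME VQ unspec whose factor equals 3 * 16 = 48.  */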
/* Emit a register copy from operand to operand, taking care not to
   early-clobber source registers in the process.

   COUNT is the number of components into which the copy needs to be
   decomposed.  */
void
aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
                                unsigned int count)
{
  unsigned int i;
  int rdest = REGNO (operands[0]);
  int rsrc = REGNO (operands[1]);

  if (!reg_overlap_mentioned_p (operands[0], operands[1])
      || rdest < rsrc)
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + i),
                      gen_rtx_REG (mode, rsrc + i));
  else
    for (i = 0; i < count; i++)
      emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
                      gen_rtx_REG (mode, rsrc + count - i - 1));
}
/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
   one of VSTRUCT modes: OI, CI, or XI.  */
int
aarch64_simd_attr_length_rglist (machine_mode mode)
{
  /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
  return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
}
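/* Worked example (illustrative): XImode is four 128-bit vectors, so
   GET_MODE_SIZE / UNITS_PER_VREG == 4 and the register-list move is costed
   as four 4-byte instructions, giving a length of 16.  */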
/* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
   alignment of a vector to 128 bits.  SVE predicates have an alignment of
   16 bits.  */
static HOST_WIDE_INT
aarch64_simd_vector_alignment (const_tree type)
{
  /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
     be set for non-predicate vectors of booleans.  Modes are the most
     direct way we have of identifying real SVE predicate types.  */
  if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
    return 16;
  widest_int min_size
    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
  return wi::umin (min_size, 128).to_uhwi ();
}
/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
static poly_uint64
aarch64_vectorize_preferred_vector_alignment (const_tree type)
{
  if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
    {
      /* If the length of the vector is a fixed power of 2, try to align
         to that length, otherwise don't try to align at all.  */
      HOST_WIDE_INT result;
      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
          || !pow2p_hwi (result))
        result = TYPE_ALIGN (TREE_TYPE (type));
      return result;
    }
  return TYPE_ALIGN (type);
}
/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
static bool
aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
{
  if (is_packed)
    return false;

  /* For fixed-length vectors, check that the vectorizer will aim for
     full-vector alignment.  This isn't true for generic GCC vectors
     that are wider than the ABI maximum of 128 bits.  */
  poly_uint64 preferred_alignment =
    aarch64_vectorize_preferred_vector_alignment (type);
  if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
                   preferred_alignment))
    return false;

  /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
  return true;
}
/* Return true if the vector misalignment factor is supported by the
   target.  */
static bool
aarch64_builtin_support_vector_misalignment (machine_mode mode,
                                             const_tree type, int misalignment,
                                             bool is_packed)
{
  if (TARGET_SIMD && STRICT_ALIGNMENT)
    {
      /* Return if movmisalign pattern is not supported for this mode.  */
      if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
        return false;

      /* Misalignment factor is unknown at compile time.  */
      if (misalignment == -1)
        return false;
    }
  return default_builtin_support_vector_misalignment (mode, type, misalignment,
                                                      is_packed);
}

/* If VALS is a vector constant that can be loaded into a register
   using DUP, generate instructions to do so and return an RTX to
   assign to the register.  Otherwise return NULL_RTX.  */
static rtx
aarch64_simd_dup_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  rtx x;

  if (!const_vec_duplicate_p (vals, &x))
    return NULL_RTX;

  /* We can load this constant by using DUP and a constant in a
     single ARM register.  This will be cheaper than a vector
     load.  */
  x = force_reg (inner_mode, x);
  return gen_vec_duplicate (mode, x);
}

/* Generate code to load VALS, which is a PARALLEL containing only
   constants (for vec_init) or CONST_VECTOR, efficiently into a
   register.  Returns an RTX to copy into the register, or NULL_RTX
   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
static rtx
aarch64_simd_make_constant (rtx vals)
{
  machine_mode mode = GET_MODE (vals);
  rtx const_dup;
  rtx const_vec = NULL_RTX;
  int n_const = 0;
  int i;

  if (CONST_VECTOR_P (vals))
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
         CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
         Only store valid constants in a CONST_VECTOR.  */
      int n_elts = XVECLEN (vals, 0);
      for (i = 0; i < n_elts; ++i)
        {
          rtx x = XVECEXP (vals, 0, i);
          if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
            n_const++;
        }
      if (n_const == n_elts)
        const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;
  else if (const_vec != NULL_RTX)
    /* Load from constant pool.  We cannot take advantage of single-cycle
       LD1 because we need a PC-relative addressing mode.  */
    return const_vec;
  else
    /* A PARALLEL containing something not valid inside CONST_VECTOR.
       We cannot construct an initializer.  */
    return NULL_RTX;
}
23868 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23869 The caller has already tried a divide-and-conquer approach, so do
23870 not consider that case here. */
23873 aarch64_expand_vector_init_fallback (rtx target
, rtx vals
)
23875 machine_mode mode
= GET_MODE (target
);
23876 scalar_mode inner_mode
= GET_MODE_INNER (mode
);
23877 /* The number of vector elements. */
23878 int n_elts
= XVECLEN (vals
, 0);
23879 /* The number of vector elements which are not constant. */
23881 rtx any_const
= NULL_RTX
;
23882 /* The first element of vals. */
23883 rtx v0
= XVECEXP (vals
, 0, 0);
23884 bool all_same
= true;
23886 /* This is a special vec_init<M><N> where N is not an element mode but a
23887 vector mode with half the elements of M. We expect to find two entries
   of mode N in VALS and we must put their concatenation into TARGET.  */
23889 if (XVECLEN (vals
, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals
, 0, 0))))
23891 machine_mode narrow_mode
= GET_MODE (XVECEXP (vals
, 0, 0));
23892 gcc_assert (GET_MODE_INNER (narrow_mode
) == inner_mode
23893 && known_eq (GET_MODE_SIZE (mode
),
23894 2 * GET_MODE_SIZE (narrow_mode
)));
23895 emit_insn (gen_aarch64_vec_concat (narrow_mode
, target
,
23896 XVECEXP (vals
, 0, 0),
23897 XVECEXP (vals
, 0, 1)));
23901 /* Count the number of variable elements to initialise. */
23902 for (int i
= 0; i
< n_elts
; ++i
)
23904 rtx x
= XVECEXP (vals
, 0, i
);
23905 if (!(CONST_INT_P (x
) || CONST_DOUBLE_P (x
)))
23910 all_same
&= rtx_equal_p (x
, v0
);
23913 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23914 how best to handle this. */
23917 rtx constant
= aarch64_simd_make_constant (vals
);
23918 if (constant
!= NULL_RTX
)
23920 emit_move_insn (target
, constant
);
23925 /* Splat a single non-constant element if we can. */
23928 rtx x
= force_reg (inner_mode
, v0
);
23929 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
23933 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
23934 gcc_assert (icode
!= CODE_FOR_nothing
);
23936 /* If there are only variable elements, try to optimize
23937 the insertion using dup for the most common element
23938 followed by insertions. */
23940 /* The algorithm will fill matches[*][0] with the earliest matching element,
23941 and matches[X][1] with the count of duplicate elements (if X is the
23942 earliest element which has duplicates). */
23944 if (n_var
>= n_elts
- 1 && n_elts
<= 16)
23946 int matches
[16][2] = {0};
23947 for (int i
= 0; i
< n_elts
; i
++)
23949 for (int j
= 0; j
<= i
; j
++)
23951 if (rtx_equal_p (XVECEXP (vals
, 0, i
), XVECEXP (vals
, 0, j
)))
23959 int maxelement
= 0;
23961 rtx const_elem
= NULL_RTX
;
23962 int const_elem_pos
= 0;
23964 for (int i
= 0; i
< n_elts
; i
++)
23966 if (matches
[i
][1] > maxv
)
23969 maxv
= matches
[i
][1];
23971 if (CONST_INT_P (XVECEXP (vals
, 0, i
))
23972 || CONST_DOUBLE_P (XVECEXP (vals
, 0, i
)))
23974 const_elem_pos
= i
;
23975 const_elem
= XVECEXP (vals
, 0, i
);
23979 /* Create a duplicate of the most common element, unless all elements
23980 are equally useless to us, in which case just immediately set the
23981 vector register using the first element. */
23985 /* For vectors of two 64-bit elements, we can do even better. */
23987 && (inner_mode
== E_DImode
23988 || inner_mode
== E_DFmode
))
23991 rtx x0
= XVECEXP (vals
, 0, 0);
23992 rtx x1
= XVECEXP (vals
, 0, 1);
23993 /* Combine can pick up this case, but handling it directly
23994 here leaves clearer RTL.
23996 This is load_pair_lanes<mode>, and also gives us a clean-up
23997 for store_pair_lanes<mode>. */
23998 if (memory_operand (x0
, inner_mode
)
23999 && memory_operand (x1
, inner_mode
)
24000 && aarch64_mergeable_load_pair_p (mode
, x0
, x1
))
24003 if (inner_mode
== DFmode
)
24004 t
= gen_load_pair_lanesdf (target
, x0
, x1
);
24006 t
= gen_load_pair_lanesdi (target
, x0
, x1
);
24011 /* The subreg-move sequence below will move into lane zero of the
24012 vector register. For big-endian we want that position to hold
24013 the last element of VALS. */
24014 maxelement
= BYTES_BIG_ENDIAN
? n_elts
- 1 : 0;
24016 /* If we have a single constant element, use that for duplicating
24020 maxelement
= const_elem_pos
;
24021 aarch64_emit_move (target
, gen_vec_duplicate (mode
, const_elem
));
24025 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24026 aarch64_emit_move (target
, lowpart_subreg (mode
, x
, inner_mode
));
24031 rtx x
= force_reg (inner_mode
, XVECEXP (vals
, 0, maxelement
));
24032 aarch64_emit_move (target
, gen_vec_duplicate (mode
, x
));
24035 /* Insert the rest. */
24036 for (int i
= 0; i
< n_elts
; i
++)
24038 rtx x
= XVECEXP (vals
, 0, i
);
24039 if (matches
[i
][0] == maxelement
)
24041 x
= force_reg (inner_mode
, x
);
24042 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
24047 /* Initialise a vector which is part-variable. We want to first try
24048 to build those lanes which are constant in the most efficient way we
24050 if (n_var
!= n_elts
)
24052 rtx copy
= copy_rtx (vals
);
24054 /* Load constant part of vector. We really don't care what goes into the
24055 parts we will overwrite, but we're more likely to be able to load the
24056 constant efficiently if it has fewer, larger, repeating parts
24057 (see aarch64_simd_valid_immediate). */
24058 for (int i
= 0; i
< n_elts
; i
++)
24060 rtx x
= XVECEXP (vals
, 0, i
);
24061 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24063 rtx subst
= any_const
;
24064 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
24066 /* Look in the copied vector, as more elements are const. */
24067 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
24068 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
24074 XVECEXP (copy
, 0, i
) = subst
;
24076 aarch64_expand_vector_init_fallback (target
, copy
);
24079 /* Insert the variable lanes directly. */
24080 for (int i
= 0; i
< n_elts
; i
++)
24082 rtx x
= XVECEXP (vals
, 0, i
);
24083 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
24085 x
= force_reg (inner_mode
, x
);
24086 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
/* Return even or odd half of VALS depending on EVEN_P.  */
static rtx
aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
{
  int n = XVECLEN (vals, 0);
  machine_mode new_mode
    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
                                   GET_MODE_BITSIZE (mode).to_constant () / 2);
  rtvec vec = rtvec_alloc (n / 2);
  for (int i = 0; i < n / 2; i++)
    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
                                  : XVECEXP (vals, 0, 2 * i + 1);
  return gen_rtx_PARALLEL (new_mode, vec);
}
/* Return true if SET is a scalar move.  */
static bool
scalar_move_insn_p (rtx set)
{
  rtx src = SET_SRC (set);
  rtx dest = SET_DEST (set);
  return (is_a <scalar_mode> (GET_MODE (dest))
          && aarch64_mov_operand (src, GET_MODE (dest)));
}
24117 /* Similar to seq_cost, but ignore cost for scalar moves. */
24120 seq_cost_ignoring_scalar_moves (const rtx_insn
*seq
, bool speed
)
24124 for (; seq
; seq
= NEXT_INSN (seq
))
24125 if (NONDEBUG_INSN_P (seq
))
24127 if (rtx set
= single_set (seq
))
24129 if (!scalar_move_insn_p (set
))
24130 cost
+= set_rtx_cost (set
, speed
);
24134 int this_cost
= insn_cost (CONST_CAST_RTX_INSN (seq
), speed
);
24145 /* Expand a vector initialization sequence, such that TARGET is
24146 initialized to contain VALS. */
24149 aarch64_expand_vector_init (rtx target
, rtx vals
)
24151 /* Try decomposing the initializer into even and odd halves and
24152 then ZIP them together. Use the resulting sequence if it is
24153 strictly cheaper than loading VALS directly.
24155 Prefer the fallback sequence in the event of a tie, since it
24156 will tend to use fewer registers. */
24158 machine_mode mode
= GET_MODE (target
);
24159 int n_elts
= XVECLEN (vals
, 0);
24162 || maybe_ne (GET_MODE_BITSIZE (mode
), 128))
24164 aarch64_expand_vector_init_fallback (target
, vals
);
24171 for (int i
= 0; i
< 2; i
++)
24174 rtx new_vals
= aarch64_unzip_vector_init (mode
, vals
, i
== 0);
24175 rtx tmp_reg
= gen_reg_rtx (GET_MODE (new_vals
));
24176 aarch64_expand_vector_init (tmp_reg
, new_vals
);
24177 halves
[i
] = gen_rtx_SUBREG (mode
, tmp_reg
, 0);
24178 rtx_insn
*rec_seq
= get_insns ();
24180 costs
[i
] = seq_cost_ignoring_scalar_moves (rec_seq
, !optimize_size
);
24181 emit_insn (rec_seq
);
24184 rtvec v
= gen_rtvec (2, halves
[0], halves
[1]);
24185 rtx_insn
*zip1_insn
24186 = emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24187 unsigned seq_total_cost
24188 = (!optimize_size
) ? std::max (costs
[0], costs
[1]) : costs
[0] + costs
[1];
24189 seq_total_cost
+= insn_cost (zip1_insn
, !optimize_size
);
24191 rtx_insn
*seq
= get_insns ();
24195 aarch64_expand_vector_init_fallback (target
, vals
);
24196 rtx_insn
*fallback_seq
= get_insns ();
24197 unsigned fallback_seq_cost
24198 = seq_cost_ignoring_scalar_moves (fallback_seq
, !optimize_size
);
24201 emit_insn (seq_total_cost
< fallback_seq_cost
? seq
: fallback_seq
);
/* Emit RTL corresponding to:
   insr TARGET, ELEM.  */
static void
emit_insr (rtx target, rtx elem)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);
  elem = force_reg (elem_mode, elem);

  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);
  emit_insn (GEN_FCN (icode) (target, target, elem));
}
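/* Illustrative note: the INSR instruction shifts the vector up by one
   element and writes ELEM into lane 0, so calling emit_insr repeatedly
   with elements in reverse order builds a vector from its last element
   backwards, which is how the SVE vector-init helpers below use it.  */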
24219 /* Subroutine of aarch64_sve_expand_vector_init for handling
24220 trailing constants.
24221 This function works as follows:
24222 (a) Create a new vector consisting of trailing constants.
24223 (b) Initialize TARGET with the constant vector using emit_move_insn.
24224 (c) Insert remaining elements in TARGET using insr.
24225 NELTS is the total number of elements in original vector while
24226 while NELTS_REQD is the number of elements that are actually
24229 ??? The heuristic used is to do above only if number of constants
24230 is at least half the total number of elements. May need fine tuning. */
24233 aarch64_sve_expand_vector_init_handle_trailing_constants
24234 (rtx target
, const rtx_vector_builder
&builder
, int nelts
, int nelts_reqd
)
24236 machine_mode mode
= GET_MODE (target
);
24237 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24238 int n_trailing_constants
= 0;
24240 for (int i
= nelts_reqd
- 1;
24241 i
>= 0 && valid_for_const_vector_p (elem_mode
, builder
.elt (i
));
24243 n_trailing_constants
++;
24245 if (n_trailing_constants
>= nelts_reqd
/ 2)
24247 /* Try to use the natural pattern of BUILDER to extend the trailing
24248 constant elements to a full vector. Replace any variables in the
24249 extra elements with zeros.
24251 ??? It would be better if the builders supported "don't care"
24252 elements, with the builder filling in whichever elements
24253 give the most compact encoding. */
24254 rtx_vector_builder
v (mode
, nelts
, 1);
24255 for (int i
= 0; i
< nelts
; i
++)
24257 rtx x
= builder
.elt (i
+ nelts_reqd
- n_trailing_constants
);
24258 if (!valid_for_const_vector_p (elem_mode
, x
))
24259 x
= CONST0_RTX (elem_mode
);
24262 rtx const_vec
= v
.build ();
24263 emit_move_insn (target
, const_vec
);
24265 for (int i
= nelts_reqd
- n_trailing_constants
- 1; i
>= 0; i
--)
24266 emit_insr (target
, builder
.elt (i
));
24274 /* Subroutine of aarch64_sve_expand_vector_init.
24276 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
24277 (b) Skip trailing elements from BUILDER, which are the same as
24278 element NELTS_REQD - 1.
24279 (c) Insert earlier elements in reverse order in TARGET using insr. */
24282 aarch64_sve_expand_vector_init_insert_elems (rtx target
,
24283 const rtx_vector_builder
&builder
,
24286 machine_mode mode
= GET_MODE (target
);
24287 scalar_mode elem_mode
= GET_MODE_INNER (mode
);
24289 struct expand_operand ops
[2];
24290 enum insn_code icode
= optab_handler (vec_duplicate_optab
, mode
);
24291 gcc_assert (icode
!= CODE_FOR_nothing
);
24293 create_output_operand (&ops
[0], target
, mode
);
24294 create_input_operand (&ops
[1], builder
.elt (nelts_reqd
- 1), elem_mode
);
24295 expand_insn (icode
, 2, ops
);
24297 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24298 for (int i
= nelts_reqd
- ndups
- 1; i
>= 0; i
--)
24299 emit_insr (target
, builder
.elt (i
));
24302 /* Subroutine of aarch64_sve_expand_vector_init to handle case
24303 when all trailing elements of builder are same.
24304 This works as follows:
24305 (a) Use expand_insn interface to broadcast last vector element in TARGET.
24306 (b) Insert remaining elements in TARGET using insr.
24308 ??? The heuristic used is to do above if number of same trailing elements
24309 is at least 3/4 of total number of elements, loosely based on
24310 heuristic from mostly_zeros_p. May need fine-tuning. */
24313 aarch64_sve_expand_vector_init_handle_trailing_same_elem
24314 (rtx target
, const rtx_vector_builder
&builder
, int nelts_reqd
)
24316 int ndups
= builder
.count_dups (nelts_reqd
- 1, -1, -1);
24317 if (ndups
>= (3 * nelts_reqd
) / 4)
24319 aarch64_sve_expand_vector_init_insert_elems (target
, builder
,
24320 nelts_reqd
- ndups
+ 1);
24327 /* Initialize register TARGET from BUILDER. NELTS is the constant number
24328 of elements in BUILDER.
24330 The function tries to initialize TARGET from BUILDER if it fits one
24331 of the special cases outlined below.
24333 Failing that, the function divides BUILDER into two sub-vectors:
24334 v_even = even elements of BUILDER;
24335 v_odd = odd elements of BUILDER;
24337 and recursively calls itself with v_even and v_odd.
24339 if (recursive call succeeded for v_even or v_odd)
24340 TARGET = zip (v_even, v_odd)
24342 The function returns true if it managed to build TARGET from BUILDER
24343 with one of the special cases, false otherwise.
24345 Example: {a, 1, b, 2, c, 3, d, 4}
24347 The vector gets divided into:
24348 v_even = {a, b, c, d}
24349 v_odd = {1, 2, 3, 4}
24351 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
24352 initialize tmp2 from constant vector v_odd using emit_move_insn.
24354 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
24355 4 elements, so we construct tmp1 from v_even using insr:
24362 TARGET = zip (tmp1, tmp2)
24363 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
24366 aarch64_sve_expand_vector_init (rtx target
, const rtx_vector_builder
&builder
,
24367 int nelts
, int nelts_reqd
)
24369 machine_mode mode
= GET_MODE (target
);
24371 /* Case 1: Vector contains trailing constants. */
24373 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24374 (target
, builder
, nelts
, nelts_reqd
))
24377 /* Case 2: Vector contains leading constants. */
24379 rtx_vector_builder
rev_builder (mode
, nelts_reqd
, 1);
24380 for (int i
= 0; i
< nelts_reqd
; i
++)
24381 rev_builder
.quick_push (builder
.elt (nelts_reqd
- i
- 1));
24382 rev_builder
.finalize ();
24384 if (aarch64_sve_expand_vector_init_handle_trailing_constants
24385 (target
, rev_builder
, nelts
, nelts_reqd
))
24387 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24391 /* Case 3: Vector contains trailing same element. */
24393 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24394 (target
, builder
, nelts_reqd
))
24397 /* Case 4: Vector contains leading same element. */
24399 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
24400 (target
, rev_builder
, nelts_reqd
) && nelts_reqd
== nelts
)
24402 emit_insn (gen_aarch64_sve_rev (mode
, target
, target
));
24406 /* Avoid recursing below 4-elements.
24407 ??? The threshold 4 may need fine-tuning. */
24409 if (nelts_reqd
<= 4)
24412 rtx_vector_builder
v_even (mode
, nelts
, 1);
24413 rtx_vector_builder
v_odd (mode
, nelts
, 1);
24415 for (int i
= 0; i
< nelts
* 2; i
+= 2)
24417 v_even
.quick_push (builder
.elt (i
));
24418 v_odd
.quick_push (builder
.elt (i
+ 1));
24421 v_even
.finalize ();
24424 rtx tmp1
= gen_reg_rtx (mode
);
24425 bool did_even_p
= aarch64_sve_expand_vector_init (tmp1
, v_even
,
24426 nelts
, nelts_reqd
/ 2);
24428 rtx tmp2
= gen_reg_rtx (mode
);
24429 bool did_odd_p
= aarch64_sve_expand_vector_init (tmp2
, v_odd
,
24430 nelts
, nelts_reqd
/ 2);
24432 if (!did_even_p
&& !did_odd_p
)
24435 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24436 special cases and zip v_even, v_odd. */
24439 aarch64_sve_expand_vector_init_insert_elems (tmp1
, v_even
, nelts_reqd
/ 2);
24442 aarch64_sve_expand_vector_init_insert_elems (tmp2
, v_odd
, nelts_reqd
/ 2);
24444 rtvec v
= gen_rtvec (2, tmp1
, tmp2
);
24445 emit_set_insn (target
, gen_rtx_UNSPEC (mode
, v
, UNSPEC_ZIP1
));
24449 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24452 aarch64_sve_expand_vector_init (rtx target
, rtx vals
)
24454 machine_mode mode
= GET_MODE (target
);
24455 int nelts
= XVECLEN (vals
, 0);
24457 rtx_vector_builder
v (mode
, nelts
, 1);
24458 for (int i
= 0; i
< nelts
; i
++)
24459 v
.quick_push (XVECEXP (vals
, 0, i
));
24462 /* If neither sub-vectors of v could be initialized specially,
24463 then use INSR to insert all elements from v into TARGET.
24464 ??? This might not be optimal for vectors with large
24465 initializers like 16-element or above.
24466 For nelts < 4, it probably isn't useful to handle specially. */
24469 || !aarch64_sve_expand_vector_init (target
, v
, nelts
, nelts
))
24470 aarch64_sve_expand_vector_init_insert_elems (target
, v
, nelts
);
24473 /* Check whether VALUE is a vector constant in which every element
24474 is either a power of 2 or a negated power of 2. If so, return
24475 a constant vector of log2s, and flip CODE between PLUS and MINUS
24476 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24479 aarch64_convert_mult_to_shift (rtx value
, rtx_code
&code
)
24481 if (!CONST_VECTOR_P (value
))
24484 rtx_vector_builder builder
;
24485 if (!builder
.new_unary_operation (GET_MODE (value
), value
, false))
24488 scalar_mode int_mode
= GET_MODE_INNER (GET_MODE (value
));
24489 /* 1 if the result of the multiplication must be negated,
24490 0 if it mustn't, or -1 if we don't yet care. */
24492 unsigned int encoded_nelts
= const_vector_encoded_nelts (value
);
24493 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
24495 rtx elt
= CONST_VECTOR_ENCODED_ELT (value
, i
);
24496 if (!CONST_SCALAR_INT_P (elt
))
24498 rtx_mode_t
val (elt
, int_mode
);
24499 wide_int pow2
= wi::neg (val
);
24502 /* It matters whether we negate or not. Make that choice,
24503 and make sure that it's consistent with previous elements. */
24504 if (negate
== !wi::neg_p (val
))
24506 negate
= wi::neg_p (val
);
24510 /* POW2 is now the value that we want to be a power of 2. */
24511 int shift
= wi::exact_log2 (pow2
);
24514 builder
.quick_push (gen_int_mode (shift
, int_mode
));
24517 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24519 else if (negate
== 1)
24520 code
= code
== PLUS
? MINUS
: PLUS
;
24521 return builder
.build ();
24524 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24525 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24526 operands array, in the same order as for fma_optab. Return true if
24527 the function emitted all the necessary instructions, false if the caller
24528 should generate the pattern normally with the new OPERANDS array. */
24531 aarch64_prepare_sve_int_fma (rtx
*operands
, rtx_code code
)
24533 machine_mode mode
= GET_MODE (operands
[0]);
24534 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[2], code
))
24536 rtx product
= expand_binop (mode
, vashl_optab
, operands
[1], shifts
,
24537 NULL_RTX
, true, OPTAB_DIRECT
);
24538 force_expand_binop (mode
, code
== PLUS
? add_optab
: sub_optab
,
24539 operands
[3], product
, operands
[0], true,
24543 operands
[2] = force_reg (mode
, operands
[2]);
24547 /* Likewise, but for a conditional pattern. */
24550 aarch64_prepare_sve_cond_int_fma (rtx
*operands
, rtx_code code
)
24552 machine_mode mode
= GET_MODE (operands
[0]);
24553 if (rtx shifts
= aarch64_convert_mult_to_shift (operands
[3], code
))
24555 rtx product
= expand_binop (mode
, vashl_optab
, operands
[2], shifts
,
24556 NULL_RTX
, true, OPTAB_DIRECT
);
24557 emit_insn (gen_cond (code
, mode
, operands
[0], operands
[1],
24558 operands
[4], product
, operands
[5]));
24561 operands
[3] = force_reg (mode
, operands
[3]);
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
  if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
    return 0;
  return GET_MODE_UNIT_BITSIZE (mode) - 1;
}
24573 /* Select a format to encode pointers in exception handling data. */
24575 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
24578 switch (aarch64_cmodel
)
24580 case AARCH64_CMODEL_TINY
:
24581 case AARCH64_CMODEL_TINY_PIC
:
24582 case AARCH64_CMODEL_SMALL
:
24583 case AARCH64_CMODEL_SMALL_PIC
:
24584 case AARCH64_CMODEL_SMALL_SPIC
:
24585 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24587 type
= DW_EH_PE_sdata4
;
24590 /* No assumptions here. 8-byte relocs required. */
24591 type
= DW_EH_PE_sdata8
;
24594 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
/* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
static void
aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
{
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
      if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
        {
          fprintf (stream, "\t.variant_pcs\t");
          assemble_name (stream, name);
          fprintf (stream, "\n");
        }
    }
}

/* The last .arch and .tune assembly strings that we printed.  */
static std::string aarch64_last_printed_arch_string;
static std::string aarch64_last_printed_tune_string;
24618 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24619 by the function fndecl. */
24622 aarch64_declare_function_name (FILE *stream
, const char* name
,
24625 tree target_parts
= DECL_FUNCTION_SPECIFIC_TARGET (fndecl
);
24627 struct cl_target_option
*targ_options
;
24629 targ_options
= TREE_TARGET_OPTION (target_parts
);
24631 targ_options
= TREE_TARGET_OPTION (target_option_current_node
);
24632 gcc_assert (targ_options
);
24634 const struct processor
*this_arch
24635 = aarch64_get_arch (targ_options
->x_selected_arch
);
24637 auto isa_flags
= aarch64_get_asm_isa_flags (targ_options
);
24638 std::string extension
24639 = aarch64_get_extension_string_for_isa_flags (isa_flags
,
24641 /* Only update the assembler .arch string if it is distinct from the last
24642 such string we printed. */
24643 std::string to_print
= this_arch
->name
+ extension
;
24644 if (to_print
!= aarch64_last_printed_arch_string
)
24646 asm_fprintf (asm_out_file
, "\t.arch %s\n", to_print
.c_str ());
24647 aarch64_last_printed_arch_string
= to_print
;
24650 /* Print the cpu name we're tuning for in the comments, might be
24651 useful to readers of the generated asm. Do it only when it changes
24652 from function to function and verbose assembly is requested. */
24653 const struct processor
*this_tune
24654 = aarch64_get_tune_cpu (targ_options
->x_selected_tune
);
24656 if (flag_debug_asm
&& aarch64_last_printed_tune_string
!= this_tune
->name
)
24658 asm_fprintf (asm_out_file
, "\t" ASM_COMMENT_START
".tune %s\n",
24660 aarch64_last_printed_tune_string
= this_tune
->name
;
24663 aarch64_asm_output_variant_pcs (stream
, fndecl
, name
);
24665 /* Don't forget the type directive for ELF. */
24666 ASM_OUTPUT_TYPE_DIRECTIVE (stream
, name
, "function");
24667 ASM_OUTPUT_FUNCTION_LABEL (stream
, name
, fndecl
);
24669 cfun
->machine
->label_is_assembled
= true;
24672 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24675 aarch64_print_patchable_function_entry (FILE *file
,
24676 unsigned HOST_WIDE_INT patch_area_size
,
24679 if (!cfun
->machine
->label_is_assembled
)
24681 /* Emit the patching area before the entry label, if any. */
24682 default_print_patchable_function_entry (file
, patch_area_size
,
24687 rtx pa
= gen_patchable_area (GEN_INT (patch_area_size
),
24688 GEN_INT (record_p
));
24689 basic_block bb
= ENTRY_BLOCK_PTR_FOR_FN (cfun
)->next_bb
;
24691 if (!aarch_bti_enabled ()
24692 || cgraph_node::get (cfun
->decl
)->only_called_directly_p ())
24694 /* Emit the patchable_area at the beginning of the function. */
24695 rtx_insn
*insn
= emit_insn_before (pa
, BB_HEAD (bb
));
24696 INSN_ADDRESSES_NEW (insn
, -1);
24700 rtx_insn
*insn
= next_real_nondebug_insn (get_insns ());
24703 || GET_CODE (PATTERN (insn
)) != UNSPEC_VOLATILE
24704 || XINT (PATTERN (insn
), 1) != UNSPECV_BTI_C
)
24706 /* Emit a BTI_C. */
24707 insn
= emit_insn_before (gen_bti_c (), BB_HEAD (bb
));
24710 /* Emit the patchable_area after BTI_C. */
24711 insn
= emit_insn_after (pa
, insn
);
24712 INSN_ADDRESSES_NEW (insn
, -1);
24715 /* Output patchable area. */
24718 aarch64_output_patchable_area (unsigned int patch_area_size
, bool record_p
)
24720 default_print_patchable_function_entry (asm_out_file
, patch_area_size
,
24724 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24727 aarch64_asm_output_alias (FILE *stream
, const tree decl
, const tree target
)
24729 const char *name
= XSTR (XEXP (DECL_RTL (decl
), 0), 0);
24730 const char *value
= IDENTIFIER_POINTER (target
);
24731 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24732 ASM_OUTPUT_DEF (stream
, name
, value
);
24735 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24736 function symbol references. */
24739 aarch64_asm_output_external (FILE *stream
, tree decl
, const char* name
)
24741 default_elf_asm_output_external (stream
, decl
, name
);
24742 aarch64_asm_output_variant_pcs (stream
, decl
, name
);
24745 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24746 Used to output the .cfi_b_key_frame directive when signing the current
24747 function with the B key. */
24750 aarch64_post_cfi_startproc (FILE *f
, tree ignored ATTRIBUTE_UNUSED
)
24752 if (cfun
->machine
->frame
.laid_out
&& aarch64_return_address_signing_enabled ()
24753 && aarch64_ra_sign_key
== AARCH64_KEY_B
)
24754 asm_fprintf (f
, "\t.cfi_b_key_frame\n");
24757 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24760 aarch64_start_file (void)
24762 struct cl_target_option
*default_options
24763 = TREE_TARGET_OPTION (target_option_default_node
);
24765 const struct processor
*default_arch
24766 = aarch64_get_arch (default_options
->x_selected_arch
);
24767 auto default_isa_flags
= aarch64_get_asm_isa_flags (default_options
);
24768 std::string extension
24769 = aarch64_get_extension_string_for_isa_flags (default_isa_flags
,
24770 default_arch
->flags
);
24772 aarch64_last_printed_arch_string
= default_arch
->name
+ extension
;
24773 aarch64_last_printed_tune_string
= "";
24774 asm_fprintf (asm_out_file
, "\t.arch %s\n",
24775 aarch64_last_printed_arch_string
.c_str ());
24777 default_file_start ();
24780 /* Emit load exclusive. */
24783 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
24784 rtx mem
, rtx model_rtx
)
24786 if (mode
== TImode
)
24787 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode
, rval
),
24788 gen_highpart (DImode
, rval
),
24791 emit_insn (gen_aarch64_load_exclusive (mode
, rval
, mem
, model_rtx
));
24794 /* Emit store exclusive. */
24797 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
24798 rtx mem
, rtx rval
, rtx model_rtx
)
24800 if (mode
== TImode
)
24801 emit_insn (gen_aarch64_store_exclusive_pair
24802 (bval
, mem
, operand_subword (rval
, 0, 0, TImode
),
24803 operand_subword (rval
, 1, 0, TImode
), model_rtx
));
24805 emit_insn (gen_aarch64_store_exclusive (mode
, bval
, mem
, rval
, model_rtx
));
/* Mark the previous jump instruction as unlikely.  */
static void
aarch64_emit_unlikely_jump (rtx insn)
{
  rtx_insn *jump = emit_jump_insn (insn);
  add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
}
24817 /* We store the names of the various atomic helpers in a 5x5 array.
24818 Return the libcall function given MODE, MODEL and NAMES. */
24821 aarch64_atomic_ool_func(machine_mode mode
, rtx model_rtx
,
24822 const atomic_ool_names
*names
)
24824 memmodel model
= memmodel_from_int (INTVAL (model_rtx
));
24825 int mode_idx
, model_idx
;
24845 gcc_unreachable ();
24850 case MEMMODEL_RELAXED
:
24853 case MEMMODEL_CONSUME
:
24854 case MEMMODEL_ACQUIRE
:
24857 case MEMMODEL_RELEASE
:
24860 case MEMMODEL_ACQ_REL
:
24861 case MEMMODEL_SEQ_CST
:
24864 case MEMMODEL_SYNC_ACQUIRE
:
24865 case MEMMODEL_SYNC_RELEASE
:
24866 case MEMMODEL_SYNC_SEQ_CST
:
24870 gcc_unreachable ();
24873 return init_one_libfunc_visibility (names
->str
[mode_idx
][model_idx
],
24874 VISIBILITY_HIDDEN
);
24877 #define DEF0(B, N) \
24878 { "__aarch64_" #B #N "_relax", \
24879 "__aarch64_" #B #N "_acq", \
24880 "__aarch64_" #B #N "_rel", \
24881 "__aarch64_" #B #N "_acq_rel", \
24882 "__aarch64_" #B #N "_sync" }
24884 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24885 { NULL, NULL, NULL, NULL }
24886 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24888 static const atomic_ool_names aarch64_ool_cas_names
= { { DEF5(cas
) } };
24889 const atomic_ool_names aarch64_ool_swp_names
= { { DEF4(swp
) } };
24890 const atomic_ool_names aarch64_ool_ldadd_names
= { { DEF4(ldadd
) } };
24891 const atomic_ool_names aarch64_ool_ldset_names
= { { DEF4(ldset
) } };
24892 const atomic_ool_names aarch64_ool_ldclr_names
= { { DEF4(ldclr
) } };
24893 const atomic_ool_names aarch64_ool_ldeor_names
= { { DEF4(ldeor
) } };
24899 /* Expand a compare and swap pattern. */
24902 aarch64_expand_compare_and_swap (rtx operands
[])
24904 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
, cc_reg
;
24905 machine_mode mode
, r_mode
;
24907 bval
= operands
[0];
24908 rval
= operands
[1];
24910 oldval
= operands
[3];
24911 newval
= operands
[4];
24912 is_weak
= operands
[5];
24913 mod_s
= operands
[6];
24914 mod_f
= operands
[7];
24915 mode
= GET_MODE (mem
);
24917 /* Normally the succ memory model must be stronger than fail, but in the
24918 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24919 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24920 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f
)))
24921 && is_mm_release (memmodel_from_int (INTVAL (mod_s
))))
24922 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
24925 if (mode
== QImode
|| mode
== HImode
)
24928 rval
= gen_reg_rtx (r_mode
);
24933 /* The CAS insn requires oldval and rval overlap, but we need to
24934 have a copy of oldval saved across the operation to tell if
24935 the operation is successful. */
24936 if (reg_overlap_mentioned_p (rval
, oldval
))
24937 rval
= copy_to_mode_reg (r_mode
, oldval
);
24939 emit_move_insn (rval
, gen_lowpart (r_mode
, oldval
));
24940 if (mode
== TImode
)
24941 newval
= force_reg (mode
, newval
);
24943 emit_insn (gen_aarch64_compare_and_swap_lse (mode
, rval
, mem
,
24945 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24947 else if (TARGET_OUTLINE_ATOMICS
)
24949 /* Oldval must satisfy compare afterward. */
24950 if (!aarch64_plus_operand (oldval
, mode
))
24951 oldval
= force_reg (mode
, oldval
);
24952 rtx func
= aarch64_atomic_ool_func (mode
, mod_s
, &aarch64_ool_cas_names
);
24953 rval
= emit_library_call_value (func
, NULL_RTX
, LCT_NORMAL
, r_mode
,
24954 oldval
, mode
, newval
, mode
,
24955 XEXP (mem
, 0), Pmode
);
24956 cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
24960 /* The oldval predicate varies by mode. Test it and force to reg. */
24961 insn_code code
= code_for_aarch64_compare_and_swap (mode
);
24962 if (!insn_data
[code
].operand
[2].predicate (oldval
, mode
))
24963 oldval
= force_reg (mode
, oldval
);
24965 emit_insn (GEN_FCN (code
) (rval
, mem
, oldval
, newval
,
24966 is_weak
, mod_s
, mod_f
));
24967 cc_reg
= gen_rtx_REG (CCmode
, CC_REGNUM
);
24970 if (r_mode
!= mode
)
24971 rval
= gen_lowpart (mode
, rval
);
24972 emit_move_insn (operands
[1], rval
);
24974 x
= gen_rtx_EQ (SImode
, cc_reg
, const0_rtx
);
24975 emit_insn (gen_rtx_SET (bval
, x
));
24978 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
24979 sequence implementing an atomic operation. */
24982 aarch64_emit_post_barrier (enum memmodel model
)
24984 const enum memmodel base_model
= memmodel_base (model
);
24986 if (is_mm_sync (model
)
24987 && (base_model
== MEMMODEL_ACQUIRE
24988 || base_model
== MEMMODEL_ACQ_REL
24989 || base_model
== MEMMODEL_SEQ_CST
))
24991 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST
)));
24995 /* Split a compare and swap pattern. */
24998 aarch64_split_compare_and_swap (rtx operands
[])
25000 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
25001 gcc_assert (epilogue_completed
);
25003 rtx rval
, mem
, oldval
, newval
, scratch
, x
, model_rtx
;
25006 rtx_code_label
*label1
, *label2
;
25007 enum memmodel model
;
25009 rval
= operands
[0];
25011 oldval
= operands
[2];
25012 newval
= operands
[3];
25013 model_rtx
= operands
[5];
25014 scratch
= operands
[7];
25015 mode
= GET_MODE (mem
);
25016 model
= memmodel_from_int (INTVAL (model_rtx
));
25017 is_weak
= operands
[4] != const0_rtx
&& mode
!= TImode
;
25019 /* When OLDVAL is zero and we want the strong version we can emit a tighter
25022 LD[A]XR rval, [mem]
25024 ST[L]XR scratch, newval, [mem]
25025 CBNZ scratch, .label1
25028 bool strong_zero_p
= (!is_weak
&& !aarch64_track_speculation
&&
25029 oldval
== const0_rtx
&& mode
!= TImode
);
25034 label1
= gen_label_rtx ();
25035 emit_label (label1
);
25037 label2
= gen_label_rtx ();
25039 /* The initial load can be relaxed for a __sync operation since a final
25040 barrier will be emitted to stop code hoisting. */
25041 if (is_mm_sync (model
))
25042 aarch64_emit_load_exclusive (mode
, rval
, mem
, GEN_INT (MEMMODEL_RELAXED
));
25044 aarch64_emit_load_exclusive (mode
, rval
, mem
, model_rtx
);
25047 x
= gen_rtx_NE (VOIDmode
, rval
, const0_rtx
);
25050 rtx cc_reg
= aarch64_gen_compare_reg_maybe_ze (NE
, rval
, oldval
, mode
);
25051 x
= gen_rtx_NE (VOIDmode
, cc_reg
, const0_rtx
);
25053 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
25054 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
25055 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
25057 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, model_rtx
);
25061 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
25062 aarch64_emit_unlikely_jump (x
);
25065 aarch64_gen_compare_reg (NE
, scratch
, const0_rtx
);
25067 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
25068 store the returned value and loop if the STLXP fails. */
25069 if (mode
== TImode
)
25071 rtx_code_label
*label3
= gen_label_rtx ();
25072 emit_jump_insn (gen_rtx_SET (pc_rtx
, gen_rtx_LABEL_REF (Pmode
, label3
)));
25075 emit_label (label2
);
25076 aarch64_emit_store_exclusive (mode
, scratch
, mem
, rval
, model_rtx
);
25078 x
= aarch64_gen_compare_zero_and_branch (NE
, scratch
, label1
);
25079 aarch64_emit_unlikely_jump (x
);
25084 emit_label (label2
);
25086 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
25087 to set the condition flags. If this is not used it will be removed by
25090 aarch64_gen_compare_reg (NE
, rval
, const0_rtx
);
25092 /* Emit any final barrier needed for a __sync operation. */
25093 if (is_mm_sync (model
))
25094 aarch64_emit_post_barrier (model
);
/* Split an atomic operation.  */

void
aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
			 rtx value, rtx model_rtx, rtx cond)
{
  /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
  gcc_assert (epilogue_completed);

  machine_mode mode = GET_MODE (mem);
  machine_mode wmode = (mode == DImode ? DImode : SImode);
  const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
  const bool is_sync = is_mm_sync (model);
  rtx_code_label *label;
  rtx x;

  /* Split the atomic operation into a sequence.  */
  label = gen_label_rtx ();
  emit_label (label);

  if (new_out)
    new_out = gen_lowpart (wmode, new_out);
  if (old_out)
    old_out = gen_lowpart (wmode, old_out);
  else
    old_out = new_out;
  value = simplify_gen_subreg (wmode, value, mode, 0);

  /* The initial load can be relaxed for a __sync operation since a final
     barrier will be emitted to stop code hoisting.  */
  if (is_sync)
    aarch64_emit_load_exclusive (mode, old_out, mem,
				 GEN_INT (MEMMODEL_RELAXED));
  else
    aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);

  switch (code)
    {
    case SET:
      new_out = value;
      break;

    case NOT:
      x = gen_rtx_AND (wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      x = gen_rtx_NOT (wmode, new_out);
      emit_insn (gen_rtx_SET (new_out, x));
      break;

    case MINUS:
      if (CONST_INT_P (value))
	{
	  value = GEN_INT (-UINTVAL (value));
	  code = PLUS;
	}
      /* Fall through.  */

    default:
      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
      emit_insn (gen_rtx_SET (new_out, x));
      break;
    }

  aarch64_emit_store_exclusive (mode, cond, mem,
				gen_lowpart (mode, new_out), model_rtx);

  x = aarch64_gen_compare_zero_and_branch (NE, cond, label);
  aarch64_emit_unlikely_jump (x);

  /* Emit any final barrier needed for a __sync operation.  */
  if (is_sync)
    aarch64_emit_post_barrier (model);
}
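/* Illustrative note (not part of the original source): for example, a
   32-bit __atomic_fetch_add with RELAXED ordering is split into a
   load/store-exclusive loop along the lines of (register numbering is
   arbitrary):

	.L1:	ldxr	w0, [x2]	// old_out = *mem
		add	w1, w0, w3	// new_out = old_out + value
		stxr	w4, w1, [x2]	// cond = store-exclusive status
		cbnz	w4, .L1		// retry if the store failed

   SET, NOT and MINUS get the special handling shown in the switch
   statement above; every other rtx code maps directly onto the
   corresponding data-processing instruction.  */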
static void
aarch64_init_libfuncs (void)
{
  /* Half-precision float operations.  The compiler handles all operations
     with NULL libfuncs by converting to SFmode.  */

  /* Conversions.  */
  set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
  set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");

  /* Arithmetic.  */
  set_optab_libfunc (add_optab, HFmode, NULL);
  set_optab_libfunc (sdiv_optab, HFmode, NULL);
  set_optab_libfunc (smul_optab, HFmode, NULL);
  set_optab_libfunc (neg_optab, HFmode, NULL);
  set_optab_libfunc (sub_optab, HFmode, NULL);

  /* Comparisons.  */
  set_optab_libfunc (eq_optab, HFmode, NULL);
  set_optab_libfunc (ne_optab, HFmode, NULL);
  set_optab_libfunc (lt_optab, HFmode, NULL);
  set_optab_libfunc (le_optab, HFmode, NULL);
  set_optab_libfunc (ge_optab, HFmode, NULL);
  set_optab_libfunc (gt_optab, HFmode, NULL);
  set_optab_libfunc (unord_optab, HFmode, NULL);
}
/* Target hook for c_mode_for_suffix.  */
static machine_mode
aarch64_c_mode_for_suffix (char suffix)
{
  if (suffix == 'q')
    return TFmode;

  return VOIDmode;
}

/* We can only represent floating point constants which will fit in
   "quarter-precision" values.  These values are characterised by
   a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
   by:

     (-1)^s * (n/16) * 2^r

   where:
     's' is the sign bit.
     'n' is an integer in the range 16 <= n <= 31.
     'r' is an integer in the range -3 <= r <= 4.  */
/* Return true iff X can be represented by a quarter-precision
   floating point immediate operand X.  Note, we cannot represent 0.0.  */

bool
aarch64_float_const_representable_p (rtx x)
{
  /* This represents our current view of how many bits
     make up the mantissa.  */
  int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
  int exponent;
  unsigned HOST_WIDE_INT mantissa, mask;
  REAL_VALUE_TYPE r, m;
  bool fail;

  x = unwrap_const_vec_duplicate (x);
  if (!CONST_DOUBLE_P (x))
    return false;

  if (GET_MODE (x) == VOIDmode
      || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
    return false;

  r = *CONST_DOUBLE_REAL_VALUE (x);

  /* We cannot represent infinities, NaNs or +/-zero.  We won't
     know if we have +zero until we analyse the mantissa, but we
     can reject the other invalid values.  */
  if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
      || REAL_VALUE_MINUS_ZERO (r))
    return false;

  /* For BFmode, only handle 0.0.  */
  if (GET_MODE (x) == BFmode)
    return real_iszero (&r, false);

  /* Extract exponent.  */
  r = real_value_abs (&r);
  exponent = REAL_EXP (&r);

  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
     highest (sign) bit, with a fixed binary point at bit point_pos.
     m1 holds the low part of the mantissa, m2 the high part.
     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
     bits for the mantissa, this can fail (low bits will be lost).  */
  real_ldexp (&m, &r, point_pos - exponent);
  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);

  /* If the low part of the mantissa has bits set we cannot represent
     the value.  */
  if (w.ulow () != 0)
    return false;
  /* We have rejected the lower HOST_WIDE_INT, so update our
     understanding of how many bits lie in the mantissa and
     look only at the high HOST_WIDE_INT.  */
  mantissa = w.elt (1);
  point_pos -= HOST_BITS_PER_WIDE_INT;

  /* We can only represent values with a mantissa of the form 1.xxxx.  */
  mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
  if ((mantissa & mask) != 0)
    return false;

  /* Having filtered unrepresentable values, we may now remove all
     but the highest 5 bits.  */
  mantissa >>= point_pos - 5;

  /* We cannot represent the value 0.0, so reject it.  This is handled
     elsewhere.  */
  if (mantissa == 0)
    return false;

  /* Then, as bit 4 is always set, we can mask it off, leaving
     the mantissa in the range [0, 15].  */
  mantissa &= ~(1 << 4);
  gcc_assert (mantissa <= 15);

  /* GCC internally does not use IEEE754-like encoding (where normalized
     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.cc).
     Our mantissa values are shifted 4 places to the left relative to
     normalized IEEE754 so we must modify the exponent returned by REAL_EXP
     by 5 places to correct for GCC's representation.  */
  exponent = 5 - exponent;

  return (exponent >= 0 && exponent <= 7);
}
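/* Illustrative note (not part of the original source): plugging numbers
   into the (-1)^s * (n/16) * 2^r form above, the representable range is
   +/-0.125 (n = 16, r = -3) up to +/-31.0 (n = 31, r = 4).  For example
   1.0 = (16/16) * 2^0 and 0.5 = (16/16) * 2^-1 are accepted, whereas
   0.1 has no exact (n, r) pair, and 0.0 is rejected here and handled by
   the other move paths instead.  */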
25305 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
25306 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
25307 output MOVI/MVNI, ORR or BIC immediate. */
25309 aarch64_output_simd_mov_immediate (rtx const_vector
, unsigned width
,
25310 enum simd_immediate_check which
)
25313 static char templ
[40];
25314 const char *mnemonic
;
25315 const char *shift_op
;
25316 unsigned int lane_count
= 0;
25319 struct simd_immediate_info info
;
25321 /* This will return true to show const_vector is legal for use as either
25322 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
25323 It will also update INFO to show how the immediate should be generated.
25324 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
25325 is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
, which
);
25326 gcc_assert (is_valid
);
25328 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25329 lane_count
= width
/ GET_MODE_BITSIZE (info
.elt_mode
);
25331 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25333 gcc_assert (info
.insn
== simd_immediate_info::MOV
25334 && info
.u
.mov
.shift
== 0);
25335 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
25336 move immediate path. */
25337 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25338 info
.u
.mov
.value
= GEN_INT (0);
25341 const unsigned int buf_size
= 20;
25342 char float_buf
[buf_size
] = {'\0'};
25343 real_to_decimal_for_mode (float_buf
,
25344 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25345 buf_size
, buf_size
, 1, info
.elt_mode
);
25347 if (lane_count
== 1)
25348 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
25350 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
25351 lane_count
, element_char
, float_buf
);
25356 gcc_assert (CONST_INT_P (info
.u
.mov
.value
));
25358 if (which
== AARCH64_CHECK_MOV
)
25360 if (info
.insn
== simd_immediate_info::INDEX
)
25362 gcc_assert (TARGET_SVE
);
25363 snprintf (templ
, sizeof (templ
), "index\t%%Z0.%c, #"
25364 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
25365 element_char
, INTVAL (info
.u
.index
.base
),
25366 INTVAL (info
.u
.index
.step
));
25370 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "mvni" : "movi";
25371 shift_op
= (info
.u
.mov
.modifier
== simd_immediate_info::MSL
25373 if (lane_count
== 1)
25374 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
25375 mnemonic
, UINTVAL (info
.u
.mov
.value
));
25376 else if (info
.u
.mov
.shift
)
25377 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25378 HOST_WIDE_INT_PRINT_HEX
", %s %d", mnemonic
, lane_count
,
25379 element_char
, UINTVAL (info
.u
.mov
.value
), shift_op
,
25382 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, "
25383 HOST_WIDE_INT_PRINT_HEX
, mnemonic
, lane_count
,
25384 element_char
, UINTVAL (info
.u
.mov
.value
));
25388 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25389 mnemonic
= info
.insn
== simd_immediate_info::MVN
? "bic" : "orr";
25390 if (info
.u
.mov
.shift
)
25391 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25392 HOST_WIDE_INT_PRINT_DEC
", %s #%d", mnemonic
, lane_count
,
25393 element_char
, UINTVAL (info
.u
.mov
.value
), "lsl",
25396 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, #"
25397 HOST_WIDE_INT_PRINT_DEC
, mnemonic
, lane_count
,
25398 element_char
, UINTVAL (info
.u
.mov
.value
));
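/* Illustrative note (not part of the original source): typical strings
   returned by aarch64_output_simd_mov_immediate include, for example,
   "movi\tv0.4s, 0x1, lsl 8" for a V4SI splat of 256, "mvni\tv0.4s, 0x1"
   for a splat of ~1, and "fmov\tv0.4s, 1.0e+0" for a floating-point
   splat that fits the 8-bit FP immediate encoding.  The operand
   placeholders (%0, %d0, %Z0) in the stored templates are substituted
   later by the output pass.  */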
25404 aarch64_output_scalar_simd_mov_immediate (rtx immediate
, scalar_int_mode mode
)
25407 /* If a floating point number was passed and we desire to use it in an
25408 integer mode do the conversion to integer. */
25409 if (CONST_DOUBLE_P (immediate
) && GET_MODE_CLASS (mode
) == MODE_INT
)
25411 unsigned HOST_WIDE_INT ival
;
25412 if (!aarch64_reinterpret_float_as_int (immediate
, &ival
))
25413 gcc_unreachable ();
25414 immediate
= gen_int_mode (ival
, mode
);
25417 machine_mode vmode
;
25418 /* use a 64 bit mode for everything except for DI/DF/DD mode, where we use
25419 a 128 bit vector mode. */
25420 int width
= GET_MODE_BITSIZE (mode
) == 64 ? 128 : 64;
25422 vmode
= aarch64_simd_container_mode (mode
, width
);
25423 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
25424 return aarch64_output_simd_mov_immediate (v_op
, width
);
25427 /* Return the output string to use for moving immediate CONST_VECTOR
25428 into an SVE register. */
25431 aarch64_output_sve_mov_immediate (rtx const_vector
)
25433 static char templ
[40];
25434 struct simd_immediate_info info
;
25437 bool is_valid
= aarch64_simd_valid_immediate (const_vector
, &info
);
25438 gcc_assert (is_valid
);
25440 element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25442 machine_mode vec_mode
= GET_MODE (const_vector
);
25443 if (aarch64_sve_pred_mode_p (vec_mode
))
25445 static char buf
[sizeof ("ptrue\t%0.N, vlNNNNN")];
25446 if (info
.insn
== simd_immediate_info::MOV
)
25448 gcc_assert (info
.u
.mov
.value
== const0_rtx
);
25449 snprintf (buf
, sizeof (buf
), "pfalse\t%%0.b");
25453 gcc_assert (info
.insn
== simd_immediate_info::PTRUE
);
25454 unsigned int total_bytes
;
25455 if (info
.u
.pattern
== AARCH64_SV_ALL
25456 && BYTES_PER_SVE_VECTOR
.is_constant (&total_bytes
))
25457 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, vl%d", element_char
,
25458 total_bytes
/ GET_MODE_SIZE (info
.elt_mode
));
25460 snprintf (buf
, sizeof (buf
), "ptrue\t%%0.%c, %s", element_char
,
25461 svpattern_token (info
.u
.pattern
));
25466 if (info
.insn
== simd_immediate_info::INDEX
)
25468 snprintf (templ
, sizeof (templ
), "index\t%%0.%c, #"
25469 HOST_WIDE_INT_PRINT_DEC
", #" HOST_WIDE_INT_PRINT_DEC
,
25470 element_char
, INTVAL (info
.u
.index
.base
),
25471 INTVAL (info
.u
.index
.step
));
25475 if (GET_MODE_CLASS (info
.elt_mode
) == MODE_FLOAT
)
25477 if (aarch64_float_const_zero_rtx_p (info
.u
.mov
.value
))
25478 info
.u
.mov
.value
= GEN_INT (0);
25481 const int buf_size
= 20;
25482 char float_buf
[buf_size
] = {};
25483 real_to_decimal_for_mode (float_buf
,
25484 CONST_DOUBLE_REAL_VALUE (info
.u
.mov
.value
),
25485 buf_size
, buf_size
, 1, info
.elt_mode
);
25487 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%c, #%s",
25488 element_char
, float_buf
);
25493 snprintf (templ
, sizeof (templ
), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC
,
25494 element_char
, INTVAL (info
.u
.mov
.value
));
25498 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25499 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25503 aarch64_output_sve_ptrues (rtx const_unspec
)
25505 static char templ
[40];
25507 struct simd_immediate_info info
;
25508 bool is_valid
= aarch64_simd_valid_immediate (const_unspec
, &info
);
25509 gcc_assert (is_valid
&& info
.insn
== simd_immediate_info::PTRUE
);
25511 char element_char
= sizetochar (GET_MODE_BITSIZE (info
.elt_mode
));
25512 snprintf (templ
, sizeof (templ
), "ptrues\t%%0.%c, %s", element_char
,
25513 svpattern_token (info
.u
.pattern
));
/* Split operands into moves from op[1] + op[2] into op[0].  */

void
aarch64_split_combinev16qi (rtx operands[3])
{
  machine_mode halfmode = GET_MODE (operands[1]);

  gcc_assert (halfmode == V16QImode);

  rtx destlo = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]), 0);
  rtx desthi = simplify_gen_subreg (halfmode, operands[0],
				    GET_MODE (operands[0]),
				    GET_MODE_SIZE (halfmode));

  bool skiplo = rtx_equal_p (destlo, operands[1]);
  bool skiphi = rtx_equal_p (desthi, operands[2]);

  if (skiplo && skiphi)
    {
      /* No-op move.  Can't split to nothing; emit something.  */
      emit_note (NOTE_INSN_DELETED);
      return;
    }

  /* Special case of reversed high/low parts.  */
  if (reg_overlap_mentioned_p (operands[2], destlo)
      && reg_overlap_mentioned_p (operands[1], desthi))
    {
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
    }
  else if (!reg_overlap_mentioned_p (operands[2], destlo))
    {
      /* Try to avoid unnecessary moves if part of the result
	 is in the right place already.  */
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
    }
  else
    {
      if (!skiphi)
	emit_move_insn (desthi, operands[2]);
      if (!skiplo)
	emit_move_insn (destlo, operands[1]);
    }
}
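/* Illustrative note (not part of the original source): the three XOR
   instructions in the reversed-halves case above implement the classic
   swap-without-a-temporary idiom on V16QI registers:

	a ^= b;  b ^= a;  a ^= b;

   which exchanges the contents of operands[1] and operands[2] without
   needing a scratch register.  */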
/* vec_perm support.  */

struct expand_vec_perm_d
{
  rtx target, op0, op1;
  vec_perm_indices perm;
  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;

  bool zero_op0_p, zero_op1_p;
  bool one_vector_p;
  bool testing_p;
};

static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25585 /* Generate a variable permutation. */
25588 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25590 machine_mode vmode
= GET_MODE (target
);
25591 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25593 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
25594 gcc_checking_assert (GET_MODE (op0
) == vmode
);
25595 gcc_checking_assert (GET_MODE (op1
) == vmode
);
25596 gcc_checking_assert (GET_MODE (sel
) == vmode
);
25597 gcc_checking_assert (TARGET_SIMD
);
25601 if (vmode
== V8QImode
)
25603 /* Expand the argument to a V16QI mode by duplicating it. */
25604 rtx pair
= gen_reg_rtx (V16QImode
);
25605 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
25606 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25610 emit_insn (gen_aarch64_qtbl1v16qi (target
, op0
, sel
));
25617 if (vmode
== V8QImode
)
25619 pair
= gen_reg_rtx (V16QImode
);
25620 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
25621 emit_insn (gen_aarch64_qtbl1v8qi (target
, pair
, sel
));
25625 pair
= gen_reg_rtx (V2x16QImode
);
25626 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
25627 emit_insn (gen_aarch64_qtbl2v16qi (target
, pair
, sel
));
25632 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25633 NELT is the number of elements in the vector. */
25636 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
,
25639 machine_mode vmode
= GET_MODE (target
);
25640 bool one_vector_p
= rtx_equal_p (op0
, op1
);
25643 /* The TBL instruction does not use a modulo index, so we must take care
25644 of that ourselves. */
25645 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
25646 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
25647 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
25649 /* For big-endian, we also need to reverse the index within the vector
25650 (but not which vector). */
25651 if (BYTES_BIG_ENDIAN
)
25653 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25655 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
25656 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
25657 NULL
, 0, OPTAB_LIB_WIDEN
);
25659 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
/* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */

static void
emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
{
  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (GET_MODE (target),
					  gen_rtvec (2, op0, op1), code)));
}
25672 /* Expand an SVE vec_perm with the given operands. */
25675 aarch64_expand_sve_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
25677 machine_mode data_mode
= GET_MODE (target
);
25678 machine_mode sel_mode
= GET_MODE (sel
);
25679 /* Enforced by the pattern condition. */
25680 int nunits
= GET_MODE_NUNITS (sel_mode
).to_constant ();
25682 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25683 size of the two value vectors, i.e. the upper bits of the indices
25684 are effectively ignored. SVE TBL instead produces 0 for any
25685 out-of-range indices, so we need to modulo all the vec_perm indices
25686 to ensure they are all in range. */
25687 rtx sel_reg
= force_reg (sel_mode
, sel
);
25689 /* Check if the sel only references the first values vector. */
25690 if (CONST_VECTOR_P (sel
)
25691 && aarch64_const_vec_all_in_range_p (sel
, 0, nunits
- 1))
25693 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_reg
);
25697 /* Check if the two values vectors are the same. */
25698 if (rtx_equal_p (op0
, op1
))
25700 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
, nunits
- 1);
25701 rtx sel_mod
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25702 NULL
, 0, OPTAB_DIRECT
);
25703 emit_unspec2 (target
, UNSPEC_TBL
, op0
, sel_mod
);
25707 /* Run TBL on for each value vector and combine the results. */
25709 rtx res0
= gen_reg_rtx (data_mode
);
25710 rtx res1
= gen_reg_rtx (data_mode
);
25711 rtx neg_num_elems
= aarch64_simd_gen_const_vector_dup (sel_mode
, -nunits
);
25712 if (!CONST_VECTOR_P (sel
)
25713 || !aarch64_const_vec_all_in_range_p (sel
, 0, 2 * nunits
- 1))
25715 rtx max_sel
= aarch64_simd_gen_const_vector_dup (sel_mode
,
25717 sel_reg
= expand_simple_binop (sel_mode
, AND
, sel_reg
, max_sel
,
25718 NULL
, 0, OPTAB_DIRECT
);
25720 emit_unspec2 (res0
, UNSPEC_TBL
, op0
, sel_reg
);
25721 rtx sel_sub
= expand_simple_binop (sel_mode
, PLUS
, sel_reg
, neg_num_elems
,
25722 NULL
, 0, OPTAB_DIRECT
);
25723 emit_unspec2 (res1
, UNSPEC_TBL
, op1
, sel_sub
);
25724 if (GET_MODE_CLASS (data_mode
) == MODE_VECTOR_INT
)
25725 emit_insn (gen_rtx_SET (target
, gen_rtx_IOR (data_mode
, res0
, res1
)));
25727 emit_unspec2 (target
, UNSPEC_IORF
, res0
, res1
);
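/* Illustrative note (not part of the original source): the general
   two-input case above computes

	res0 = TBL (op0, sel & (2 * nunits - 1))
	res1 = TBL (op1, sel - nunits)

   and then combines the results with an IOR (or its floating-point
   unspec equivalent).  This works because SVE TBL returns zero for any
   out-of-range index, so each lane of the combined result receives a
   contribution from exactly one of the two value vectors.  */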
25730 /* Recognize patterns suitable for the TRN instructions. */
25732 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
25735 poly_uint64 nelt
= d
->perm
.length ();
25737 machine_mode vmode
= d
->vmode
;
25739 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25742 /* Note that these are little-endian tests.
25743 We correct for big-endian later. */
25744 if (!d
->perm
[0].is_constant (&odd
)
25745 || (odd
!= 0 && odd
!= 1)
25746 || !d
->perm
.series_p (0, 2, odd
, 2)
25747 || !d
->perm
.series_p (1, 2, nelt
+ odd
, 2))
25756 /* We don't need a big-endian lane correction for SVE; see the comment
25757 at the head of aarch64-sve.md for details. */
25758 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25760 std::swap (in0
, in1
);
25765 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25766 odd
? UNSPEC_TRN2
: UNSPEC_TRN1
));
25770 /* Try to re-encode the PERM constant so it combines odd and even elements.
25771 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25772 We retry with this new constant with the full suite of patterns. */
25774 aarch64_evpc_reencode (struct expand_vec_perm_d
*d
)
25776 expand_vec_perm_d newd
;
25778 /* The subregs that we'd create are not supported for big-endian SVE;
25779 see aarch64_modes_compatible_p for details. */
25780 if (BYTES_BIG_ENDIAN
&& (d
->vec_flags
& VEC_ANY_SVE
))
25783 /* Get the new mode. Always twice the size of the inner
25784 and half the elements. */
25785 machine_mode new_mode
;
25786 if (!aarch64_coalesce_units (d
->vmode
, 2).exists (&new_mode
))
25789 vec_perm_indices newpermindices
;
25790 if (!newpermindices
.new_shrunk_vector (d
->perm
, 2))
25793 newd
.vmode
= new_mode
;
25794 newd
.vec_flags
= d
->vec_flags
;
25795 newd
.op_mode
= newd
.vmode
;
25796 newd
.op_vec_flags
= newd
.vec_flags
;
25797 newd
.target
= d
->target
? gen_lowpart (new_mode
, d
->target
) : NULL
;
25798 newd
.op0
= d
->op0
? gen_lowpart (new_mode
, d
->op0
) : NULL
;
25799 newd
.op1
= d
->op1
? gen_lowpart (new_mode
, d
->op1
) : NULL
;
25800 newd
.testing_p
= d
->testing_p
;
25801 newd
.one_vector_p
= d
->one_vector_p
;
25803 newd
.perm
.new_vector (newpermindices
.encoding (), newd
.one_vector_p
? 1 : 2,
25804 newpermindices
.nelts_per_input ());
25805 return aarch64_expand_vec_perm_const_1 (&newd
);
25808 /* Recognize patterns suitable for the UZP instructions. */
25810 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
25814 machine_mode vmode
= d
->vmode
;
25816 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25819 /* Note that these are little-endian tests.
25820 We correct for big-endian later. */
25821 if (!d
->perm
[0].is_constant (&odd
)
25822 || (odd
!= 0 && odd
!= 1)
25823 || !d
->perm
.series_p (0, 1, odd
, 2))
25832 /* We don't need a big-endian lane correction for SVE; see the comment
25833 at the head of aarch64-sve.md for details. */
25834 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25836 std::swap (in0
, in1
);
25841 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25842 odd
? UNSPEC_UZP2
: UNSPEC_UZP1
));
25846 /* Recognize patterns suitable for the ZIP instructions. */
25848 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
25851 poly_uint64 nelt
= d
->perm
.length ();
25853 machine_mode vmode
= d
->vmode
;
25855 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
25858 /* Note that these are little-endian tests.
25859 We correct for big-endian later. */
25860 poly_uint64 first
= d
->perm
[0];
25861 if ((maybe_ne (first
, 0U) && maybe_ne (first
* 2, nelt
))
25862 || !d
->perm
.series_p (0, 2, first
, 1)
25863 || !d
->perm
.series_p (1, 2, first
+ nelt
, 1))
25865 high
= maybe_ne (first
, 0U);
25873 /* We don't need a big-endian lane correction for SVE; see the comment
25874 at the head of aarch64-sve.md for details. */
25875 if (BYTES_BIG_ENDIAN
&& d
->vec_flags
== VEC_ADVSIMD
)
25877 std::swap (in0
, in1
);
25882 emit_set_insn (out
, gen_rtx_UNSPEC (vmode
, gen_rtvec (2, in0
, in1
),
25883 high
? UNSPEC_ZIP2
: UNSPEC_ZIP1
));
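/* Illustrative note (not part of the original source): for a V4SI
   permute the index patterns recognised by the TRN, UZP and ZIP
   routines above are, in little-endian element numbering:

	ZIP1 {0, 4, 1, 5}   ZIP2 {2, 6, 3, 7}
	UZP1 {0, 2, 4, 6}   UZP2 {1, 3, 5, 7}
	TRN1 {0, 4, 2, 6}   TRN2 {1, 5, 3, 7}

   where indices 0-3 select from the first operand and 4-7 from the
   second.  */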
25887 /* Recognize patterns for the EXT insn. */
25890 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
25892 HOST_WIDE_INT location
;
25895 /* The first element always refers to the first vector.
25896 Check if the extracted indices are increasing by one. */
25897 if ((d
->vec_flags
& VEC_SVE_PRED
)
25898 || !d
->perm
[0].is_constant (&location
)
25899 || !d
->perm
.series_p (0, 1, location
, 1))
25906 /* The case where (location == 0) is a no-op for both big- and little-endian,
25907 and is removed by the mid-end at optimization levels -O1 and higher.
25909 We don't need a big-endian lane correction for SVE; see the comment
25910 at the head of aarch64-sve.md for details. */
25911 if (BYTES_BIG_ENDIAN
&& location
!= 0 && d
->vec_flags
== VEC_ADVSIMD
)
25913 /* After setup, we want the high elements of the first vector (stored
25914 at the LSB end of the register), and the low elements of the second
25915 vector (stored at the MSB end of the register). So swap. */
25916 std::swap (d
->op0
, d
->op1
);
25917 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25918 to_constant () is safe since this is restricted to Advanced SIMD
25920 location
= d
->perm
.length ().to_constant () - location
;
25923 offset
= GEN_INT (location
);
25924 emit_set_insn (d
->target
,
25925 gen_rtx_UNSPEC (d
->vmode
,
25926 gen_rtvec (3, d
->op0
, d
->op1
, offset
),
25931 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25932 within each 64-bit, 32-bit or 16-bit granule. */
25935 aarch64_evpc_rev_local (struct expand_vec_perm_d
*d
)
25937 HOST_WIDE_INT diff
;
25938 unsigned int i
, size
, unspec
;
25939 machine_mode pred_mode
;
25941 if ((d
->vec_flags
& VEC_SVE_PRED
)
25942 || !d
->one_vector_p
25943 || !d
->perm
[0].is_constant (&diff
)
25947 if (d
->vec_flags
& VEC_SVE_DATA
)
25948 size
= (diff
+ 1) * aarch64_sve_container_bits (d
->vmode
);
25950 size
= (diff
+ 1) * GET_MODE_UNIT_BITSIZE (d
->vmode
);
25953 unspec
= UNSPEC_REV64
;
25954 pred_mode
= VNx2BImode
;
25956 else if (size
== 32)
25958 unspec
= UNSPEC_REV32
;
25959 pred_mode
= VNx4BImode
;
25961 else if (size
== 16)
25963 unspec
= UNSPEC_REV16
;
25964 pred_mode
= VNx8BImode
;
25969 unsigned int step
= diff
+ 1;
25970 for (i
= 0; i
< step
; ++i
)
25971 if (!d
->perm
.series_p (i
, step
, diff
- i
, step
))
25978 if (d
->vec_flags
& VEC_SVE_DATA
)
25980 rtx pred
= aarch64_ptrue_reg (pred_mode
);
25981 emit_insn (gen_aarch64_sve_revbhw (d
->vmode
, pred_mode
,
25982 d
->target
, pred
, d
->op0
));
25985 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), unspec
);
25986 emit_set_insn (d
->target
, src
);
25990 /* Recognize patterns for the REV insn, which reverses elements within
25994 aarch64_evpc_rev_global (struct expand_vec_perm_d
*d
)
25996 poly_uint64 nelt
= d
->perm
.length ();
25998 if (!d
->one_vector_p
|| d
->vec_flags
== VEC_ADVSIMD
)
26001 if (!d
->perm
.series_p (0, 1, nelt
- 1, -1))
26008 rtx src
= gen_rtx_UNSPEC (d
->vmode
, gen_rtvec (1, d
->op0
), UNSPEC_REV
);
26009 emit_set_insn (d
->target
, src
);
26014 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
26016 rtx out
= d
->target
;
26019 machine_mode vmode
= d
->vmode
;
26022 if ((d
->vec_flags
& VEC_SVE_PRED
)
26023 || d
->perm
.encoding ().encoded_nelts () != 1
26024 || !d
->perm
[0].is_constant (&elt
))
26027 if ((d
->vec_flags
& VEC_SVE_DATA
)
26028 && elt
* (aarch64_sve_container_bits (vmode
) / 8) >= 64)
26035 /* The generic preparation in aarch64_expand_vec_perm_const_1
26036 swaps the operand order and the permute indices if it finds
26037 d->perm[0] to be in the second operand. Thus, we can always
26038 use d->op0 and need not do any extra arithmetic to get the
26039 correct lane number. */
26041 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
26043 rtx parallel
= gen_rtx_PARALLEL (vmode
, gen_rtvec (1, lane
));
26044 rtx select
= gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode
), in0
, parallel
);
26045 emit_set_insn (out
, gen_rtx_VEC_DUPLICATE (vmode
, select
));
26050 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
26052 rtx rperm
[MAX_COMPILE_TIME_VEC_BYTES
], sel
;
26053 machine_mode vmode
= d
->vmode
;
26055 /* Make sure that the indices are constant. */
26056 unsigned int encoded_nelts
= d
->perm
.encoding ().encoded_nelts ();
26057 for (unsigned int i
= 0; i
< encoded_nelts
; ++i
)
26058 if (!d
->perm
[i
].is_constant ())
26064 /* Generic code will try constant permutation twice. Once with the
26065 original mode and again with the elements lowered to QImode.
26066 So wait and don't do the selector expansion ourselves. */
26067 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
26070 /* to_constant is safe since this routine is specific to Advanced SIMD
26072 unsigned int nelt
= d
->perm
.length ().to_constant ();
26074 /* If one register is the constant vector of 0 then we only need
26075 a one reg TBL and we map any accesses to the vector of 0 to -1. We can't
26076 do this earlier since vec_perm_indices clamps elements to within range so
26077 we can only do it during codegen. */
26080 else if (d
->zero_op1_p
)
26083 for (unsigned int i
= 0; i
< nelt
; ++i
)
26085 auto val
= d
->perm
[i
].to_constant ();
26087 /* If we're selecting from a 0 vector, we can just use an out of range
26089 if ((d
->zero_op0_p
&& val
< nelt
) || (d
->zero_op1_p
&& val
>= nelt
))
26090 rperm
[i
] = constm1_rtx
;
26093 /* If we are remapping a zero register as the first parameter we need
26094 to adjust the indices of the non-zero register. */
26098 /* If big-endian and two vectors we end up with a weird mixed-endian
26099 mode on NEON. Reverse the index within each word but not the word
26100 itself. to_constant is safe because we checked is_constant
26102 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? val
^ (nelt
- 1) : val
);
26106 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
26107 sel
= force_reg (vmode
, sel
);
26109 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
26113 /* Try to implement D using an SVE TBL instruction. */
26116 aarch64_evpc_sve_tbl (struct expand_vec_perm_d
*d
)
26118 unsigned HOST_WIDE_INT nelt
;
26120 /* Permuting two variable-length vectors could overflow the
26122 if (!d
->one_vector_p
&& !d
->perm
.length ().is_constant (&nelt
))
26128 machine_mode sel_mode
= related_int_vector_mode (d
->vmode
).require ();
26129 rtx sel
= vec_perm_indices_to_rtx (sel_mode
, d
->perm
);
26130 if (d
->one_vector_p
)
26131 emit_unspec2 (d
->target
, UNSPEC_TBL
, d
->op0
, force_reg (sel_mode
, sel
));
26133 aarch64_expand_sve_vec_perm (d
->target
, d
->op0
, d
->op1
, sel
);
26137 /* Try to implement D using SVE dup instruction. */
26140 aarch64_evpc_sve_dup (struct expand_vec_perm_d
*d
)
26142 if (BYTES_BIG_ENDIAN
26143 || !d
->one_vector_p
26144 || d
->vec_flags
!= VEC_SVE_DATA
26145 || d
->op_vec_flags
!= VEC_ADVSIMD
26146 || d
->perm
.encoding ().nelts_per_pattern () != 1
26147 || !known_eq (d
->perm
.encoding ().npatterns (),
26148 GET_MODE_NUNITS (d
->op_mode
))
26149 || !known_eq (GET_MODE_BITSIZE (d
->op_mode
), 128))
26152 int npatterns
= d
->perm
.encoding ().npatterns ();
26153 for (int i
= 0; i
< npatterns
; i
++)
26154 if (!known_eq (d
->perm
[i
], i
))
26160 aarch64_expand_sve_dupq (d
->target
, GET_MODE (d
->target
), d
->op0
);
26164 /* Try to implement D using SVE SEL instruction. */
26167 aarch64_evpc_sel (struct expand_vec_perm_d
*d
)
26169 machine_mode vmode
= d
->vmode
;
26170 int unit_size
= GET_MODE_UNIT_SIZE (vmode
);
26172 if (d
->vec_flags
!= VEC_SVE_DATA
26176 int n_patterns
= d
->perm
.encoding ().npatterns ();
26177 poly_int64 vec_len
= d
->perm
.length ();
26179 for (int i
= 0; i
< n_patterns
; ++i
)
26180 if (!known_eq (d
->perm
[i
], i
)
26181 && !known_eq (d
->perm
[i
], vec_len
+ i
))
26184 for (int i
= n_patterns
; i
< n_patterns
* 2; i
++)
26185 if (!d
->perm
.series_p (i
, n_patterns
, i
, n_patterns
)
26186 && !d
->perm
.series_p (i
, n_patterns
, vec_len
+ i
, n_patterns
))
26192 machine_mode pred_mode
= aarch64_sve_pred_mode (vmode
);
26194 /* Build a predicate that is true when op0 elements should be used. */
26195 rtx_vector_builder
builder (pred_mode
, n_patterns
, 2);
26196 for (int i
= 0; i
< n_patterns
* 2; i
++)
26198 rtx elem
= known_eq (d
->perm
[i
], i
) ? CONST1_RTX (BImode
)
26199 : CONST0_RTX (BImode
);
26200 builder
.quick_push (elem
);
26203 rtx const_vec
= builder
.build ();
26204 rtx pred
= force_reg (pred_mode
, const_vec
);
26205 /* TARGET = PRED ? OP0 : OP1. */
26206 emit_insn (gen_vcond_mask (vmode
, vmode
, d
->target
, d
->op0
, d
->op1
, pred
));
26210 /* Recognize patterns suitable for the INS instructions. */
26212 aarch64_evpc_ins (struct expand_vec_perm_d
*d
)
26214 machine_mode mode
= d
->vmode
;
26215 unsigned HOST_WIDE_INT nelt
;
26217 if (d
->vec_flags
!= VEC_ADVSIMD
)
26220 /* to_constant is safe since this routine is specific to Advanced SIMD
26222 nelt
= d
->perm
.length ().to_constant ();
26225 HOST_WIDE_INT idx
= -1;
26227 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26230 if (!d
->perm
[i
].is_constant (&elt
))
26232 if (elt
== (HOST_WIDE_INT
) i
)
26245 for (unsigned HOST_WIDE_INT i
= 0; i
< nelt
; i
++)
26247 if (d
->perm
[i
].to_constant () == (HOST_WIDE_INT
) (i
+ nelt
))
26261 gcc_assert (idx
!= -1);
26263 unsigned extractindex
= d
->perm
[idx
].to_constant ();
26264 rtx extractv
= d
->op0
;
26265 if (extractindex
>= nelt
)
26268 extractindex
-= nelt
;
26270 gcc_assert (extractindex
< nelt
);
26272 insn_code icode
= code_for_aarch64_simd_vec_copy_lane (mode
);
26273 expand_operand ops
[5];
26274 create_output_operand (&ops
[0], d
->target
, mode
);
26275 create_input_operand (&ops
[1], insv
, mode
);
26276 create_integer_operand (&ops
[2], 1 << idx
);
26277 create_input_operand (&ops
[3], extractv
, mode
);
26278 create_integer_operand (&ops
[4], extractindex
);
26279 expand_insn (icode
, 5, ops
);
26285 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
26287 gcc_assert (d
->op_mode
!= E_VOIDmode
);
26289 /* The pattern matching functions above are written to look for a small
26290 number to begin the sequence (0, 1, N/2). If we begin with an index
26291 from the second operand, we can swap the operands. */
26292 poly_int64 nelt
= d
->perm
.length ();
26293 if (known_ge (d
->perm
[0], nelt
))
26295 d
->perm
.rotate_inputs (1);
26296 std::swap (d
->op0
, d
->op1
);
26299 if (((d
->vec_flags
== VEC_ADVSIMD
&& TARGET_SIMD
)
26300 || d
->vec_flags
== VEC_SVE_DATA
26301 || d
->vec_flags
== (VEC_SVE_DATA
| VEC_PARTIAL
)
26302 || d
->vec_flags
== VEC_SVE_PRED
)
26303 && known_gt (nelt
, 1))
26305 if (d
->vmode
== d
->op_mode
)
26307 if (aarch64_evpc_rev_local (d
))
26309 else if (aarch64_evpc_rev_global (d
))
26311 else if (aarch64_evpc_ext (d
))
26313 else if (aarch64_evpc_dup (d
))
26315 else if (aarch64_evpc_zip (d
))
26317 else if (aarch64_evpc_uzp (d
))
26319 else if (aarch64_evpc_trn (d
))
26321 else if (aarch64_evpc_sel (d
))
26323 else if (aarch64_evpc_ins (d
))
26325 else if (aarch64_evpc_reencode (d
))
26328 if (d
->vec_flags
== VEC_SVE_DATA
)
26329 return aarch64_evpc_sve_tbl (d
);
26330 else if (d
->vec_flags
== VEC_ADVSIMD
)
26331 return aarch64_evpc_tbl (d
);
26335 if (aarch64_evpc_sve_dup (d
))
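/* Illustrative note (not part of the original source): as a concrete
   example of the dispatch order above, the V4SI permute {1, 0, 3, 2}
   is caught early by aarch64_evpc_rev_local (a REV64), {0, 4, 1, 5}
   falls through to aarch64_evpc_zip (a ZIP1), and an arbitrary
   constant selector such as {3, 1, 4, 6} ends up in the TBL fallback,
   after the target-independent code retries with the indices lowered
   to byte granularity.  */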
26342 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
26345 aarch64_vectorize_vec_perm_const (machine_mode vmode
, machine_mode op_mode
,
26346 rtx target
, rtx op0
, rtx op1
,
26347 const vec_perm_indices
&sel
)
26349 struct expand_vec_perm_d d
;
26351 /* Check whether the mask can be applied to a single vector. */
26352 if (sel
.ninputs () == 1
26353 || (op0
&& rtx_equal_p (op0
, op1
)))
26354 d
.one_vector_p
= true;
26355 else if (sel
.all_from_input_p (0))
26357 d
.one_vector_p
= true;
26360 else if (sel
.all_from_input_p (1))
26362 d
.one_vector_p
= true;
26366 d
.one_vector_p
= false;
26368 d
.zero_op0_p
= op0
== CONST0_RTX (op_mode
);
26369 d
.zero_op1_p
= op1
== CONST0_RTX (op_mode
);
26370 d
.perm
.new_vector (sel
.encoding (), d
.one_vector_p
? 1 : 2,
26371 sel
.nelts_per_input ());
26373 d
.vec_flags
= aarch64_classify_vector_mode (d
.vmode
);
26374 d
.op_mode
= op_mode
;
26375 d
.op_vec_flags
= aarch64_classify_vector_mode (d
.op_mode
);
26377 d
.op0
= op0
? force_reg (op_mode
, op0
) : NULL_RTX
;
26381 d
.op1
= op1
? force_reg (op_mode
, op1
) : NULL_RTX
;
26382 d
.testing_p
= !target
;
26385 return aarch64_expand_vec_perm_const_1 (&d
);
26387 rtx_insn
*last
= get_last_insn ();
26388 bool ret
= aarch64_expand_vec_perm_const_1 (&d
);
26389 gcc_assert (last
== get_last_insn ());
26393 /* Generate a byte permute mask for a register of mode MODE,
26394 which has NUNITS units. */
26397 aarch64_reverse_mask (machine_mode mode
, unsigned int nunits
)
26399 /* We have to reverse each vector because we dont have
26400 a permuted load that can reverse-load according to ABI rules. */
26402 rtvec v
= rtvec_alloc (16);
26404 unsigned int usize
= GET_MODE_UNIT_SIZE (mode
);
26406 gcc_assert (BYTES_BIG_ENDIAN
);
26407 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
26409 for (i
= 0; i
< nunits
; i
++)
26410 for (j
= 0; j
< usize
; j
++)
26411 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
26412 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
26413 return force_reg (V16QImode
, mask
);
/* Expand an SVE integer comparison using the SVE equivalent of:

     (set TARGET (CODE OP0 OP1)).  */

void
aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
{
  machine_mode pred_mode = GET_MODE (target);
  machine_mode data_mode = GET_MODE (op0);
  rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
				      op0, op1);
  if (!rtx_equal_p (target, res))
    emit_move_insn (target, res);
}

/* Return the UNSPEC_COND_* code for comparison CODE.  */

static unsigned int
aarch64_unspec_cond_code (rtx_code code)
{
  switch (code)
    {
    case NE:
      return UNSPEC_COND_FCMNE;
    case EQ:
      return UNSPEC_COND_FCMEQ;
    case LT:
      return UNSPEC_COND_FCMLT;
    case GT:
      return UNSPEC_COND_FCMGT;
    case LE:
      return UNSPEC_COND_FCMLE;
    case GE:
      return UNSPEC_COND_FCMGE;
    case UNORDERED:
      return UNSPEC_COND_FCMUO;
    default:
      gcc_unreachable ();
    }
}

/* Emit the SVE equivalent of:

     (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))

   where <X> is the operation associated with comparison CODE.
   KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */

static void
aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
			  bool known_ptrue_p, rtx op0, rtx op1)
{
  rtx flag = gen_int_mode (known_ptrue_p, SImode);
  rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
			       gen_rtvec (4, pred, flag, op0, op1),
			       aarch64_unspec_cond_code (code));
  emit_set_insn (target, unspec);
}
26475 /* Emit the SVE equivalent of:
26477 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26478 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26479 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26481 where <Xi> is the operation associated with comparison CODEi.
26482 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26485 aarch64_emit_sve_or_fp_conds (rtx target
, rtx_code code1
, rtx_code code2
,
26486 rtx pred
, bool known_ptrue_p
, rtx op0
, rtx op1
)
26488 machine_mode pred_mode
= GET_MODE (pred
);
26489 rtx tmp1
= gen_reg_rtx (pred_mode
);
26490 aarch64_emit_sve_fp_cond (tmp1
, code1
, pred
, known_ptrue_p
, op0
, op1
);
26491 rtx tmp2
= gen_reg_rtx (pred_mode
);
26492 aarch64_emit_sve_fp_cond (tmp2
, code2
, pred
, known_ptrue_p
, op0
, op1
);
26493 aarch64_emit_binop (target
, ior_optab
, tmp1
, tmp2
);
26496 /* Emit the SVE equivalent of:
26498 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26499 (set TARGET (not TMP))
26501 where <X> is the operation associated with comparison CODE.
26502 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26505 aarch64_emit_sve_invert_fp_cond (rtx target
, rtx_code code
, rtx pred
,
26506 bool known_ptrue_p
, rtx op0
, rtx op1
)
26508 machine_mode pred_mode
= GET_MODE (pred
);
26509 rtx tmp
= gen_reg_rtx (pred_mode
);
26510 aarch64_emit_sve_fp_cond (tmp
, code
, pred
, known_ptrue_p
, op0
, op1
);
26511 aarch64_emit_unop (target
, one_cmpl_optab
, tmp
);
26514 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26516 (set TARGET (CODE OP0 OP1))
26518 If CAN_INVERT_P is true, the caller can also handle inverted results;
26519 return true if the result is in fact inverted. */
26522 aarch64_expand_sve_vec_cmp_float (rtx target
, rtx_code code
,
26523 rtx op0
, rtx op1
, bool can_invert_p
)
26525 machine_mode pred_mode
= GET_MODE (target
);
26526 machine_mode data_mode
= GET_MODE (op0
);
26528 rtx ptrue
= aarch64_ptrue_reg (pred_mode
);
26532 /* UNORDERED has no immediate form. */
26533 op1
= force_reg (data_mode
, op1
);
26542 /* There is native support for the comparison. */
26543 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26548 /* This is a trapping operation (LT or GT). */
26549 aarch64_emit_sve_or_fp_conds (target
, LT
, GT
, ptrue
, true, op0
, op1
);
26553 if (!flag_trapping_math
)
26555 /* This would trap for signaling NaNs. */
26556 op1
= force_reg (data_mode
, op1
);
26557 aarch64_emit_sve_or_fp_conds (target
, UNORDERED
, EQ
,
26558 ptrue
, true, op0
, op1
);
26566 if (flag_trapping_math
)
26568 /* Work out which elements are ordered. */
26569 rtx ordered
= gen_reg_rtx (pred_mode
);
26570 op1
= force_reg (data_mode
, op1
);
26571 aarch64_emit_sve_invert_fp_cond (ordered
, UNORDERED
,
26572 ptrue
, true, op0
, op1
);
26574 /* Test the opposite condition for the ordered elements,
26575 then invert the result. */
26579 code
= reverse_condition_maybe_unordered (code
);
26582 aarch64_emit_sve_fp_cond (target
, code
,
26583 ordered
, false, op0
, op1
);
26586 aarch64_emit_sve_invert_fp_cond (target
, code
,
26587 ordered
, false, op0
, op1
);
26593 /* ORDERED has no immediate form. */
26594 op1
= force_reg (data_mode
, op1
);
26598 gcc_unreachable ();
26601 /* There is native support for the inverse comparison. */
26602 code
= reverse_condition_maybe_unordered (code
);
26605 aarch64_emit_sve_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26608 aarch64_emit_sve_invert_fp_cond (target
, code
, ptrue
, true, op0
, op1
);
26612 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26613 of the data being selected and CMP_MODE is the mode of the values being
26617 aarch64_expand_sve_vcond (machine_mode data_mode
, machine_mode cmp_mode
,
26620 machine_mode pred_mode
= aarch64_get_mask_mode (cmp_mode
).require ();
26621 rtx pred
= gen_reg_rtx (pred_mode
);
26622 if (FLOAT_MODE_P (cmp_mode
))
26624 if (aarch64_expand_sve_vec_cmp_float (pred
, GET_CODE (ops
[3]),
26625 ops
[4], ops
[5], true))
26626 std::swap (ops
[1], ops
[2]);
26629 aarch64_expand_sve_vec_cmp_int (pred
, GET_CODE (ops
[3]), ops
[4], ops
[5]);
26631 if (!aarch64_sve_reg_or_dup_imm (ops
[1], data_mode
))
26632 ops
[1] = force_reg (data_mode
, ops
[1]);
26633 /* The "false" value can only be zero if the "true" value is a constant. */
26634 if (register_operand (ops
[1], data_mode
)
26635 || !aarch64_simd_reg_or_zero (ops
[2], data_mode
))
26636 ops
[2] = force_reg (data_mode
, ops
[2]);
26638 rtvec vec
= gen_rtvec (3, pred
, ops
[1], ops
[2]);
26639 emit_set_insn (ops
[0], gen_rtx_UNSPEC (data_mode
, vec
, UNSPEC_SEL
));
26644 (a) MODE1 and MODE2 use the same layout for bytes that are common
26647 (b) subregs involving the two modes behave as the target-independent
26648 subreg rules require; and
26650 (c) there is at least one register that can hold both modes.
26652 Return false otherwise. */
26655 aarch64_modes_compatible_p (machine_mode mode1
, machine_mode mode2
)
26657 unsigned int flags1
= aarch64_classify_vector_mode (mode1
);
26658 unsigned int flags2
= aarch64_classify_vector_mode (mode2
);
26660 bool sve1_p
= (flags1
& VEC_ANY_SVE
);
26661 bool sve2_p
= (flags2
& VEC_ANY_SVE
);
26663 bool partial_sve1_p
= sve1_p
&& (flags1
& VEC_PARTIAL
);
26664 bool partial_sve2_p
= sve2_p
&& (flags2
& VEC_PARTIAL
);
26666 bool pred1_p
= (flags1
& VEC_SVE_PRED
);
26667 bool pred2_p
= (flags2
& VEC_SVE_PRED
);
26669 bool partial_advsimd_struct1_p
= (flags1
== (VEC_ADVSIMD
| VEC_STRUCT
26671 bool partial_advsimd_struct2_p
= (flags2
== (VEC_ADVSIMD
| VEC_STRUCT
26674 /* Don't allow changes between predicate modes and other modes.
26675 Only predicate registers can hold predicate modes and only
26676 non-predicate registers can hold non-predicate modes, so any
26677 attempt to mix them would require a round trip through memory. */
26678 if (pred1_p
!= pred2_p
)
26681 /* The contents of partial SVE modes are distributed evenly across
26682 the register, whereas GCC expects them to be clustered together.
26683 We therefore need to be careful about mode changes involving them. */
26684 if (partial_sve1_p
&& partial_sve2_p
)
26686 /* Reject changes between partial SVE modes that have different
26687 patterns of significant and insignificant bits. */
26688 if ((aarch64_sve_container_bits (mode1
)
26689 != aarch64_sve_container_bits (mode2
))
26690 || GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
26693 else if (partial_sve1_p
)
26695 /* The first lane of MODE1 is where GCC expects it, but anything
26696 bigger than that is not. */
26697 if (maybe_gt (GET_MODE_SIZE (mode2
), GET_MODE_UNIT_SIZE (mode1
)))
26700 else if (partial_sve2_p
)
26702 /* Similarly in reverse. */
26703 if (maybe_gt (GET_MODE_SIZE (mode1
), GET_MODE_UNIT_SIZE (mode2
)))
26707 /* Don't allow changes between partial Advanced SIMD structure modes
26708 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26709 are the same size, but the former occupies one Q register while the
26710 latter occupies two D registers. */
26711 if (partial_advsimd_struct1_p
!= partial_advsimd_struct2_p
26712 && maybe_gt (GET_MODE_SIZE (mode1
), 8)
26713 && maybe_gt (GET_MODE_SIZE (mode2
), 8))
26716 if (maybe_ne (BITS_PER_SVE_VECTOR
, 128u))
26718 /* Don't allow changes between SVE modes and other modes that might
26719 be bigger than 128 bits. In particular, OImode, CImode and XImode
26720 divide into 128-bit quantities while SVE modes divide into
26721 BITS_PER_SVE_VECTOR quantities. */
26722 if (sve1_p
&& !sve2_p
&& maybe_gt (GET_MODE_BITSIZE (mode2
), 128))
26724 if (sve2_p
&& !sve1_p
&& maybe_gt (GET_MODE_BITSIZE (mode1
), 128))
26728 if (BYTES_BIG_ENDIAN
)
26730 /* Don't allow changes between SVE data modes and non-SVE modes.
26731 See the comment at the head of aarch64-sve.md for details. */
26732 if (sve1_p
!= sve2_p
)
26735 /* Don't allow changes in element size: lane 0 of the new vector
26736 would not then be lane 0 of the old vector. See the comment
26737 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26740 In the worst case, this forces a register to be spilled in
26741 one mode and reloaded in the other, which handles the
26742 endianness correctly. */
26743 if (sve1_p
&& GET_MODE_UNIT_SIZE (mode1
) != GET_MODE_UNIT_SIZE (mode2
))
/* Implement TARGET_MODES_TIEABLE_P.  In principle we should always defer
   to aarch64_modes_compatible_p.  However due to issues with register
   allocation it is preferable to avoid tieing integer scalar and FP
   scalar modes.  Executing integer operations in general registers is
   better than treating them as scalar vector operations.  This reduces
   latency and avoids redundant int<->FP moves.  So tie modes if they
   are either the same class, or one of them is a vector mode.  */

static bool
aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (aarch64_modes_compatible_p (mode1, mode2))
    {
      if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
	return true;
      if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
	return true;
    }

  return false;
}

/* Return a new RTX holding the result of moving POINTER forward by
   AMOUNT bytes.  */

static rtx
aarch64_move_pointer (rtx pointer, poly_int64 amount)
{
  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);

  return adjust_automodify_address (pointer, GET_MODE (pointer),
				    next, amount);
}
/* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
   from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
   rather than memcpy.  Return true iff we succeeded.  */
bool
aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
{
  if (!TARGET_MOPS)
    return false;

  /* All three registers are changed by the instruction, so each one
     must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx src_mem = replace_equiv_address (operands[1], src_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
  if (is_memmove)
    emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
  else
    emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));

  return true;
}
26805 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26806 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26807 if this is a memmove rather than memcpy. Return true if we succeed,
26808 otherwise return false, indicating that a libcall should be emitted. */
26810 aarch64_expand_cpymem (rtx
*operands
, bool is_memmove
)
26813 rtx dst
= operands
[0];
26814 rtx src
= operands
[1];
26815 unsigned align
= UINTVAL (operands
[3]);
26817 machine_mode mode
= BLKmode
, next_mode
;
26819 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26820 if (!CONST_INT_P (operands
[2]) || (STRICT_ALIGNMENT
&& align
< 16))
26821 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26823 unsigned HOST_WIDE_INT size
= UINTVAL (operands
[2]);
26825 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26826 unsigned max_copy_size
= TARGET_SIMD
? 256 : 128;
26827 unsigned mops_threshold
= is_memmove
? aarch64_mops_memmove_size_threshold
26828 : aarch64_mops_memcpy_size_threshold
;
26830 /* Reduce the maximum size with -Os. */
26831 if (optimize_function_for_size_p (cfun
))
26832 max_copy_size
/= 4;
26834 /* Large copies use MOPS when available or a library call. */
26835 if (size
> max_copy_size
|| (TARGET_MOPS
&& size
> mops_threshold
))
26836 return aarch64_expand_cpymem_mops (operands
, is_memmove
);
26838 /* Default to 32-byte LDP/STP on large copies, however small copies or
26839 no SIMD support fall back to 16-byte chunks.
26840 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26841 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26842 whether that would improve performance. */
26843 bool use_qregs
= size
> 24 && TARGET_SIMD
;
26845 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26846 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26848 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
26849 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
26851 auto_vec
<std::pair
<rtx
, rtx
>, 16> ops
;
26856 /* Find the largest mode in which to do the copy in without over reading
26858 opt_scalar_int_mode mode_iter
;
26859 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26860 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (size
, 16))
26861 mode
= mode_iter
.require ();
26863 gcc_assert (mode
!= BLKmode
);
26865 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26867 /* Prefer Q-register accesses. */
26868 if (mode_bytes
== 16 && use_qregs
)
26871 rtx reg
= gen_reg_rtx (mode
);
26872 rtx load
= gen_move_insn (reg
, adjust_address (src
, mode
, offset
));
26873 rtx store
= gen_move_insn (adjust_address (dst
, mode
, offset
), reg
);
26874 ops
.safe_push ({ load
, store
});
26875 size
-= mode_bytes
;
26876 offset
+= mode_bytes
;
26878 /* Emit trailing copies using overlapping unaligned accesses
26879 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26880 if (size
> 0 && size
< 16 && !STRICT_ALIGNMENT
)
26882 next_mode
= smallest_mode_for_size
26883 (size
* BITS_PER_UNIT
, MODE_INT
).require ();
26884 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26885 gcc_assert (n_bytes
<= mode_bytes
);
26886 offset
-= n_bytes
- size
;
26891 /* Memcpy interleaves loads with stores, memmove emits all loads first. */
26892 int nops
= ops
.length();
26893 int inc
= is_memmove
|| nops
<= 8 ? nops
: 6;
26895 for (int i
= 0; i
< nops
; i
+= inc
)
26897 int m
= MIN (nops
, i
+ inc
);
26899 for (int j
= i
; j
< m
; j
++)
26900 emit_insn (ops
[j
].first
);
26902 for (int j
= i
; j
< m
; j
++)
26903 emit_insn (ops
[j
].second
);
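/* Illustrative note (not part of the original source): with SIMD
   enabled, a constant 29-byte memcpy is expanded by the code above as
   one 16-byte Q-register copy at offset 0 plus one overlapping 16-byte
   copy at offset 13, roughly:

	ldr	q0, [x1]
	ldr	q1, [x1, 13]
	str	q0, [x0]
	str	q1, [x0, 13]

   The trailing access is stepped back by (access size - remaining
   bytes) so the tail is covered by a single unaligned access instead
   of a chain of byte/halfword/word copies.  */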
/* Expand a setmem using the MOPS instructions.  OPERANDS are the same
   as for the setmem pattern.  Return true iff we succeed.  */
static bool
aarch64_expand_setmem_mops (rtx *operands)
{
  if (!TARGET_MOPS)
    return false;

  /* The first two registers are changed by the instruction, so both
     of them must be a fresh pseudo.  */
  rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
  rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
  rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
  rtx val = operands[2];
  if (val != CONST0_RTX (QImode))
    val = force_reg (QImode, val);
  emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));

  return true;
}
26928 /* Expand setmem, as if from a __builtin_memset. Return true if
26929 we succeed, otherwise return false. */
26932 aarch64_expand_setmem (rtx
*operands
)
26935 unsigned HOST_WIDE_INT len
;
26936 rtx dst
= operands
[0];
26937 rtx val
= operands
[2], src
;
26938 unsigned align
= UINTVAL (operands
[3]);
26940 machine_mode mode
= BLKmode
, next_mode
;
26942 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26943 if (!CONST_INT_P (operands
[1]) || !TARGET_SIMD
26944 || (STRICT_ALIGNMENT
&& align
< 16))
26945 return aarch64_expand_setmem_mops (operands
);
26947 /* Set inline limits for memset. MOPS has a separate threshold. */
26948 unsigned max_set_size
= MAX_SET_SIZE (optimize_function_for_speed_p (cfun
));
26949 unsigned mops_threshold
= aarch64_mops_memset_size_threshold
;
26951 len
= UINTVAL (operands
[1]);
26953 /* Large memset uses MOPS when available or a library call. */
26954 if (len
> max_set_size
|| (TARGET_MOPS
&& len
> mops_threshold
))
26955 return aarch64_expand_setmem_mops (operands
);
26957 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
26958 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
26960 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26961 val
= expand_vector_broadcast (V16QImode
, val
);
26962 val
= force_reg (V16QImode
, val
);
26967 /* Find the largest mode in which to do the copy without
26969 opt_scalar_int_mode mode_iter
;
26970 FOR_EACH_MODE_IN_CLASS (mode_iter
, MODE_INT
)
26971 if (GET_MODE_SIZE (mode_iter
.require ()) <= MIN (len
, 16))
26972 mode
= mode_iter
.require ();
26974 gcc_assert (mode
!= BLKmode
);
26976 mode_bytes
= GET_MODE_SIZE (mode
).to_constant ();
26980 /* Prefer Q-register accesses. */
26981 if (mode_bytes
== 16)
26984 src
= lowpart_subreg (mode
, src
, GET_MODE (val
));
26986 emit_move_insn (adjust_address (dst
, mode
, offset
), src
);
26988 offset
+= mode_bytes
;
26990 /* Emit trailing writes using overlapping unaligned accesses
26991 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26992 if (len
> 0 && len
< 16 && !STRICT_ALIGNMENT
)
26994 next_mode
= smallest_mode_for_size
26995 (len
* BITS_PER_UNIT
, MODE_INT
).require ();
26996 int n_bytes
= GET_MODE_SIZE (next_mode
).to_constant ();
26997 gcc_assert (n_bytes
<= mode_bytes
);
26998 offset
-= n_bytes
- len
;
/* Split a DImode store of a CONST_INT SRC to MEM DST as two
   SImode stores.  Handle the case when the constant has identical
   bottom and top halves.  This is beneficial when the two stores can be
   merged into an STP and we avoid synthesising potentially expensive
   immediates twice.  Return true if such a split is possible.  */

bool
aarch64_split_dimode_const_store (rtx dst, rtx src)
{
  rtx lo = gen_lowpart (SImode, src);
  rtx hi = gen_highpart_mode (SImode, DImode, src);

  if (!rtx_equal_p (lo, hi))
    return false;

  unsigned int orig_cost
    = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
  unsigned int lo_cost
    = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);

  /* We want to transform:
     MOV	x1, 49370
     MOVK	x1, 0x140, lsl 16
     MOVK	x1, 0xc0da, lsl 32
     MOVK	x1, 0x140, lsl 48
     STR	x1, [x0]
   into:
     MOV	w1, 49370
     MOVK	w1, 0x140, lsl 16
     STP	w1, w1, [x0]
   So we want to perform this when we save at least one instruction.  */
  if (orig_cost <= lo_cost)
    return false;

  rtx mem_lo = adjust_address (dst, SImode, 0);
  if (!aarch64_mem_pair_operand (mem_lo, SImode))
    return false;

  rtx tmp_reg = gen_reg_rtx (SImode);
  aarch64_expand_mov_immediate (tmp_reg, lo);
  rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
  /* Don't emit an explicit store pair as this may not be always profitable.
     Let the sched-fusion logic decide whether to merge them.  */
  emit_move_insn (mem_lo, tmp_reg);
  emit_move_insn (mem_hi, tmp_reg);

  return true;
}
27056 /* Generate RTL for a conditional branch with rtx comparison CODE in
27057 mode CC_MODE. The destination of the unlikely conditional branch
27061 aarch64_gen_unlikely_cbranch (enum rtx_code code
, machine_mode cc_mode
,
27065 x
= gen_rtx_fmt_ee (code
, VOIDmode
,
27066 gen_rtx_REG (cc_mode
, CC_REGNUM
),
27069 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
27070 gen_rtx_LABEL_REF (VOIDmode
, label_ref
),
27072 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx
, x
));
/* Generate DImode scratch registers for 128-bit (TImode) addition.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			    rtx *low_in1, rtx *low_in2,
			    rtx *high_dest, rtx *high_in1,
			    rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);
  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate DImode scratch registers for 128-bit (TImode) subtraction.

   OP1 represents the TImode destination operand 1
   OP2 represents the TImode destination operand 2
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */

void
aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
			     rtx *low_in1, rtx *low_in2,
			     rtx *high_dest, rtx *high_in1,
			     rtx *high_in2)
{
  *low_dest = gen_reg_rtx (DImode);
  *low_in1 = force_lowpart_subreg (DImode, op1, TImode);
  *low_in2 = force_lowpart_subreg (DImode, op2, TImode);
  *high_dest = gen_reg_rtx (DImode);

  *high_in1 = force_highpart_subreg (DImode, op1, TImode);
  *high_in2 = force_highpart_subreg (DImode, op2, TImode);
}
/* Generate RTL for 128-bit (TImode) subtraction with overflow.

   OP0 represents the TImode destination operand 0
   LOW_DEST represents the low half (DImode) of TImode operand 0
   LOW_IN1 represents the low half (DImode) of TImode operand 1
   LOW_IN2 represents the low half (DImode) of TImode operand 2
   HIGH_DEST represents the high half (DImode) of TImode operand 0
   HIGH_IN1 represents the high half (DImode) of TImode operand 1
   HIGH_IN2 represents the high half (DImode) of TImode operand 2
   UNSIGNED_P is true if the operation is being performed on unsigned
   values.  */

void
aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
		       rtx low_in2, rtx high_dest, rtx high_in1,
		       rtx high_in2, bool unsigned_p)
{
  if (low_in2 == const0_rtx)
    {
      low_dest = low_in1;
      high_in2 = force_reg (DImode, high_in2);
      if (unsigned_p)
	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
    }
  else
    {
      if (aarch64_plus_immediate (low_in2, DImode))
	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
					    GEN_INT (-UINTVAL (low_in2))));
      else
	{
	  low_in2 = force_reg (DImode, low_in2);
	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
	}
      high_in2 = force_reg (DImode, high_in2);

      if (unsigned_p)
	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
      else
	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
    }

  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
  emit_move_insn (gen_highpart (DImode, op0), high_dest);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */

static unsigned HOST_WIDE_INT
aarch64_asan_shadow_offset (void)
{
  if (TARGET_ILP32)
    return (HOST_WIDE_INT_1 << 29);
  else
    return (HOST_WIDE_INT_1 << 36);
}
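/* Note: 1 << 36 is the shadow offset used for the LP64 address-space
   layout, while the smaller 1 << 29 value is used for ILP32; whichever is
   chosen has to agree with the offset the ASan runtime expects.  */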
27187 aarch64_gen_ccmp_first (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
,
27188 rtx_code code
, tree treeop0
, tree treeop1
)
27190 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27192 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27194 struct expand_operand ops
[4];
27197 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27199 op_mode
= GET_MODE (op0
);
27200 if (op_mode
== VOIDmode
)
27201 op_mode
= GET_MODE (op1
);
27209 icode
= CODE_FOR_cmpsi
;
27214 icode
= CODE_FOR_cmpdi
;
27219 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27220 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpesf
: CODE_FOR_fcmpsf
;
27225 cc_mode
= aarch64_select_cc_mode ((rtx_code
) code
, op0
, op1
);
27226 icode
= cc_mode
== CCFPEmode
? CODE_FOR_fcmpedf
: CODE_FOR_fcmpdf
;
27234 op0
= prepare_operand (icode
, op0
, 0, op_mode
, cmp_mode
, unsignedp
);
27235 op1
= prepare_operand (icode
, op1
, 1, op_mode
, cmp_mode
, unsignedp
);
27241 *prep_seq
= get_insns ();
27244 create_fixed_operand (&ops
[0], op0
);
27245 create_fixed_operand (&ops
[1], op1
);
27248 if (!maybe_expand_insn (icode
, 2, ops
))
27253 *gen_seq
= get_insns ();
27256 return gen_rtx_fmt_ee ((rtx_code
) code
, cc_mode
,
27257 gen_rtx_REG (cc_mode
, CC_REGNUM
), const0_rtx
);
27261 aarch64_gen_ccmp_next (rtx_insn
**prep_seq
, rtx_insn
**gen_seq
, rtx prev
,
27262 rtx_code cmp_code
, tree treeop0
, tree treeop1
,
27265 rtx op0
, op1
, target
;
27266 machine_mode op_mode
, cmp_mode
, cc_mode
= CCmode
;
27267 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
27269 struct expand_operand ops
[6];
27272 push_to_sequence (*prep_seq
);
27273 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
27275 op_mode
= GET_MODE (op0
);
27276 if (op_mode
== VOIDmode
)
27277 op_mode
= GET_MODE (op1
);
27293 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27298 cc_mode
= aarch64_select_cc_mode ((rtx_code
) cmp_code
, op0
, op1
);
27306 icode
= code_for_ccmp (cc_mode
, cmp_mode
);
27308 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
27309 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
27315 *prep_seq
= get_insns ();
27318 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
27319 aarch64_cond
= aarch64_get_condition_code_1 (cc_mode
, (rtx_code
) cmp_code
);
27321 if (bit_code
!= AND
)
27323 /* Treat the ccmp patterns as canonical and use them where possible,
27324 but fall back to ccmp_rev patterns if there's no other option. */
27325 rtx_code prev_code
= GET_CODE (prev
);
27326 machine_mode prev_mode
= GET_MODE (XEXP (prev
, 0));
27327 if ((prev_mode
== CCFPmode
|| prev_mode
== CCFPEmode
)
27328 && !(prev_code
== EQ
27330 || prev_code
== ORDERED
27331 || prev_code
== UNORDERED
))
27332 icode
= code_for_ccmp_rev (cc_mode
, cmp_mode
);
27335 rtx_code code
= reverse_condition (prev_code
);
27336 prev
= gen_rtx_fmt_ee (code
, VOIDmode
, XEXP (prev
, 0), const0_rtx
);
27338 aarch64_cond
= AARCH64_INVERSE_CONDITION_CODE (aarch64_cond
);
27341 create_fixed_operand (&ops
[0], XEXP (prev
, 0));
27342 create_fixed_operand (&ops
[1], target
);
27343 create_fixed_operand (&ops
[2], op0
);
27344 create_fixed_operand (&ops
[3], op1
);
27345 create_fixed_operand (&ops
[4], prev
);
27346 create_fixed_operand (&ops
[5], GEN_INT (aarch64_cond
));
27348 push_to_sequence (*gen_seq
);
27349 if (!maybe_expand_insn (icode
, 6, ops
))
27355 *gen_seq
= get_insns ();
27358 return gen_rtx_fmt_ee ((rtx_code
) cmp_code
, VOIDmode
, target
, const0_rtx
);
27361 #undef TARGET_GEN_CCMP_FIRST
27362 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27364 #undef TARGET_GEN_CCMP_NEXT
27365 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
/* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
   instruction fusion of some sort.  */

static bool
aarch64_macro_fusion_p (void)
{
  return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
}
27377 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27378 should be kept together during scheduling. */
27381 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
27384 rtx prev_set
= single_set (prev
);
27385 rtx curr_set
= single_set (curr
);
27386 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27387 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
27389 if (!aarch64_macro_fusion_p ())
27392 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK
))
27394 /* We are trying to match:
27395 prev (mov) == (set (reg r0) (const_int imm16))
27396 curr (movk) == (set (zero_extract (reg r0)
27399 (const_int imm16_1)) */
27401 set_dest
= SET_DEST (curr_set
);
27403 if (GET_CODE (set_dest
) == ZERO_EXTRACT
27404 && CONST_INT_P (SET_SRC (curr_set
))
27405 && CONST_INT_P (SET_SRC (prev_set
))
27406 && CONST_INT_P (XEXP (set_dest
, 2))
27407 && INTVAL (XEXP (set_dest
, 2)) == 16
27408 && REG_P (XEXP (set_dest
, 0))
27409 && REG_P (SET_DEST (prev_set
))
27410 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
27416 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD
))
27419 /* We're trying to match:
27420 prev (adrp) == (set (reg r1)
27421 (high (symbol_ref ("SYM"))))
27422 curr (add) == (set (reg r0)
27424 (symbol_ref ("SYM"))))
27425 Note that r0 need not necessarily be the same as r1, especially
27426 during pre-regalloc scheduling. */
27428 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27429 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27431 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
27432 && REG_P (XEXP (SET_SRC (curr_set
), 0))
27433 && REGNO (XEXP (SET_SRC (curr_set
), 0))
27434 == REGNO (SET_DEST (prev_set
))
27435 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
27436 XEXP (SET_SRC (curr_set
), 1)))
27441 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK
))
27444 /* We're trying to match:
27445 prev (movk) == (set (zero_extract (reg r0)
27448 (const_int imm16_1))
27449 curr (movk) == (set (zero_extract (reg r0)
27452 (const_int imm16_2)) */
27454 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
27455 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
27456 && REG_P (XEXP (SET_DEST (prev_set
), 0))
27457 && REG_P (XEXP (SET_DEST (curr_set
), 0))
27458 && REGNO (XEXP (SET_DEST (prev_set
), 0))
27459 == REGNO (XEXP (SET_DEST (curr_set
), 0))
27460 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
27461 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
27462 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
27463 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
27464 && CONST_INT_P (SET_SRC (prev_set
))
27465 && CONST_INT_P (SET_SRC (curr_set
)))
27469 if (simple_sets_p
&& aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR
))
27471 /* We're trying to match:
27472 prev (adrp) == (set (reg r0)
27473 (high (symbol_ref ("SYM"))))
27474 curr (ldr) == (set (reg r1)
27475 (mem (lo_sum (reg r0)
27476 (symbol_ref ("SYM")))))
27478 curr (ldr) == (set (reg r1)
27481 (symbol_ref ("SYM")))))) */
27482 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
27483 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
27485 rtx curr_src
= SET_SRC (curr_set
);
27487 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
27488 curr_src
= XEXP (curr_src
, 0);
27490 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
27491 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
27492 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
27493 == REGNO (SET_DEST (prev_set
))
27494 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
27495 XEXP (SET_SRC (prev_set
), 0)))
27500 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27501 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH
)
27502 && prev_set
&& curr_set
&& any_condjump_p (curr
)
27503 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27504 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27505 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27508 /* Fuse CMP and CSEL/CSET. */
27509 if (prev_set
&& curr_set
27510 && GET_CODE (SET_SRC (prev_set
)) == COMPARE
27511 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set
), 0)))
27512 && reg_referenced_p (SET_DEST (prev_set
), PATTERN (curr
)))
27514 enum attr_type prev_type
= get_attr_type (prev
);
27515 if ((prev_type
== TYPE_ALUS_SREG
|| prev_type
== TYPE_ALUS_IMM
)
27516 && ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL
)
27517 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27518 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 1), VOIDmode
)
27519 && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set
), 2), VOIDmode
)
27520 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set
), 1))))
27521 || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET
)
27522 && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set
)))
27524 && REG_P (SET_DEST (curr_set
)))))
27528 /* Fuse flag-setting ALU instructions and conditional branch. */
27529 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH
)
27530 && any_condjump_p (curr
))
27532 unsigned int condreg1
, condreg2
;
27534 aarch64_fixed_condition_code_regs (&condreg1
, &condreg2
);
27535 cc_reg_1
= gen_rtx_REG (CCmode
, condreg1
);
27537 if (reg_referenced_p (cc_reg_1
, PATTERN (curr
))
27539 && modified_in_p (cc_reg_1
, prev
))
27541 enum attr_type prev_type
= get_attr_type (prev
);
	  /* FIXME: this misses some instructions which are considered
	     simple arithmetic for ThunderX.  Simple shifts are missed
	     here.  */
27545 if (prev_type
== TYPE_ALUS_SREG
27546 || prev_type
== TYPE_ALUS_IMM
27547 || prev_type
== TYPE_LOGICS_REG
27548 || prev_type
== TYPE_LOGICS_IMM
)
27553 /* Fuse ALU instructions and CBZ/CBNZ. */
27556 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ
)
27557 && any_condjump_p (curr
))
27559 /* We're trying to match:
27560 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27561 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27563 (label_ref ("SYM"))
27565 if (SET_DEST (curr_set
) == (pc_rtx
)
27566 && GET_CODE (SET_SRC (curr_set
)) == IF_THEN_ELSE
27567 && REG_P (XEXP (XEXP (SET_SRC (curr_set
), 0), 0))
27568 && REG_P (SET_DEST (prev_set
))
27569 && REGNO (SET_DEST (prev_set
))
27570 == REGNO (XEXP (XEXP (SET_SRC (curr_set
), 0), 0)))
27572 /* Fuse ALU operations followed by conditional branch instruction. */
27573 switch (get_attr_type (prev
))
27576 case TYPE_ALU_SREG
:
27579 case TYPE_ADCS_REG
:
27580 case TYPE_ADCS_IMM
:
27581 case TYPE_LOGIC_REG
:
27582 case TYPE_LOGIC_IMM
:
27586 case TYPE_SHIFT_REG
:
27587 case TYPE_SHIFT_IMM
:
27599 /* Fuse A+B+1 and A-B-1 */
27601 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1
))
27603 /* We're trying to match:
27604 prev == (set (r0) (plus (r0) (r1)))
27605 curr == (set (r0) (plus (r0) (const_int 1)))
27607 prev == (set (r0) (minus (r0) (r1)))
27608 curr == (set (r0) (plus (r0) (const_int -1))) */
27610 rtx prev_src
= SET_SRC (prev_set
);
27611 rtx curr_src
= SET_SRC (curr_set
);
27614 if (GET_CODE (prev_src
) == MINUS
)
27617 if (GET_CODE (curr_src
) == PLUS
27618 && (GET_CODE (prev_src
) == PLUS
|| GET_CODE (prev_src
) == MINUS
)
27619 && CONST_INT_P (XEXP (curr_src
, 1))
27620 && INTVAL (XEXP (curr_src
, 1)) == polarity
27621 && REG_P (XEXP (curr_src
, 0))
27622 && REG_P (SET_DEST (prev_set
))
27623 && REGNO (SET_DEST (prev_set
)) == REGNO (XEXP (curr_src
, 0)))
/* Return true iff the instruction fusion described by OP is enabled.  */

bool
aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
{
  return (aarch64_tune_params.fusible_ops & op) != 0;
}
/* If MEM is in the form of [base+offset], extract the two parts
   of address and set to BASE and OFFSET, otherwise return false
   after clearing BASE and OFFSET.  */

static bool
extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
{
  rtx addr;

  gcc_assert (MEM_P (mem));

  addr = XEXP (mem, 0);

  if (REG_P (addr))
    {
      *base = addr;
      *offset = const0_rtx;
      return true;
    }

  if (GET_CODE (addr) == PLUS
      && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
    {
      *base = XEXP (addr, 0);
      *offset = XEXP (addr, 1);
      return true;
    }

  *base = NULL_RTX;
  *offset = NULL_RTX;

  return false;
}
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
27683 /* If INSN is a load or store of address in the form of [base+offset],
27684 extract the two parts and set to BASE and OFFSET. Return scheduling
27685 fusion type this INSN is. */
27687 static enum sched_fusion_type
27688 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
27691 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
27693 gcc_assert (INSN_P (insn
));
27694 x
= PATTERN (insn
);
27695 if (GET_CODE (x
) != SET
)
27696 return SCHED_FUSION_NONE
;
27699 dest
= SET_DEST (x
);
27701 machine_mode dest_mode
= GET_MODE (dest
);
27703 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode
))
27704 return SCHED_FUSION_NONE
;
27706 if (GET_CODE (src
) == SIGN_EXTEND
)
27708 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
27709 src
= XEXP (src
, 0);
27710 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27711 return SCHED_FUSION_NONE
;
27713 else if (GET_CODE (src
) == ZERO_EXTEND
)
27715 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
27716 src
= XEXP (src
, 0);
27717 if (!MEM_P (src
) || GET_MODE (src
) != SImode
)
27718 return SCHED_FUSION_NONE
;
27721 if (MEM_P (src
) && REG_P (dest
))
27722 extract_base_offset_in_addr (src
, base
, offset
);
27723 else if (MEM_P (dest
) && (REG_P (src
) || src
== const0_rtx
))
27725 fusion
= SCHED_FUSION_ST
;
27726 extract_base_offset_in_addr (dest
, base
, offset
);
27729 return SCHED_FUSION_NONE
;
27731 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
27732 fusion
= SCHED_FUSION_NONE
;
27737 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
   Currently we only support fusing ldr or str instructions, so FUSION_PRI
   and PRI are only calculated for these instructions.  For other
   instructions, FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the
   future, other kinds of instruction fusion can be added by returning
   different priorities.
27744 It's important that irrelevant instructions get the largest FUSION_PRI. */
27747 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
27748 int *fusion_pri
, int *pri
)
27752 enum sched_fusion_type fusion
;
27754 gcc_assert (INSN_P (insn
));
27757 fusion
= fusion_load_store (insn
, &base
, &offset
);
27758 if (fusion
== SCHED_FUSION_NONE
)
27765 /* Set FUSION_PRI according to fusion type and base register. */
27766 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
27768 /* Calculate PRI. */
27771 /* INSN with smaller offset goes first. */
27772 off_val
= (int)(INTVAL (offset
));
27774 tmp
-= (off_val
& 0xfffff);
27776 tmp
+= ((- off_val
) & 0xfffff);
27782 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27783 Adjust priority of sha1h instructions so they are scheduled before
27784 other SHA1 instructions. */
27787 aarch64_sched_adjust_priority (rtx_insn
*insn
, int priority
)
27789 rtx x
= PATTERN (insn
);
27791 if (GET_CODE (x
) == SET
)
27795 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_SHA1H
)
27796 return priority
+ 10;
27802 /* If REVERSED is null, return true if memory reference *MEM2 comes
27803 immediately after memory reference *MEM1. Do not change the references
27806 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27807 if they are, try to make them use constant offsets from the same base
27808 register. Return true on success. When returning true, set *REVERSED
27809 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27811 aarch64_check_consecutive_mems (rtx
*mem1
, rtx
*mem2
, bool *reversed
)
27816 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1
, 0))) == RTX_AUTOINC
27817 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2
, 0))) == RTX_AUTOINC
)
27820 if (!MEM_SIZE_KNOWN_P (*mem1
) || !MEM_SIZE_KNOWN_P (*mem2
))
27823 auto size1
= MEM_SIZE (*mem1
);
27824 auto size2
= MEM_SIZE (*mem2
);
27826 rtx base1
, base2
, offset1
, offset2
;
27827 extract_base_offset_in_addr (*mem1
, &base1
, &offset1
);
27828 extract_base_offset_in_addr (*mem2
, &base2
, &offset2
);
27830 /* Make sure at least one memory is in base+offset form. */
27831 if (!(base1
&& offset1
) && !(base2
&& offset2
))
27834 /* If both mems already use the same base register, just check the
27836 if (base1
&& base2
&& rtx_equal_p (base1
, base2
))
27838 if (!offset1
|| !offset2
)
27841 if (known_eq (UINTVAL (offset1
) + size1
, UINTVAL (offset2
)))
27844 if (known_eq (UINTVAL (offset2
) + size2
, UINTVAL (offset1
)) && reversed
)
27853 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27854 guarantee that the values are consecutive. */
27855 if (MEM_EXPR (*mem1
)
27856 && MEM_EXPR (*mem2
)
27857 && MEM_OFFSET_KNOWN_P (*mem1
)
27858 && MEM_OFFSET_KNOWN_P (*mem2
))
27860 poly_int64 expr_offset1
;
27861 poly_int64 expr_offset2
;
27862 tree expr_base1
= get_addr_base_and_unit_offset (MEM_EXPR (*mem1
),
27864 tree expr_base2
= get_addr_base_and_unit_offset (MEM_EXPR (*mem2
),
27868 || !DECL_P (expr_base1
)
27869 || !operand_equal_p (expr_base1
, expr_base2
, OEP_ADDRESS_OF
))
27872 expr_offset1
+= MEM_OFFSET (*mem1
);
27873 expr_offset2
+= MEM_OFFSET (*mem2
);
27875 if (known_eq (expr_offset1
+ size1
, expr_offset2
))
27877 else if (known_eq (expr_offset2
+ size2
, expr_offset1
) && reversed
)
27886 rtx addr1
= plus_constant (Pmode
, XEXP (*mem2
, 0),
27887 expr_offset1
- expr_offset2
);
27888 *mem1
= replace_equiv_address_nv (*mem1
, addr1
);
27892 rtx addr2
= plus_constant (Pmode
, XEXP (*mem1
, 0),
27893 expr_offset2
- expr_offset1
);
27894 *mem2
= replace_equiv_address_nv (*mem2
, addr2
);
/* Test if MODE is suitable for a single transfer register in an ldp or stp
   instruction.  */

bool
aarch64_ldpstp_operand_mode_p (machine_mode mode)
{
  if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
      || hard_regno_nregs (V0_REGNUM, mode) > 1)
    return false;

  const auto size = GET_MODE_SIZE (mode);
  return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
}
/* Return true if MEM1 and MEM2 can be combined into a single access
   of mode MODE, with the combined access having the same address as MEM1.  */

bool
aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
{
  if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
    return false;
  return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
}
/* Return true if MEM agrees with the ldp-stp policy model.
   Otherwise, false.  */

bool
aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
{
  auto policy = (load
		 ? aarch64_tune_params.ldp_policy_model
		 : aarch64_tune_params.stp_policy_model);

  /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair.  */
  if (policy == AARCH64_LDP_STP_POLICY_NEVER)
    return false;

  /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
     do not emit the load pair unless the alignment is checked to be
     at least double the alignment of the type.  */
  if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
      && !optimize_function_for_size_p (cfun)
      && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
    return false;

  return true;
}
27953 /* Given OPERANDS of consecutive load/store, check if we can merge
27954 them into ldp/stp. LOAD is true if they are load instructions. */
27957 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
)
27959 enum reg_class rclass_1
, rclass_2
;
27960 rtx mem_1
, mem_2
, reg_1
, reg_2
;
27964 mem_1
= operands
[1];
27965 mem_2
= operands
[3];
27966 reg_1
= operands
[0];
27967 reg_2
= operands
[2];
27968 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
27969 if (REGNO (reg_1
) == REGNO (reg_2
))
27971 if (reg_overlap_mentioned_p (reg_1
, mem_2
))
27976 mem_1
= operands
[0];
27977 mem_2
= operands
[2];
27978 reg_1
= operands
[1];
27979 reg_2
= operands
[3];
27982 /* The mems cannot be volatile. */
27983 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
27986 /* Check if the addresses are in the form of [base+offset]. */
27987 bool reversed
= false;
27988 if (!aarch64_check_consecutive_mems (&mem_1
, &mem_2
, &reversed
))
27991 /* The operands must be of the same size. */
27992 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1
)),
27993 GET_MODE_SIZE (GET_MODE (mem_2
))));
27995 /* The lower memory access must be a mem-pair operand. */
27996 rtx lower_mem
= reversed
? mem_2
: mem_1
;
27997 machine_mode lower_mem_mode
= GET_MODE (lower_mem
);
27998 if (!aarch64_mem_pair_operand (lower_mem
, lower_mem_mode
))
28001 /* Check if lower_mem is ok with the ldp-stp policy model. */
28002 if (!aarch64_mem_ok_with_ldpstp_policy_model (lower_mem
, load
,
28006 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
28007 rclass_1
= FP_REGS
;
28009 rclass_1
= GENERAL_REGS
;
28011 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
28012 rclass_2
= FP_REGS
;
28014 rclass_2
= GENERAL_REGS
;
28016 /* Check if the registers are of same class. */
28017 if (rclass_1
!= rclass_2
)
28023 /* Given OPERANDS of consecutive load/store that can be merged,
28024 swap them if they are not in ascending order. */
28026 aarch64_swap_ldrstr_operands (rtx
* operands
, bool load
)
28028 int mem_op
= load
? 1 : 0;
28029 bool reversed
= false;
28030 if (!aarch64_check_consecutive_mems (operands
+ mem_op
,
28031 operands
+ mem_op
+ 2, &reversed
))
28032 gcc_unreachable ();
28036 /* Irrespective of whether this is a load or a store,
28037 we do the same swap. */
28038 std::swap (operands
[0], operands
[2]);
28039 std::swap (operands
[1], operands
[3]);
28043 /* Helper function used for generation of load/store pair instructions, called
28044 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
28045 operands as matched by the peepholes in that file. LOAD_P is true if we're
28046 generating a load pair, otherwise we're generating a store pair. CODE is
28047 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
28048 standard load/store pair. */
28051 aarch64_finish_ldpstp_peephole (rtx
*operands
, bool load_p
, enum rtx_code code
)
28053 aarch64_swap_ldrstr_operands (operands
, load_p
);
28056 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28057 operands
[1], code
));
28060 gcc_assert (code
== UNKNOWN
);
28061 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
   comparison between the two.  */

static int
aarch64_host_wide_int_compare (const void *x, const void *y)
{
  return wi::cmps (* ((const HOST_WIDE_INT *) x),
		   * ((const HOST_WIDE_INT *) y));
}
28075 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
28076 other pointing to a REG rtx containing an offset, compare the offsets
28081 1 iff offset (X) > offset (Y)
28082 0 iff offset (X) == offset (Y)
28083 -1 iff offset (X) < offset (Y) */
28085 aarch64_ldrstr_offset_compare (const void *x
, const void *y
)
28087 const rtx
* operands_1
= (const rtx
*) x
;
28088 const rtx
* operands_2
= (const rtx
*) y
;
28089 rtx mem_1
, mem_2
, base
, offset_1
, offset_2
;
28091 if (MEM_P (operands_1
[0]))
28092 mem_1
= operands_1
[0];
28094 mem_1
= operands_1
[1];
28096 if (MEM_P (operands_2
[0]))
28097 mem_2
= operands_2
[0];
28099 mem_2
= operands_2
[1];
28101 /* Extract the offsets. */
28102 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28103 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28105 gcc_assert (offset_1
!= NULL_RTX
&& offset_2
!= NULL_RTX
);
28107 return wi::cmps (INTVAL (offset_1
), INTVAL (offset_2
));
28110 /* Given OPERANDS of consecutive load/store, check if we can merge
28111 them into ldp/stp by adjusting the offset. LOAD is true if they
28112 are load instructions. MODE is the mode of memory operands.
28114 Given below consecutive stores:
28116 str w1, [xb, 0x100]
28117 str w1, [xb, 0x104]
28118 str w1, [xb, 0x108]
28119 str w1, [xb, 0x10c]
28121 Though the offsets are out of the range supported by stp, we can
28122 still pair them after adjusting the offset, like:
28124 add scratch, xb, 0x100
28125 stp w1, w1, [scratch]
28126 stp w1, w1, [scratch, 0x8]
   The peephole patterns detecting this opportunity should guarantee
   the scratch register is available.  */
28132 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
28135 const int num_insns
= 4;
28136 enum reg_class rclass
;
28137 HOST_WIDE_INT offvals
[num_insns
], msize
;
28138 rtx mem
[num_insns
], reg
[num_insns
], base
[num_insns
], offset
[num_insns
];
28142 for (int i
= 0; i
< num_insns
; i
++)
28144 reg
[i
] = operands
[2 * i
];
28145 mem
[i
] = operands
[2 * i
+ 1];
28147 gcc_assert (REG_P (reg
[i
]));
28150 /* Do not attempt to merge the loads if the loads clobber each other. */
28151 for (int i
= 0; i
< 8; i
+= 2)
28152 for (int j
= i
+ 2; j
< 8; j
+= 2)
28153 if (reg_overlap_mentioned_p (operands
[i
], operands
[j
]))
28157 for (int i
= 0; i
< num_insns
; i
++)
28159 mem
[i
] = operands
[2 * i
];
28160 reg
[i
] = operands
[2 * i
+ 1];
28163 /* Skip if memory operand is by itself valid for ldp/stp. */
28164 if (!MEM_P (mem
[0]) || aarch64_mem_pair_operand (mem
[0], mode
))
28167 for (int i
= 0; i
< num_insns
; i
++)
28169 /* The mems cannot be volatile. */
28170 if (MEM_VOLATILE_P (mem
[i
]))
28173 /* Check if the addresses are in the form of [base+offset]. */
28174 extract_base_offset_in_addr (mem
[i
], base
+ i
, offset
+ i
);
28175 if (base
[i
] == NULL_RTX
|| offset
[i
] == NULL_RTX
)
28179 /* Check if the registers are of same class. */
28180 rclass
= REG_P (reg
[0]) && FP_REGNUM_P (REGNO (reg
[0]))
28181 ? FP_REGS
: GENERAL_REGS
;
28183 for (int i
= 1; i
< num_insns
; i
++)
28184 if (REG_P (reg
[i
]) && FP_REGNUM_P (REGNO (reg
[i
])))
28186 if (rclass
!= FP_REGS
)
28191 if (rclass
!= GENERAL_REGS
)
28195 /* Only the last register in the order in which they occur
28196 may be clobbered by the load. */
28197 if (rclass
== GENERAL_REGS
&& load
)
28198 for (int i
= 0; i
< num_insns
- 1; i
++)
28199 if (reg_mentioned_p (reg
[i
], mem
[i
]))
28202 /* Check if the bases are same. */
28203 for (int i
= 0; i
< num_insns
- 1; i
++)
28204 if (!rtx_equal_p (base
[i
], base
[i
+ 1]))
28207 for (int i
= 0; i
< num_insns
; i
++)
28208 offvals
[i
] = INTVAL (offset
[i
]);
28210 msize
= GET_MODE_SIZE (mode
).to_constant ();
28212 /* Check if the offsets can be put in the right order to do a ldp/stp. */
28213 qsort (offvals
, num_insns
, sizeof (HOST_WIDE_INT
),
28214 aarch64_host_wide_int_compare
);
28216 if (!(offvals
[1] == offvals
[0] + msize
28217 && offvals
[3] == offvals
[2] + msize
))
  /* Check that offsets are within range of each other.  The ldp/stp
     instructions have 7 bit immediate offsets, so use 0x80.  */
  if (offvals[2] - offvals[0] >= msize * 0x80)
    return false;
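  /* For example, with msize == 4 (SImode accesses) each ldp/stp immediate
     can reach offsets in [-64 * 4, 63 * 4]; if the sorted pairs start
     4 * 0x80 == 512 bytes or more apart, no single adjusted base can serve
     both pairs, so the transformation is rejected.  */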
28225 /* The offsets must be aligned with respect to each other. */
28226 if (offvals
[0] % msize
!= offvals
[2] % msize
)
28229 /* Check if mem[0] is ok with the ldp-stp policy model. */
28230 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem
[0], load
, mode
))
28236 /* Given OPERANDS of consecutive load/store, this function pairs them
28237 into LDP/STP after adjusting the offset. It depends on the fact
28238 that the operands can be sorted so the offsets are correct for STP.
28239 MODE is the mode of memory operands. CODE is the rtl operator
28240 which should be applied to all memory operands, it's SIGN_EXTEND,
28241 ZERO_EXTEND or UNKNOWN. */
28244 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
28245 machine_mode mode
, RTX_CODE code
)
28247 rtx base
, offset_1
, offset_2
;
28249 rtx temp_operands
[8];
28250 HOST_WIDE_INT off_val_1
, off_val_2
, base_off
, new_off_1
, new_off_2
,
28251 stp_off_upper_limit
, stp_off_lower_limit
, msize
;
28253 /* We make changes on a copy as we may still bail out. */
28254 for (int i
= 0; i
< 8; i
++)
28255 temp_operands
[i
] = operands
[i
];
  /* Sort the operands.  When two of the accesses use the same offset (say
     two stores to [base + 0x320]), we need a stable sort, otherwise the
     wrong data may be stored to offset 0x320.  Also note the dead store in
     that case should be optimized away, but there are no guarantees here.  */
28265 gcc_stablesort(temp_operands
, 4, 2 * sizeof (rtx
*),
28266 aarch64_ldrstr_offset_compare
);
28268 /* Copy the memory operands so that if we have to bail for some
28269 reason the original addresses are unchanged. */
28272 mem_1
= copy_rtx (temp_operands
[1]);
28273 mem_2
= copy_rtx (temp_operands
[5]);
28277 mem_1
= copy_rtx (temp_operands
[0]);
28278 mem_2
= copy_rtx (temp_operands
[4]);
28279 gcc_assert (code
== UNKNOWN
);
28282 extract_base_offset_in_addr (mem_1
, &base
, &offset_1
);
28283 extract_base_offset_in_addr (mem_2
, &base
, &offset_2
);
28284 gcc_assert (base
!= NULL_RTX
&& offset_1
!= NULL_RTX
28285 && offset_2
!= NULL_RTX
);
28287 /* Adjust offset so it can fit in LDP/STP instruction. */
28288 msize
= GET_MODE_SIZE (mode
).to_constant();
28289 stp_off_upper_limit
= msize
* (0x40 - 1);
28290 stp_off_lower_limit
= - msize
* 0x40;
28292 off_val_1
= INTVAL (offset_1
);
28293 off_val_2
= INTVAL (offset_2
);
28295 /* The base offset is optimally half way between the two STP/LDP offsets. */
28297 base_off
= (off_val_1
+ off_val_2
) / 2;
  /* However, due to issues with negative LDP/STP offset generation for
     larger modes (DF, DD, DI and vector modes), we must not use negative
     addresses smaller than 9 signed unadjusted bits can store.  This
     provides the most range in this case.  */
28303 base_off
= off_val_1
;
28305 /* Adjust the base so that it is aligned with the addresses but still
28307 if (base_off
% msize
!= off_val_1
% msize
)
28308 /* Fix the offset, bearing in mind we want to make it bigger not
28310 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28311 else if (msize
<= 4)
28312 /* The negative range of LDP/STP is one larger than the positive range. */
28315 /* Check if base offset is too big or too small. We can attempt to resolve
28316 this issue by setting it to the maximum value and seeing if the offsets
28318 if (base_off
>= 0x1000)
28320 base_off
= 0x1000 - 1;
28321 /* We must still make sure that the base offset is aligned with respect
28322 to the address. But it may not be made any bigger. */
28323 base_off
-= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28326 /* Likewise for the case where the base is too small. */
28327 if (base_off
<= -0x1000)
28329 base_off
= -0x1000 + 1;
28330 base_off
+= (((base_off
% msize
) - (off_val_1
% msize
)) + msize
) % msize
;
28333 /* Offset of the first STP/LDP. */
28334 new_off_1
= off_val_1
- base_off
;
28336 /* Offset of the second STP/LDP. */
28337 new_off_2
= off_val_2
- base_off
;
28339 /* The offsets must be within the range of the LDP/STP instructions. */
28340 if (new_off_1
> stp_off_upper_limit
|| new_off_1
< stp_off_lower_limit
28341 || new_off_2
> stp_off_upper_limit
|| new_off_2
< stp_off_lower_limit
)
28344 replace_equiv_address_nv (mem_1
, plus_constant (Pmode
, operands
[8],
28346 replace_equiv_address_nv (mem_2
, plus_constant (Pmode
, operands
[8],
28349 if (!aarch64_mem_pair_operand (mem_1
, mode
)
28350 || !aarch64_mem_pair_operand (mem_2
, mode
))
28355 operands
[0] = temp_operands
[0];
28356 operands
[1] = mem_1
;
28357 operands
[2] = temp_operands
[2];
28358 operands
[4] = temp_operands
[4];
28359 operands
[5] = mem_2
;
28360 operands
[6] = temp_operands
[6];
28364 operands
[0] = mem_1
;
28365 operands
[1] = temp_operands
[1];
28366 operands
[3] = temp_operands
[3];
28367 operands
[4] = mem_2
;
28368 operands
[5] = temp_operands
[5];
28369 operands
[7] = temp_operands
[7];
28372 /* Emit adjusting instruction. */
28373 emit_insn (gen_rtx_SET (operands
[8], plus_constant (DImode
, base
, base_off
)));
28374 /* Emit ldp/stp instructions. */
28377 emit_insn (aarch64_gen_load_pair (operands
[0], operands
[2],
28378 operands
[1], code
));
28379 emit_insn (aarch64_gen_load_pair (operands
[4], operands
[6],
28380 operands
[5], code
));
28384 emit_insn (aarch64_gen_store_pair (operands
[0], operands
[1],
28386 emit_insn (aarch64_gen_store_pair (operands
[4], operands
[5],
28392 /* Implement TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE. Assume that
28393 predicated operations when available are beneficial. */
28396 aarch64_conditional_operation_is_expensive (unsigned)
28401 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28402 it isn't worth branching around empty masked ops (including masked
28406 aarch64_empty_mask_is_expensive (unsigned)
28411 /* Return 1 if pseudo register should be created and used to hold
28412 GOT address for PIC code. */
28415 aarch64_use_pseudo_pic_reg (void)
28417 return aarch64_cmodel
== AARCH64_CMODEL_SMALL_SPIC
;
28420 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28423 aarch64_unspec_may_trap_p (const_rtx x
, unsigned flags
)
28425 switch (XINT (x
, 1))
28427 case UNSPEC_GOTSMALLPIC
:
28428 case UNSPEC_GOTSMALLPIC28K
:
28429 case UNSPEC_GOTTINYPIC
:
28435 return default_unspec_may_trap_p (x
, flags
);
/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}
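/* For example, aarch64_fpconst_pow_of_2 returns 3 for a CONST_DOUBLE of
   8.0 (exact_log2 (8) == 3) and -1 for 0.75, since 0.75 is not an integer
   value.  */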
/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
   power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
   x == 1/2^n return n.  Otherwise return -1.  */
28466 aarch64_fpconst_pow2_recip (rtx x
)
28468 REAL_VALUE_TYPE r0
;
28470 if (!CONST_DOUBLE_P (x
))
28473 r0
= *CONST_DOUBLE_REAL_VALUE (x
);
28474 if (exact_real_inverse (DFmode
, &r0
)
28475 && !REAL_VALUE_NEGATIVE (r0
))
28477 int ret
= exact_log2 (real_to_integer (&r0
));
28478 if (ret
>= 1 && ret
<= 32)
28484 /* If X is a vector of equal CONST_DOUBLE values and that value is
28485 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28488 aarch64_vec_fpconst_pow_of_2 (rtx x
)
28491 if (!CONST_VECTOR_P (x
)
28492 || !CONST_VECTOR_NUNITS (x
).is_constant (&nelts
))
28495 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
28498 int firstval
= aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, 0));
28502 for (int i
= 1; i
< nelts
; i
++)
28503 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x
, i
)) != firstval
)
28509 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28512 __fp16 always promotes through this hook.
28513 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28514 through the generic excess precision logic rather than here. */
28517 aarch64_promoted_type (const_tree t
)
28519 if (SCALAR_FLOAT_TYPE_P (t
)
28520 && TYPE_MAIN_VARIANT (t
) == aarch64_fp16_type_node
)
28521 return float_type_node
;
28526 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28529 aarch64_optab_supported_p (int op
, machine_mode mode1
, machine_mode
,
28530 optimization_type opt_type
)
28535 return opt_type
== OPTIMIZE_FOR_SPEED
&& use_rsqrt_p (mode1
);
28542 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28544 static unsigned int
28545 aarch64_dwarf_poly_indeterminate_value (unsigned int i
, unsigned int *factor
,
28548 /* Polynomial invariant 1 == (VG / 2) - 1. */
28549 gcc_assert (i
== 1);
28552 return AARCH64_DWARF_VG
;
28555 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28556 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28559 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode
)
28561 return ((mode
== HFmode
|| mode
== BFmode
)
28563 : default_libgcc_floating_mode_supported_p (mode
));
28566 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28567 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28570 aarch64_scalar_mode_supported_p (scalar_mode mode
)
28572 if (DECIMAL_FLOAT_MODE_P (mode
))
28573 return default_decimal_float_supported_p ();
28575 return ((mode
== HFmode
|| mode
== BFmode
)
28577 : default_scalar_mode_supported_p (mode
));
28580 /* Set the value of FLT_EVAL_METHOD.
28581 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28583 0: evaluate all operations and constants, whose semantic type has at
28584 most the range and precision of type float, to the range and
28585 precision of float; evaluate all other operations and constants to
28586 the range and precision of the semantic type;
28588 N, where _FloatN is a supported interchange floating type
28589 evaluate all operations and constants, whose semantic type has at
28590 most the range and precision of _FloatN type, to the range and
28591 precision of the _FloatN type; evaluate all other operations and
28592 constants to the range and precision of the semantic type;
   If we have the ARMv8.2-A extensions then we support _Float16 in native
   precision, so we should set this to 16.  Otherwise, we support the type,
   but want to evaluate expressions in float precision, so set this to 0.  */
28599 static enum flt_eval_method
28600 aarch64_excess_precision (enum excess_precision_type type
)
28604 case EXCESS_PRECISION_TYPE_FAST
:
28605 case EXCESS_PRECISION_TYPE_STANDARD
:
28606 /* We can calculate either in 16-bit range and precision or
28607 32-bit range and precision. Make that decision based on whether
28608 we have native support for the ARMv8.2-A 16-bit floating-point
28609 instructions or not. */
28610 return (TARGET_FP_F16INST
28611 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28612 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
);
28613 case EXCESS_PRECISION_TYPE_IMPLICIT
:
28614 case EXCESS_PRECISION_TYPE_FLOAT16
:
28615 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
;
28617 gcc_unreachable ();
28619 return FLT_EVAL_METHOD_UNPREDICTABLE
;
28622 /* Implement TARGET_C_BITINT_TYPE_INFO.
28623 Return true if _BitInt(N) is supported and fill its details into *INFO. */
28625 aarch64_bitint_type_info (int n
, struct bitint_info
*info
)
28627 if (TARGET_BIG_END
)
28631 info
->limb_mode
= QImode
;
28633 info
->limb_mode
= HImode
;
28635 info
->limb_mode
= SImode
;
28637 info
->limb_mode
= DImode
;
28639 info
->limb_mode
= TImode
;
28641 /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
28642 type {signed,unsigned} __int128[M] where M*128 >= N. However, to be
28643 able to use libgcc's implementation to support large _BitInt's we need
28644 to use a LIMB_MODE that is no larger than 'long long'. This is why we
28645 use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
28646 be TImode to ensure we are ABI compliant. */
28647 info
->limb_mode
= DImode
;
28650 info
->abi_limb_mode
= TImode
;
28652 info
->abi_limb_mode
= info
->limb_mode
;
28653 info
->big_endian
= TARGET_BIG_END
;
28654 info
->extended
= false;
28658 /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for
28659 TI_LONG_DOUBLE_TYPE which is for long double type, go with the default
28660 one for the others. */
28662 static machine_mode
28663 aarch64_c_mode_for_floating_type (enum tree_index ti
)
28665 if (ti
== TI_LONG_DOUBLE_TYPE
)
28667 return default_mode_for_floating_type (ti
);
28670 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28671 scheduled for speculative execution. Reject the long-running division
28672 and square-root instructions. */
28675 aarch64_sched_can_speculate_insn (rtx_insn
*insn
)
28677 switch (get_attr_type (insn
))
28685 case TYPE_NEON_FP_SQRT_S
:
28686 case TYPE_NEON_FP_SQRT_D
:
28687 case TYPE_NEON_FP_SQRT_S_Q
:
28688 case TYPE_NEON_FP_SQRT_D_Q
:
28689 case TYPE_NEON_FP_DIV_S
:
28690 case TYPE_NEON_FP_DIV_D
:
28691 case TYPE_NEON_FP_DIV_S_Q
:
28692 case TYPE_NEON_FP_DIV_D_Q
:
28699 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28702 aarch64_compute_pressure_classes (reg_class
*classes
)
28705 classes
[i
++] = GENERAL_REGS
;
28706 classes
[i
++] = FP_REGS
;
28707 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28708 registers need to go in PR_LO_REGS at some point during their
28709 lifetime. Splitting it into two halves has the effect of making
28710 all predicates count against PR_LO_REGS, so that we try whenever
28711 possible to restrict the number of live predicates to 8. This
28712 greatly reduces the amount of spilling in certain loops. */
28713 classes
[i
++] = PR_LO_REGS
;
28714 classes
[i
++] = PR_HI_REGS
;
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  return aarch64_modes_compatible_p (from, to);
}
28727 /* Implement TARGET_EARLY_REMAT_MODES. */
28730 aarch64_select_early_remat_modes (sbitmap modes
)
28732 /* SVE values are not normally live across a call, so it should be
28733 worth doing early rematerialization even in VL-specific mode. */
28734 for (int i
= 0; i
< NUM_MACHINE_MODES
; ++i
)
28735 if (aarch64_sve_mode_p ((machine_mode
) i
))
28736 bitmap_set_bit (modes
, i
);
28739 /* Override the default target speculation_safe_value. */
28741 aarch64_speculation_safe_value (machine_mode mode
,
28742 rtx result
, rtx val
, rtx failval
)
28744 /* Maybe we should warn if falling back to hard barriers. They are
28745 likely to be noticably more expensive than the alternative below. */
28746 if (!aarch64_track_speculation
)
28747 return default_speculation_safe_value (mode
, result
, val
, failval
);
28750 val
= copy_to_mode_reg (mode
, val
);
28752 if (!aarch64_reg_or_zero (failval
, mode
))
28753 failval
= copy_to_mode_reg (mode
, failval
);
28755 emit_insn (gen_despeculate_copy (mode
, result
, val
, failval
));
/* Implement TARGET_ESTIMATED_POLY_VALUE.
   Look into the tuning structure for an estimate.
   KIND specifies the type of requested estimate: min, max or likely.
   For cores with a known SVE width all three estimates are the same.
   For generic SVE tuning we want to distinguish the maximum estimate from
   the minimum and likely ones.
   The likely estimate is the same as the minimum in that case to give a
   conservative behavior of auto-vectorizing with SVE when it is a win
   even for 128-bit SVE.
   When SVE width information is available VAL.coeffs[1] is multiplied by
   the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
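/* For example, a poly_int64 of 4 + 4x (such as the element count of a
   VNx4SI vector) is estimated as 4 + 4 * (256 - 128) / 128 == 8 when the
   tuned sve_width is 256 bits, while with SVE_SCALABLE tuning the
   minimum/likely estimate is 4 and the maximum is 4 + 4 * 15 == 64.  */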
28771 static HOST_WIDE_INT
28772 aarch64_estimated_poly_value (poly_int64 val
,
28773 poly_value_estimate_kind kind
28774 = POLY_VALUE_LIKELY
)
28776 unsigned int width_source
= aarch64_tune_params
.sve_width
;
28778 /* If there is no core-specific information then the minimum and likely
28779 values are based on 128-bit vectors and the maximum is based on
28780 the architectural maximum of 2048 bits. */
28781 if (width_source
== SVE_SCALABLE
)
28784 case POLY_VALUE_MIN
:
28785 case POLY_VALUE_LIKELY
:
28786 return val
.coeffs
[0];
28787 case POLY_VALUE_MAX
:
28788 return val
.coeffs
[0] + val
.coeffs
[1] * 15;
28791 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28792 as likely. This could be made more general if future -mtune options
28794 if (kind
== POLY_VALUE_MAX
)
28795 width_source
= 1 << floor_log2 (width_source
);
28797 width_source
= least_bit_hwi (width_source
);
28799 /* If the core provides width information, use that. */
28800 HOST_WIDE_INT over_128
= width_source
- 128;
28801 return val
.coeffs
[0] + val
.coeffs
[1] * over_128
/ 128;
28805 /* Return true for types that could be supported as SIMD return or
28809 supported_simd_type (tree t
)
28811 if (SCALAR_FLOAT_TYPE_P (t
) || INTEGRAL_TYPE_P (t
) || POINTER_TYPE_P (t
))
28813 HOST_WIDE_INT s
= tree_to_shwi (TYPE_SIZE_UNIT (t
));
28814 return s
== 1 || s
== 2 || s
== 4 || s
== 8;
28819 /* Determine the lane size for the clone argument/return type. This follows
28820 the LS(P) rule in the VFABIA64. */
28823 lane_size (cgraph_simd_clone_arg_type clone_arg_type
, tree type
)
28825 gcc_assert (clone_arg_type
!= SIMD_CLONE_ARG_TYPE_MASK
);
28827 /* For non map-to-vector types that are pointers we use the element type it
28829 if (POINTER_TYPE_P (type
))
28830 switch (clone_arg_type
)
28834 case SIMD_CLONE_ARG_TYPE_UNIFORM
:
28835 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP
:
28836 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP
:
28837 type
= TREE_TYPE (type
);
28841 /* For types (or pointers of non map-to-vector types point to) that are
28842 integers or floating point, we use their size if they are 1, 2, 4 or 8.
28844 if (INTEGRAL_TYPE_P (type
)
28845 || SCALAR_FLOAT_TYPE_P (type
))
28846 switch (TYPE_PRECISION (type
) / BITS_PER_UNIT
)
28854 return TYPE_PRECISION (type
);
28856 /* For any other we use the size of uintptr_t. For map-to-vector types that
28857 are pointers, using the size of uintptr_t is the same as using the size of
28858 their type, seeing all pointers are the same size as uintptr_t. */
28859 return POINTER_SIZE
;
28863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node
*node
,
28867 struct cgraph_simd_clone
*clonei
,
28868 tree base_type ATTRIBUTE_UNUSED
,
28869 int num
, bool explicit_p
)
28872 unsigned int nds_elt_bits
;
28873 unsigned HOST_WIDE_INT const_simdlen
;
28878 /* For now, SVE simdclones won't produce illegal simdlen, So only check
28879 const simdlens here. */
28880 if (maybe_ne (clonei
->simdlen
, 0U)
28881 && clonei
->simdlen
.is_constant (&const_simdlen
)
28882 && (const_simdlen
< 2
28883 || const_simdlen
> 1024
28884 || (const_simdlen
& (const_simdlen
- 1)) != 0))
28887 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28888 "unsupported simdlen %wd", const_simdlen
);
28892 ret_type
= TREE_TYPE (TREE_TYPE (node
->decl
));
28893 /* According to AArch64's Vector ABI the type that determines the simdlen is
28894 the narrowest of types, so we ignore base_type for AArch64. */
28895 if (TREE_CODE (ret_type
) != VOID_TYPE
28896 && !supported_simd_type (ret_type
))
28900 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28901 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28902 "GCC does not currently support return type %qT "
28903 "for simd", ret_type
);
28905 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28906 "unsupported return type %qT for simd",
28911 auto_vec
<std::pair
<tree
, unsigned int>> vec_elts (clonei
->nargs
+ 1);
28913 /* We are looking for the NDS type here according to the VFABIA64. */
28914 if (TREE_CODE (ret_type
) != VOID_TYPE
)
28916 nds_elt_bits
= lane_size (SIMD_CLONE_ARG_TYPE_VECTOR
, ret_type
);
28917 vec_elts
.safe_push (std::make_pair (ret_type
, nds_elt_bits
));
28920 nds_elt_bits
= POINTER_SIZE
;
28923 tree type_arg_types
= TYPE_ARG_TYPES (TREE_TYPE (node
->decl
));
28924 bool decl_arg_p
= (node
->definition
|| type_arg_types
== NULL_TREE
);
28925 for (t
= (decl_arg_p
? DECL_ARGUMENTS (node
->decl
) : type_arg_types
), i
= 0;
28926 t
&& t
!= void_list_node
; t
= TREE_CHAIN (t
), i
++)
28928 tree arg_type
= decl_arg_p
? TREE_TYPE (t
) : TREE_VALUE (t
);
28929 if (clonei
->args
[i
].arg_type
!= SIMD_CLONE_ARG_TYPE_UNIFORM
28930 && !supported_simd_type (arg_type
))
28934 else if (COMPLEX_FLOAT_TYPE_P (ret_type
))
28935 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28936 "GCC does not currently support argument type %qT "
28937 "for simd", arg_type
);
28939 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28940 "unsupported argument type %qT for simd",
28944 unsigned lane_bits
= lane_size (clonei
->args
[i
].arg_type
, arg_type
);
28945 if (clonei
->args
[i
].arg_type
== SIMD_CLONE_ARG_TYPE_VECTOR
)
28946 vec_elts
.safe_push (std::make_pair (arg_type
, lane_bits
));
28947 if (nds_elt_bits
> lane_bits
)
28948 nds_elt_bits
= lane_bits
;
28951 clonei
->vecsize_mangle
= 'n';
28952 clonei
->mask_mode
= VOIDmode
;
28953 poly_uint64 simdlen
;
28954 auto_vec
<poly_uint64
> simdlens (2);
28955 /* Keep track of the possible simdlens the clones of this function can have,
28956 and check them later to see if we support them. */
28957 if (known_eq (clonei
->simdlen
, 0U))
28959 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
28960 if (maybe_ne (simdlen
, 1U))
28961 simdlens
.safe_push (simdlen
);
28962 simdlens
.safe_push (simdlen
* 2);
28965 simdlens
.safe_push (clonei
->simdlen
);
28967 clonei
->vecsize_int
= 0;
28968 clonei
->vecsize_float
= 0;
28970 /* We currently do not support generating simdclones where vector arguments
28971 do not fit into a single vector register, i.e. vector types that are more
28972 than 128-bits large. This is because of how we currently represent such
28973 types in ACLE, where we use a struct to allow us to pass them as arguments
28975 Hence why we have to check whether the simdlens available for this
28976 simdclone would cause a vector type to be larger than 128-bits, and reject
28979 while (j
< simdlens
.length ())
28981 bool remove_simdlen
= false;
28982 for (auto elt
: vec_elts
)
28983 if (known_gt (simdlens
[j
] * elt
.second
, 128U))
28985 /* Don't issue a warning for every simdclone when there is no
28986 specific simdlen clause. */
28987 if (explicit_p
&& maybe_ne (clonei
->simdlen
, 0U))
28988 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
28989 "GCC does not currently support simdlen %wd for "
28991 constant_lower_bound (simdlens
[j
]), elt
.first
);
28992 remove_simdlen
= true;
28995 if (remove_simdlen
)
28996 simdlens
.ordered_remove (j
);
29002 int count
= simdlens
.length ();
29005 if (explicit_p
&& known_eq (clonei
->simdlen
, 0U))
29007 /* Warn the user if we can't generate any simdclone. */
29008 simdlen
= exact_div (poly_uint64 (64), nds_elt_bits
);
29009 warning_at (DECL_SOURCE_LOCATION (node
->decl
), 0,
29010 "GCC does not currently support a simdclone with simdlens"
29011 " %wd and %wd for these types.",
29012 constant_lower_bound (simdlen
),
29013 constant_lower_bound (simdlen
*2));
29018 gcc_assert (num
< count
);
29019 clonei
->simdlen
= simdlens
[num
];
29023 /* Implement TARGET_SIMD_CLONE_ADJUST. */
29026 aarch64_simd_clone_adjust (struct cgraph_node
*node
)
29028 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
29029 use the correct ABI. */
29031 tree t
= TREE_TYPE (node
->decl
);
29032 TYPE_ATTRIBUTES (t
) = make_attribute ("aarch64_vector_pcs", "default",
29033 TYPE_ATTRIBUTES (t
));
29036 /* Implement TARGET_SIMD_CLONE_USABLE. */
29039 aarch64_simd_clone_usable (struct cgraph_node
*node
)
29041 switch (node
->simdclone
->vecsize_mangle
)
29048 gcc_unreachable ();
29052 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
29055 aarch64_comp_type_attributes (const_tree type1
, const_tree type2
)
29057 auto check_attr
= [&](const char *ns
, const char *name
) {
29058 tree attr1
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type1
));
29059 tree attr2
= lookup_attribute (ns
, name
, TYPE_ATTRIBUTES (type2
));
29060 if (!attr1
&& !attr2
)
29063 return attr1
&& attr2
&& attribute_value_equal (attr1
, attr2
);
29066 if (!check_attr ("gnu", "aarch64_vector_pcs"))
29068 if (!check_attr ("gnu", "Advanced SIMD type"))
29070 if (!check_attr ("gnu", "SVE type"))
29072 if (!check_attr ("gnu", "SVE sizeless type"))
29074 if (!check_attr ("arm", "streaming"))
29076 if (!check_attr ("arm", "streaming_compatible"))
29078 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "za")
29079 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "za"))
29081 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1
), "zt0")
29082 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2
), "zt0"))
29087 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
29090 aarch64_merge_decl_attributes (tree olddecl
, tree newdecl
)
29092 tree old_attrs
= DECL_ATTRIBUTES (olddecl
);
29093 tree old_new
= lookup_attribute ("arm", "new", old_attrs
);
29095 tree new_attrs
= DECL_ATTRIBUTES (newdecl
);
29096 tree new_new
= lookup_attribute ("arm", "new", new_attrs
);
29098 if (DECL_INITIAL (olddecl
) && new_new
)
29100 error ("cannot apply attribute %qs to %q+D after the function"
29101 " has been defined", "new", newdecl
);
29102 inform (DECL_SOURCE_LOCATION (olddecl
), "%q+D defined here",
29107 if (old_new
&& new_new
)
29109 old_attrs
= remove_attribute ("arm", "new", old_attrs
);
29110 TREE_VALUE (new_new
) = chainon (TREE_VALUE (new_new
),
29111 TREE_VALUE (old_new
));
29114 aarch64_check_arm_new_against_type (TREE_VALUE (new_new
), newdecl
);
29117 return merge_attributes (old_attrs
, new_attrs
);
/* Implement TARGET_GET_MULTILIB_ABI_NAME */

static const char *
aarch64_get_multilib_abi_name (void)
{
  if (TARGET_BIG_END)
    return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
  return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
}
29130 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
29131 global variable based guard use the default else
29132 return a null tree. */
29134 aarch64_stack_protect_guard (void)
29136 if (aarch64_stack_protector_guard
== SSP_GLOBAL
)
29137 return default_stack_protect_guard ();
/* Implement TARGET_INVALID_UNARY_OP.  */

static const char *
aarch64_invalid_unary_op (int op, const_tree type)
{
  /* Reject all single-operand operations on __mfp8 except for &.  */
  if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR)
    return N_ ("operation not permitted on type %<mfloat8_t%>");

  /* Operation allowed.  */
  return NULL;
}
29155 /* Implement TARGET_INVALID_BINARY_OP. */
29157 static const char *
29158 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED
, const_tree type1
,
29161 if (VECTOR_TYPE_P (type1
)
29162 && VECTOR_TYPE_P (type2
)
29163 && !TYPE_INDIVISIBLE_P (type1
)
29164 && !TYPE_INDIVISIBLE_P (type2
)
29165 && (aarch64_sve::builtin_type_p (type1
)
29166 != aarch64_sve::builtin_type_p (type2
)))
29167 return N_("cannot combine GNU and SVE vectors in a binary operation");
29169 /* Reject all 2-operand operations on __mfp8. */
29170 if (TYPE_MAIN_VARIANT (type1
) == aarch64_mfp8_type_node
29171 || TYPE_MAIN_VARIANT (type2
) == aarch64_mfp8_type_node
)
29172 return N_ ("operation not permitted on type %<mfloat8_t%>");
29174 /* Operation allowed. */
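/* Hedged user-level illustration (not compiled here) of the binary-op check
   above: mixing an SVE ACLE vector with a GNU/Advanced SIMD vector, for
   example adding an svint32_t to an int32x4_t, is diagnosed with "cannot
   combine GNU and SVE vectors in a binary operation" rather than being
   treated as an ordinary vector operation.  */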
/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
   compiler that we automatically ignore the top byte of our pointers, which
   allows using -fsanitize=hwaddress.  */
static bool
aarch64_can_tag_addresses ()
{
  return !TARGET_ILP32;
}

/* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
   section at the end if needed.  */
#define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
void
aarch64_file_end_indicate_exec_stack ()
{
  file_end_indicate_exec_stack ();

  unsigned feature_1_and = 0;
  if (aarch_bti_enabled ())
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;

  if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
    feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;

  if (feature_1_and)
    {
      /* Generate .note.gnu.property section.  */
      switch_to_section (get_section (".note.gnu.property",
				      SECTION_NOTYPE, NULL));

      /* PT_NOTE header: namesz, descsz, type.
	 namesz = 4 ("GNU\0")
	 descsz = 16 (Size of the program property array)
		  [(12 + padding) * Number of array elements]
	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
      assemble_align (POINTER_SIZE);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
      assemble_integer (GEN_INT (5), 4, 32, 1);

      /* PT_NOTE name.  */
      assemble_string ("GNU", 4);

      /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
	 datasz = 4
	 data   = feature_1_and.  */
      assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
      assemble_integer (GEN_INT (4), 4, 32, 1);
      assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);

      /* Pad the size of the note to the required alignment.  */
      assemble_align (POINTER_SIZE);
    }
}
#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
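/* An illustrative sketch (not used by the port) of the note emitted above,
   assuming LP64 so that POINTER_BYTES == 8 and the 12-byte property is
   padded to 16 bytes.  The fields mirror the assemble_integer and
   assemble_string calls in aarch64_file_end_indicate_exec_stack.  */
struct aarch64_gnu_property_note_sketch
{
  unsigned int namesz;		/* 4: sizeof "GNU".  */
  unsigned int descsz;		/* 16: ROUND_UP (12, POINTER_BYTES).  */
  unsigned int type;		/* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];			/* "GNU\0".  */
  unsigned int pr_type;		/* GNU_PROPERTY_AARCH64_FEATURE_1_AND.  */
  unsigned int pr_datasz;	/* 4.  */
  unsigned int pr_data;		/* feature_1_and (BTI and/or PAC bits).  */
  unsigned int pr_pad;		/* Padding up to 8-byte alignment.  */
};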
/* Helper function for straight line speculation.
   Return what barrier should be emitted for straight line speculation
   mitigation.
   When not mitigating against straight line speculation this function returns
   an empty string.
   When mitigating against straight line speculation, use:
   * SB when the v8.5-A SB extension is enabled.
   * DSB+ISB otherwise.  */
const char *
aarch64_sls_barrier (int mitigation_required)
{
  return mitigation_required
    ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
    : "";
}

static GTY (()) tree aarch64_sls_shared_thunks[30];
static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
const char *indirect_symbol_names[30] = {
    "__call_indirect_x0",
    "__call_indirect_x1",
    "__call_indirect_x2",
    "__call_indirect_x3",
    "__call_indirect_x4",
    "__call_indirect_x5",
    "__call_indirect_x6",
    "__call_indirect_x7",
    "__call_indirect_x8",
    "__call_indirect_x9",
    "__call_indirect_x10",
    "__call_indirect_x11",
    "__call_indirect_x12",
    "__call_indirect_x13",
    "__call_indirect_x14",
    "__call_indirect_x15",
    "", /* "__call_indirect_x16", */
    "", /* "__call_indirect_x17", */
    "__call_indirect_x18",
    "__call_indirect_x19",
    "__call_indirect_x20",
    "__call_indirect_x21",
    "__call_indirect_x22",
    "__call_indirect_x23",
    "__call_indirect_x24",
    "__call_indirect_x25",
    "__call_indirect_x26",
    "__call_indirect_x27",
    "__call_indirect_x28",
    "__call_indirect_x29",
};
/* Function to create a BLR thunk.  This thunk is used to mitigate straight
   line speculation.  Instead of a simple BLR that can be speculated past,
   we emit a BL to this thunk, and this thunk contains a BR to the relevant
   register.  These thunks have the relevant speculation barriers put after
   their indirect branch so that speculation is blocked.

   We use such a thunk so the speculation barriers are kept off the
   architecturally executed path in order to reduce the performance overhead.

   When optimizing for size we use stubs shared by the linked object.
   When optimizing for performance we emit stubs for each function in the hope
   that the branch predictor can better train on jumps specific for a given
   function.  */
rtx
aarch64_sls_create_blr_label (int regnum)
{
  gcc_assert (STUB_REGNUM_P (regnum));
  if (optimize_function_for_size_p (cfun))
    {
      /* For the thunks shared between different functions in this compilation
	 unit we use a named symbol -- this is just for users to more easily
	 understand the generated assembly.  */
      aarch64_sls_shared_thunks_needed = true;
      const char *thunk_name = indirect_symbol_names[regnum];
      if (aarch64_sls_shared_thunks[regnum] == NULL)
	{
	  /* Build a decl representing this function stub and record it for
	     later.  We build a decl here so we can use the GCC machinery for
	     handling sections automatically (through `get_named_section` and
	     `make_decl_one_only`).  That saves us a lot of trouble handling
	     the specifics of different output file formats.  */
	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
				  get_identifier (thunk_name),
				  build_function_type_list (void_type_node,
							    NULL_TREE));
	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
					   NULL_TREE, void_type_node);
	  TREE_PUBLIC (decl) = 1;
	  TREE_STATIC (decl) = 1;
	  DECL_IGNORED_P (decl) = 1;
	  DECL_ARTIFICIAL (decl) = 1;
	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
	  resolve_unique_section (decl, 0, false);
	  aarch64_sls_shared_thunks[regnum] = decl;
	}

      return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
    }

  if (cfun->machine->call_via[regnum] == NULL)
    cfun->machine->call_via[regnum]
      = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
  return cfun->machine->call_via[regnum];
}
/* Helper function for aarch64_sls_emit_blr_function_thunks and
   aarch64_sls_emit_shared_blr_thunks below.  */
static void
aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
{
  /* Save in x16 and branch to that function so this transformation does
     not prevent jumping to `BTI c` instructions.  */
  asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
  asm_fprintf (out_file, "\tbr\tx16\n");
}

/* Emit all BLR stubs for this particular function.
   Here we emit all the BLR stubs needed for the current function.  Since we
   emit these stubs in a consecutive block we know there will be no speculation
   gadgets between each stub, and hence we only emit a speculation barrier at
   the end of the stub sequences.

   This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
void
aarch64_sls_emit_blr_function_thunks (FILE *out_file)
{
  if (! aarch64_harden_sls_blr_p ())
    return;

  bool any_functions_emitted = false;
  /* We must save and restore the current function section since this assembly
     is emitted at the end of the function.  This means it can be emitted *just
     after* the cold section of a function.  That cold part would be emitted in
     a different section.  That switch would trigger a `.cfi_endproc` directive
     to be emitted in the original section and a `.cfi_startproc` directive to
     be emitted in the new section.  Switching to the original section without
     restoring would mean that the `.cfi_endproc` emitted as a function ends
     would happen in a different section -- leaving an unmatched
     `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
     in the standard text section.  */
  section *save_text_section = in_section;
  switch_to_section (function_section (current_function_decl));
  for (int regnum = 0; regnum < 30; ++regnum)
    {
      rtx specu_label = cfun->machine->call_via[regnum];
      if (specu_label == NULL)
	continue;

      targetm.asm_out.print_operand (out_file, specu_label, 0);
      asm_fprintf (out_file, ":\n");
      aarch64_sls_emit_function_stub (out_file, regnum);
      any_functions_emitted = true;
    }
  if (any_functions_emitted)
    /* Can use the SB if needs be here, since this stub will only be used
       by the current function, and hence for the current target.  */
    asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
  switch_to_section (save_text_section);
}
/* Emit shared BLR stubs for the current compilation unit.
   Over the course of compiling this unit we may have converted some BLR
   instructions to a BL to a shared stub function.  This is where we emit those
   stub functions.
   This function is for the stubs shared between different functions in this
   compilation unit.  We share when optimizing for size instead of speed.

   This function is called through the TARGET_ASM_FILE_END hook.  */
void
aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
{
  if (! aarch64_sls_shared_thunks_needed)
    return;

  for (int regnum = 0; regnum < 30; ++regnum)
    {
      tree decl = aarch64_sls_shared_thunks[regnum];
      if (!decl)
	continue;

      const char *name = indirect_symbol_names[regnum];
      switch_to_section (get_named_section (decl, NULL, 0));
      ASM_OUTPUT_ALIGN (out_file, 2);
      targetm.asm_out.globalize_label (out_file, name);
      /* Only emits if the compiler is configured for an assembler that can
	 handle visibility directives.  */
      targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
      ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
      ASM_OUTPUT_LABEL (out_file, name);
      aarch64_sls_emit_function_stub (out_file, regnum);
      /* Use the most conservative target to ensure it can always be used by any
	 function in the translation unit.  */
      asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
      ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
    }
}

/* Implement TARGET_ASM_FILE_END.  */
static void
aarch64_asm_file_end ()
{
  aarch64_sls_emit_shared_blr_thunks (asm_out_file);
  /* Since this function will be called for the ASM_FILE_END hook, we ensure
     that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
     for FreeBSD) still gets called.  */
#ifdef TARGET_ASM_FILE_END
  TARGET_ASM_FILE_END ();
#endif
}
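/* Purely illustrative (not used by the compiler): the text that the SLS
   hardening above produces for the shared stub on x1, assuming
   -mharden-sls=blr and no SB extension, so the conservative DSB+ISB barrier
   is used.  */
static const char *const aarch64_sls_example_stub_x1 ATTRIBUTE_UNUSED =
  "__call_indirect_x1:\n"
  "\tmov\tx16, x1\n"
  "\tbr\tx16\n"
  "\tdsb\tsy\n"
  "\tisb\n";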
const char *
aarch64_indirect_call_asm (rtx addr)
{
  gcc_assert (REG_P (addr));
  if (aarch64_harden_sls_blr_p ())
    {
      rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
      output_asm_insn ("bl\t%0", &stub_label);
    }
  else
    output_asm_insn ("blr\t%0", &addr);
  return "";
}

/* Emit the assembly instruction to load the thread pointer into DEST.
   Select between different tpidr_elN registers depending on -mtp= setting.  */

const char *
aarch64_output_load_tp (rtx dest)
{
  const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
			  "tpidr_el3", "tpidrro_el0"};
  static char buffer[64];
  snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
	    tpidrs[aarch64_tpidr_register]);
  output_asm_insn (buffer, &dest);
  return "";
}
/* Set up the value of REG_ALLOC_ORDER from scratch.

   It was previously good practice to put call-clobbered registers ahead
   of call-preserved registers, but that isn't necessary these days.
   IRA's model of register save/restore costs is much more sophisticated
   than the model that a simple ordering could provide.  We leave
   HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
   of IRA's model.

   However, it is still useful to list registers that are members of
   multiple classes after registers that are members of fewer classes.
   For example, we have:

   - FP_LO8_REGS: v0-v7
   - FP_LO_REGS: v0-v15
   - FP_REGS: v0-v31

   If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
   we run the risk of starving other (lower-priority) pseudos that
   require FP_LO8_REGS or FP_LO_REGS.  Allocating FP_LO_REGS in the
   order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
   Allocating downwards rather than upwards avoids this problem, at least
   in code that has reasonable register pressure.

   The situation for predicate registers is similar.  */

void
aarch64_adjust_reg_alloc_order ()
{
  for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
    if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
      reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
    else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
      reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
    else
      reg_alloc_order[i] = i;
}
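/* A minimal self-check sketch of the downward ordering installed above; it is
   not wired into the build or the selftests, and the function name is
   invented for illustration.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_check_reg_alloc_order ()
{
  aarch64_adjust_reg_alloc_order ();
  /* FP registers are tried from v31 down to v0...  */
  gcc_checking_assert (reg_alloc_order[V0_REGNUM] == V31_REGNUM);
  gcc_checking_assert (reg_alloc_order[V31_REGNUM] == V0_REGNUM);
  /* ...and predicate registers from p15 down to p0.  */
  gcc_checking_assert (reg_alloc_order[P0_REGNUM] == P15_REGNUM);
  gcc_checking_assert (reg_alloc_order[P15_REGNUM] == P0_REGNUM);
}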
/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
   of vector mode MODE to select half the elements of that vector.
   Allow any combination of indices except duplicates (or out of range of
   the mode units).  */

bool
aarch64_parallel_select_half_p (machine_mode mode, rtx par)
{
  int nunits = XVECLEN (par, 0);
  if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
    return false;
  int mode_nunits = nunits * 2;
  /* Put all the elements of PAR into a hash_set and use its
     uniqueness guarantees to check that we don't try to insert the same
     element twice.  */
  hash_set<rtx> parset;
  for (int i = 0; i < nunits; ++i)
    {
      rtx elt = XVECEXP (par, 0, i);
      if (!CONST_INT_P (elt)
	  || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
	  || parset.add (elt))
	return false;
    }
  return true;
}

/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
   contain any common elements.  */

bool
aarch64_pars_overlap_p (rtx par1, rtx par2)
{
  int len1 = XVECLEN (par1, 0);
  int len2 = XVECLEN (par2, 0);
  hash_set<rtx> parset;
  for (int i = 0; i < len1; ++i)
    parset.add (XVECEXP (par1, 0, i));
  for (int i = 0; i < len2; ++i)
    if (parset.contains (XVECEXP (par2, 0, i)))
      return true;
  return false;
}
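/* Illustration only (not part of the port or its selftests): for V4SI, a
   PARALLEL selecting elements 3 and 0 satisfies
   aarch64_parallel_select_half_p, whereas one that repeats element 0 does
   not.  The helper name is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_parallel_select_half ()
{
  rtx ok = gen_rtx_PARALLEL (VOIDmode,
			     gen_rtvec (2, GEN_INT (3), GEN_INT (0)));
  rtx dup = gen_rtx_PARALLEL (VOIDmode,
			      gen_rtvec (2, GEN_INT (0), GEN_INT (0)));
  gcc_checking_assert (aarch64_parallel_select_half_p (V4SImode, ok));
  gcc_checking_assert (!aarch64_parallel_select_half_p (V4SImode, dup));
}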
/* Implement OPTIMIZE_MODE_SWITCHING.  */

bool
aarch64_optimize_mode_switching (aarch64_mode_entity entity)
{
  bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
			 || (aarch64_cfun_has_new_state ("za")
			     && df_regs_ever_live_p (ZA_REGNUM))
			 || (aarch64_cfun_has_new_state ("zt0")
			     && df_regs_ever_live_p (ZT0_REGNUM)));

  if (have_sme_state && nonlocal_goto_handler_labels)
    {
      static bool reported;
      if (!reported)
	{
	  sorry ("non-local gotos in functions with SME state");
	  reported = true;
	}
    }

  switch (entity)
    {
    case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
    case aarch64_mode_entity::LOCAL_SME_STATE:
      return have_sme_state && !nonlocal_goto_handler_labels;
    }
  gcc_unreachable ();
}
29591 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29594 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode
,
29595 aarch64_tristate_mode prev_mode
)
29597 if (mode
== aarch64_tristate_mode::YES
)
29599 gcc_assert (prev_mode
== aarch64_tristate_mode::NO
);
29600 aarch64_init_tpidr2_block ();
29603 gcc_unreachable ();
29606 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29609 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode
,
29610 aarch64_local_sme_state prev_mode
)
29612 /* Back-propagation should ensure that we're always starting from
29614 gcc_assert (prev_mode
!= aarch64_local_sme_state::ANY
);
29616 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29618 /* Commit any uncommitted lazy save. This leaves ZA either active
29619 and zero (lazy save case) or off (normal case).
29623 mrs <temp>, tpidr2_el0
29624 cbz <temp>, no_save
29625 bl __arm_tpidr2_save
29626 msr tpidr2_el0, xzr
29627 zero { za } // Only if ZA is live
29628 zero { zt0 } // Only if ZT0 is live
29630 auto tmp_reg
= gen_reg_rtx (DImode
);
29631 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg
));
29632 auto label
= gen_label_rtx ();
29633 rtx branch
= aarch64_gen_compare_zero_and_branch (EQ
, tmp_reg
, label
);
29634 auto jump
= emit_jump_insn (branch
);
29635 JUMP_LABEL (jump
) = label
;
29636 emit_insn (gen_aarch64_tpidr2_save ());
29637 emit_insn (gen_aarch64_clear_tpidr2 ());
29638 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29639 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29641 if (aarch64_cfun_has_state ("za"))
29642 emit_insn (gen_aarch64_initial_zero_za ());
29643 if (aarch64_cfun_has_state ("zt0"))
29644 emit_insn (gen_aarch64_sme_zero_zt0 ());
29646 emit_label (label
);
29649 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29650 || mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29652 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29654 /* Make ZA active after being inactive.
29656 First handle the case in which the lazy save we set up was
29657 committed by a callee. If the function's source-level ZA state
29658 is live then we must conditionally restore it from the lazy
29659 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29660 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29661 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29663 emit_insn (gen_aarch64_smstart_za ());
29665 /* Now handle the case in which the lazy save was not committed.
29666 In that case, ZA still contains the current function's ZA state,
29667 and we just need to cancel the lazy save. */
29668 emit_insn (gen_aarch64_clear_tpidr2 ());
29670 /* Restore the ZT0 state, if we have some. */
29671 if (aarch64_cfun_has_state ("zt0"))
29672 aarch64_restore_zt0 (true);
29677 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29679 /* Retrieve the current function's ZA state from the lazy save
29681 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29683 /* Restore the ZT0 state, if we have some. */
29684 if (aarch64_cfun_has_state ("zt0"))
29685 aarch64_restore_zt0 (true);
29689 if (prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
29690 || prev_mode
== aarch64_local_sme_state::OFF
)
29692 /* INACTIVE_CALLER means that we are enabling ZA for the first
29693 time in this function. The code above means that ZA is either
29694 active and zero (if we committed a lazy save) or off. Handle
29695 the latter case by forcing ZA on.
29697 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29700 Both cases leave ZA zeroed. */
29701 emit_insn (gen_aarch64_smstart_za ());
29703 /* Restore the ZT0 state, if we have some. */
29704 if (prev_mode
== aarch64_local_sme_state::OFF
29705 && aarch64_cfun_has_state ("zt0"))
29706 aarch64_restore_zt0 (true);
29710 if (prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29711 || prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
)
29712 /* A simple change in liveness, such as in a CFG structure where
29713 ZA is only conditionally defined. No code is needed. */
29716 gcc_unreachable ();
29719 if (mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29721 if (prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29722 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
29723 || prev_mode
== aarch64_local_sme_state::INACTIVE_CALLER
)
29725 /* Save the ZT0 state, if we have some. */
29726 if (aarch64_cfun_has_state ("zt0"))
29727 aarch64_save_zt0 ();
29729 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29730 case of setting up a lazy save buffer before a call.
29731 A transition from INACTIVE_CALLER is similar, except that
29732 the contents of ZA are known to be zero.
29734 A transition from ACTIVE_DEAD means that ZA is live at the
29735 point of the transition, but is dead on at least one incoming
29736 edge. (That is, ZA is only conditionally initialized.)
29737 For efficiency, we want to set up a lazy save even for
29738 dead contents, since forcing ZA off would make later code
29739 restore ZA from the lazy save buffer. */
29740 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29744 if (prev_mode
== aarch64_local_sme_state::SAVED_LOCAL
29745 || prev_mode
== aarch64_local_sme_state::OFF
)
29746 /* We're simply discarding the information about which inactive
29750 gcc_unreachable ();
29753 if (mode
== aarch64_local_sme_state::INACTIVE_CALLER
29754 || mode
== aarch64_local_sme_state::OFF
)
29756 /* Save the ZT0 state, if we have some. */
29757 if ((prev_mode
== aarch64_local_sme_state::ACTIVE_LIVE
29758 || prev_mode
== aarch64_local_sme_state::ACTIVE_DEAD
)
29759 && mode
== aarch64_local_sme_state::OFF
29760 && aarch64_cfun_has_state ("zt0"))
29761 aarch64_save_zt0 ();
29763 /* The transition to INACTIVE_CALLER is used before returning from
29764 new("za") functions. Any state in ZA belongs to the current
29765 function rather than a caller, but that state is no longer
29766 needed. Clear any pending lazy save and turn ZA off.
29768 The transition to OFF is used before calling a private-ZA function.
29769 We committed any incoming lazy save above, so at this point any
29770 contents in ZA belong to the current function. */
29771 if (prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
)
29772 emit_insn (gen_aarch64_clear_tpidr2 ());
29774 if (prev_mode
!= aarch64_local_sme_state::OFF
29775 && prev_mode
!= aarch64_local_sme_state::SAVED_LOCAL
)
29776 emit_insn (gen_aarch64_smstop_za ());
29781 if (mode
== aarch64_local_sme_state::SAVED_LOCAL
)
29783 /* This is a transition to an exception handler. */
29784 gcc_assert (prev_mode
== aarch64_local_sme_state::OFF
29785 || prev_mode
== aarch64_local_sme_state::INACTIVE_LOCAL
);
29789 gcc_unreachable ();
29792 /* Implement TARGET_MODE_EMIT. */
29795 aarch64_mode_emit (int entity
, int mode
, int prev_mode
, HARD_REG_SET live
)
29797 if (mode
== prev_mode
)
29801 switch (aarch64_mode_entity (entity
))
29803 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29804 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode
),
29805 aarch64_tristate_mode (prev_mode
));
29808 case aarch64_mode_entity::LOCAL_SME_STATE
:
29809 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode
),
29810 aarch64_local_sme_state (prev_mode
));
29813 rtx_insn
*seq
= get_insns ();
29816 /* Get the set of clobbered registers that are currently live. */
29817 HARD_REG_SET clobbers
= {};
29818 for (rtx_insn
*insn
= seq
; insn
; insn
= NEXT_INSN (insn
))
29820 if (!NONDEBUG_INSN_P (insn
))
29822 vec_rtx_properties properties
;
29823 properties
.add_insn (insn
, false);
29824 for (rtx_obj_reference ref
: properties
.refs ())
29825 if (ref
.is_write () && HARD_REGISTER_NUM_P (ref
.regno
))
29826 SET_HARD_REG_BIT (clobbers
, ref
.regno
);
  /* Emit instructions to save clobbered registers to pseudos.  Queue
     instructions to restore the registers afterwards.

     This should only be needed in rare situations.  */
29834 auto_vec
<rtx
, 33> after
;
29835 for (unsigned int regno
= R0_REGNUM
; regno
< R30_REGNUM
; ++regno
)
29836 if (TEST_HARD_REG_BIT (clobbers
, regno
))
29838 rtx hard_reg
= gen_rtx_REG (DImode
, regno
);
29839 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29840 emit_move_insn (pseudo_reg
, hard_reg
);
29841 after
.quick_push (gen_move_insn (hard_reg
, pseudo_reg
));
29843 if (TEST_HARD_REG_BIT (clobbers
, CC_REGNUM
))
29845 rtx pseudo_reg
= gen_reg_rtx (DImode
);
29846 emit_insn (gen_aarch64_save_nzcv (pseudo_reg
));
29847 after
.quick_push (gen_aarch64_restore_nzcv (pseudo_reg
));
29850 /* Emit the transition instructions themselves. */
29853 /* Restore the clobbered registers. */
29854 for (auto *insn
: after
)
29858 /* Return true if INSN references the SME state represented by hard register
29862 aarch64_insn_references_sme_state_p (rtx_insn
*insn
, unsigned int regno
)
29865 FOR_EACH_INSN_DEF (ref
, insn
)
29866 if (!DF_REF_FLAGS_IS_SET (ref
, DF_REF_MUST_CLOBBER
)
29867 && DF_REF_REGNO (ref
) == regno
)
29869 FOR_EACH_INSN_USE (ref
, insn
)
29870 if (DF_REF_REGNO (ref
) == regno
)
29875 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29877 static aarch64_local_sme_state
29878 aarch64_mode_needed_local_sme_state (rtx_insn
*insn
, HARD_REG_SET live
)
29881 && find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29883 static bool reported
;
29886 sorry ("catching non-call exceptions in functions with SME state");
29889 /* Aim for graceful error recovery by picking the value that is
29890 least likely to generate an ICE. */
29891 return aarch64_local_sme_state::INACTIVE_LOCAL
;
29894 /* A non-local goto is equivalent to a return. We disallow non-local
29895 receivers in functions with SME state, so we know that the target
29896 expects ZA to be dormant or off. */
29898 && find_reg_note (insn
, REG_NON_LOCAL_GOTO
, NULL_RTX
))
29899 return aarch64_local_sme_state::INACTIVE_CALLER
;
29901 /* start_private_za_call and end_private_za_call bracket a sequence
29902 that calls a private-ZA function. Force ZA to be turned off if the
29903 function doesn't have any live ZA state, otherwise require ZA to be
29905 auto icode
= recog_memoized (insn
);
29906 if (icode
== CODE_FOR_aarch64_start_private_za_call
29907 || icode
== CODE_FOR_aarch64_end_private_za_call
)
29908 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29909 ? aarch64_local_sme_state::INACTIVE_LOCAL
29910 : aarch64_local_sme_state::OFF
);
29912 /* Force ZA to contain the current function's ZA state if INSN wants
29913 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29914 are both controlled by PSTATE.ZA. */
29915 if (aarch64_insn_references_sme_state_p (insn
, ZA_REGNUM
)
29916 || aarch64_insn_references_sme_state_p (insn
, ZT0_REGNUM
))
29917 return (TEST_HARD_REG_BIT (live
, ZA_REGNUM
)
29918 ? aarch64_local_sme_state::ACTIVE_LIVE
29919 : aarch64_local_sme_state::ACTIVE_DEAD
);
29921 return aarch64_local_sme_state::ANY
;
29924 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29926 static aarch64_tristate_mode
29927 aarch64_mode_needed_za_save_buffer (rtx_insn
*insn
, HARD_REG_SET live
)
29929 /* We need to set up a lazy save buffer no later than the first
29930 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29931 if (aarch64_mode_needed_local_sme_state (insn
, live
)
29932 == aarch64_local_sme_state::INACTIVE_LOCAL
)
29933 return aarch64_tristate_mode::YES
;
29935 /* Also make sure that the lazy save buffer is set up before the first
29936 insn that throws internally. The exception handler will sometimes
29938 if (find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
))
29939 return aarch64_tristate_mode::YES
;
29941 return aarch64_tristate_mode::MAYBE
;
29944 /* Implement TARGET_MODE_NEEDED. */
29947 aarch64_mode_needed (int entity
, rtx_insn
*insn
, HARD_REG_SET live
)
29949 switch (aarch64_mode_entity (entity
))
29951 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29952 return int (aarch64_mode_needed_za_save_buffer (insn
, live
));
29954 case aarch64_mode_entity::LOCAL_SME_STATE
:
29955 return int (aarch64_mode_needed_local_sme_state (insn
, live
));
29957 gcc_unreachable ();
29960 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29962 static aarch64_local_sme_state
29963 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode
,
29966 /* Note places where ZA dies, so that we can try to avoid saving and
29967 restoring state that isn't needed. */
29968 if (mode
== aarch64_local_sme_state::ACTIVE_LIVE
29969 && !TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29970 return aarch64_local_sme_state::ACTIVE_DEAD
;
29972 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29974 if (mode
== aarch64_local_sme_state::ACTIVE_DEAD
29975 && TEST_HARD_REG_BIT (live
, ZA_REGNUM
))
29976 return aarch64_local_sme_state::ACTIVE_LIVE
;
29981 /* Implement TARGET_MODE_AFTER. */
29984 aarch64_mode_after (int entity
, int mode
, rtx_insn
*, HARD_REG_SET live
)
29986 switch (aarch64_mode_entity (entity
))
29988 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
29991 case aarch64_mode_entity::LOCAL_SME_STATE
:
29992 return int (aarch64_mode_after_local_sme_state
29993 (aarch64_local_sme_state (mode
), live
));
29995 gcc_unreachable ();
29998 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
30000 static aarch64_local_sme_state
30001 aarch64_local_sme_confluence (aarch64_local_sme_state mode1
,
30002 aarch64_local_sme_state mode2
)
30004 /* Perform a symmetrical check for two values. */
30005 auto is_pair
= [&](aarch64_local_sme_state val1
,
30006 aarch64_local_sme_state val2
)
30008 return ((mode1
== val1
&& mode2
== val2
)
30009 || (mode1
== val2
&& mode2
== val1
));
30012 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
30013 to a caller. OFF is one of the options. */
30014 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER
,
30015 aarch64_local_sme_state::OFF
))
30016 return aarch64_local_sme_state::INACTIVE_CALLER
;
30018 /* Similarly for dormant contents belonging to the current function. */
30019 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL
,
30020 aarch64_local_sme_state::OFF
))
30021 return aarch64_local_sme_state::INACTIVE_LOCAL
;
30023 /* Treat a conditionally-initialized value as a fully-initialized value. */
30024 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE
,
30025 aarch64_local_sme_state::ACTIVE_DEAD
))
30026 return aarch64_local_sme_state::ACTIVE_LIVE
;
30028 return aarch64_local_sme_state::ANY
;
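/* Illustration only (not part of the selftests): the confluence above is
   symmetric and conservative.  Merging INACTIVE_LOCAL with OFF in either
   order gives INACTIVE_LOCAL, ACTIVE_LIVE merged with ACTIVE_DEAD gives
   ACTIVE_LIVE, and any other disagreement degrades to ANY.  The helper name
   below is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_local_sme_confluence ()
{
  using S = aarch64_local_sme_state;
  gcc_checking_assert (aarch64_local_sme_confluence (S::INACTIVE_LOCAL, S::OFF)
		       == S::INACTIVE_LOCAL);
  gcc_checking_assert (aarch64_local_sme_confluence (S::OFF, S::INACTIVE_LOCAL)
		       == S::INACTIVE_LOCAL);
  gcc_checking_assert (aarch64_local_sme_confluence (S::ACTIVE_LIVE,
						     S::ACTIVE_DEAD)
		       == S::ACTIVE_LIVE);
  gcc_checking_assert (aarch64_local_sme_confluence (S::ACTIVE_LIVE, S::OFF)
		       == S::ANY);
}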
30031 /* Implement TARGET_MODE_CONFLUENCE. */
30034 aarch64_mode_confluence (int entity
, int mode1
, int mode2
)
30036 gcc_assert (mode1
!= mode2
);
30037 switch (aarch64_mode_entity (entity
))
30039 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30040 return int (aarch64_tristate_mode::MAYBE
);
30042 case aarch64_mode_entity::LOCAL_SME_STATE
:
30043 return int (aarch64_local_sme_confluence
30044 (aarch64_local_sme_state (mode1
),
30045 aarch64_local_sme_state (mode2
)));
30047 gcc_unreachable ();
/* Implement TARGET_MODE_BACKPROP for an entity that either stays
   NO throughout, or makes one transition from NO to YES.  */
30053 static aarch64_tristate_mode
30054 aarch64_one_shot_backprop (aarch64_tristate_mode mode1
,
30055 aarch64_tristate_mode mode2
)
30057 /* Keep bringing the transition forward until it starts from NO. */
30058 if (mode1
== aarch64_tristate_mode::MAYBE
30059 && mode2
== aarch64_tristate_mode::YES
)
30062 return aarch64_tristate_mode::MAYBE
;
30065 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
30067 static aarch64_local_sme_state
30068 aarch64_local_sme_backprop (aarch64_local_sme_state mode1
,
30069 aarch64_local_sme_state mode2
)
30071 /* We always need to know what the current state is when transitioning
30072 to a new state. Force any location with indeterminate starting state
30074 if (mode1
== aarch64_local_sme_state::ANY
)
30077 case aarch64_local_sme_state::INACTIVE_CALLER
:
30078 case aarch64_local_sme_state::OFF
:
30079 case aarch64_local_sme_state::ACTIVE_DEAD
:
30080 /* The current function's ZA state is not live. */
30081 return aarch64_local_sme_state::ACTIVE_DEAD
;
30083 case aarch64_local_sme_state::INACTIVE_LOCAL
:
30084 case aarch64_local_sme_state::ACTIVE_LIVE
:
30085 /* The current function's ZA state is live. */
30086 return aarch64_local_sme_state::ACTIVE_LIVE
;
30088 case aarch64_local_sme_state::SAVED_LOCAL
:
30089 /* This is a transition to an exception handler. Since we don't
30090 support non-call exceptions for SME functions, the source of
30091 the transition must be known. We'll assert later if that's
30093 return aarch64_local_sme_state::ANY
;
30095 case aarch64_local_sme_state::ANY
:
30096 return aarch64_local_sme_state::ANY
;
30099 return aarch64_local_sme_state::ANY
;
30102 /* Implement TARGET_MODE_BACKPROP. */
30105 aarch64_mode_backprop (int entity
, int mode1
, int mode2
)
30107 switch (aarch64_mode_entity (entity
))
30109 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30110 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1
),
30111 aarch64_tristate_mode (mode2
)));
30113 case aarch64_mode_entity::LOCAL_SME_STATE
:
30114 return int (aarch64_local_sme_backprop
30115 (aarch64_local_sme_state (mode1
),
30116 aarch64_local_sme_state (mode2
)));
30118 gcc_unreachable ();
30121 /* Implement TARGET_MODE_ENTRY. */
30124 aarch64_mode_entry (int entity
)
30126 switch (aarch64_mode_entity (entity
))
30128 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30129 return int (aarch64_tristate_mode::NO
);
30131 case aarch64_mode_entity::LOCAL_SME_STATE
:
30132 return int (aarch64_cfun_shared_flags ("za") != 0
30133 ? aarch64_local_sme_state::ACTIVE_LIVE
30134 : aarch64_cfun_incoming_pstate_za () != 0
30135 ? aarch64_local_sme_state::ACTIVE_DEAD
30136 : aarch64_local_sme_state::INACTIVE_CALLER
);
30138 gcc_unreachable ();
30141 /* Implement TARGET_MODE_EXIT. */
30144 aarch64_mode_exit (int entity
)
30146 switch (aarch64_mode_entity (entity
))
30148 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30149 return int (aarch64_tristate_mode::MAYBE
);
30151 case aarch64_mode_entity::LOCAL_SME_STATE
:
30152 return int (aarch64_cfun_shared_flags ("za") != 0
30153 ? aarch64_local_sme_state::ACTIVE_LIVE
30154 : aarch64_cfun_incoming_pstate_za () != 0
30155 ? aarch64_local_sme_state::ACTIVE_DEAD
30156 : aarch64_local_sme_state::INACTIVE_CALLER
);
30158 gcc_unreachable ();
30161 /* Implement TARGET_MODE_EH_HANDLER. */
30164 aarch64_mode_eh_handler (int entity
)
30166 switch (aarch64_mode_entity (entity
))
30168 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER
:
30169 /* Require a lazy save buffer to be allocated before the first
30170 insn that can throw. */
30171 return int (aarch64_tristate_mode::YES
);
30173 case aarch64_mode_entity::LOCAL_SME_STATE
:
30174 return int (aarch64_local_sme_state::SAVED_LOCAL
);
30176 gcc_unreachable ();
30179 /* Implement TARGET_MODE_PRIORITY. */
30182 aarch64_mode_priority (int, int n
)
30187 /* Implement TARGET_MD_ASM_ADJUST. */
30190 aarch64_md_asm_adjust (vec
<rtx
> &outputs
, vec
<rtx
> &inputs
,
30191 vec
<machine_mode
> &input_modes
,
30192 vec
<const char *> &constraints
,
30193 vec
<rtx
> &uses
, vec
<rtx
> &clobbers
,
30194 HARD_REG_SET
&clobbered_regs
, location_t loc
)
30196 rtx_insn
*seq
= arm_md_asm_adjust (outputs
, inputs
, input_modes
, constraints
,
30197 uses
, clobbers
, clobbered_regs
, loc
);
30199 /* "za" in the clobber list of a function with ZA state is defined to
30200 mean that the asm can read from and write to ZA. We can model the
30201 read using a USE, but unfortunately, it's not possible to model the
30202 write directly. Use a separate insn to model the effect.
30204 We must ensure that ZA is active on entry, which is enforced by using
30205 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
30207 The same thing applies to ZT0. */
30209 for (unsigned int i
= clobbers
.length (); i
-- > 0; )
30211 rtx x
= clobbers
[i
];
30213 && (REGNO (x
) == ZA_REGNUM
|| REGNO (x
) == ZT0_REGNUM
))
30215 auto id
= cfun
->machine
->next_asm_update_za_id
++;
30220 rtx id_rtx
= gen_int_mode (id
, SImode
);
30221 emit_insn (REGNO (x
) == ZA_REGNUM
30222 ? gen_aarch64_asm_update_za (id_rtx
)
30223 : gen_aarch64_asm_update_zt0 (id_rtx
));
30224 seq
= get_insns ();
30227 auto mode
= REGNO (x
) == ZA_REGNUM
? VNx16QImode
: V8DImode
;
30228 uses
.safe_push (gen_rtx_REG (mode
, REGNO (x
)));
30229 uses
.safe_push (gen_rtx_REG (DImode
, SME_STATE_REGNUM
));
30231 clobbers
.ordered_remove (i
);
30232 CLEAR_HARD_REG_BIT (clobbered_regs
, REGNO (x
));
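/* For reference, a hedged user-level example (not compiled here) of what the
   adjustment above handles: in a function declared with __arm_inout("za"),

     __asm__ volatile ("" ::: "za");

   lets the asm read and write ZA.  The "za" clobber is rewritten into USEs of
   ZA and SME_STATE_REGNUM plus a separate aarch64_asm_update_za insn, since a
   plain clobber cannot model the write.  */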
30238 /* BB is the target of an exception or nonlocal goto edge, which means
30239 that PSTATE.SM is known to be 0 on entry. Put it into the state that
30240 the current function requires. */
30243 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb
)
30245 if (TARGET_NON_STREAMING
)
30249 rtx_insn
*guard_label
= nullptr;
30250 if (TARGET_STREAMING_COMPATIBLE
)
30251 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30252 AARCH64_ISA_MODE_SM_OFF
);
30253 aarch64_sme_mode_switch_regs args_switch
;
30254 args_switch
.add_call_preserved_regs (df_get_live_in (bb
));
30255 args_switch
.emit_prologue ();
30256 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_OFF
, AARCH64_ISA_MODE_SM_ON
);
30257 args_switch
.emit_epilogue ();
30259 emit_label (guard_label
);
30260 auto seq
= get_insns ();
30263 emit_insn_after (seq
, bb_note (bb
));
30267 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
30268 so arrange to make it so. */
30271 aarch64_switch_pstate_sm_for_jump (rtx_insn
*jump
)
30273 if (TARGET_NON_STREAMING
)
30277 rtx_insn
*guard_label
= nullptr;
30278 if (TARGET_STREAMING_COMPATIBLE
)
30279 guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30280 AARCH64_ISA_MODE_SM_OFF
);
30281 aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON
, AARCH64_ISA_MODE_SM_OFF
);
30283 emit_label (guard_label
);
30284 auto seq
= get_insns ();
30287 emit_insn_before (seq
, jump
);
30291 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
30292 to switch to the new mode and the instructions needed to restore the
30293 original mode. Return true if something changed. */
30295 aarch64_switch_pstate_sm_for_call (rtx_call_insn
*call
)
30297 /* Mode switches for sibling calls are handled via the epilogue. */
30298 if (SIBLING_CALL_P (call
))
30301 auto callee_isa_mode
= aarch64_insn_callee_isa_mode (call
);
30302 if (!aarch64_call_switches_pstate_sm (callee_isa_mode
))
30305 /* Switch mode before the call, preserving any argument registers
30306 across the switch. */
30308 rtx_insn
*args_guard_label
= nullptr;
30309 if (TARGET_STREAMING_COMPATIBLE
)
30310 args_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30312 aarch64_sme_mode_switch_regs args_switch
;
30313 args_switch
.add_call_args (call
);
30314 args_switch
.emit_prologue ();
30315 aarch64_switch_pstate_sm (AARCH64_ISA_MODE
, callee_isa_mode
);
30316 args_switch
.emit_epilogue ();
30317 if (args_guard_label
)
30318 emit_label (args_guard_label
);
30319 auto args_seq
= get_insns ();
30321 emit_insn_before (args_seq
, call
);
30323 if (find_reg_note (call
, REG_NORETURN
, NULL_RTX
))
30326 /* Switch mode after the call, preserving any return registers across
30329 rtx_insn
*return_guard_label
= nullptr;
30330 if (TARGET_STREAMING_COMPATIBLE
)
30331 return_guard_label
= aarch64_guard_switch_pstate_sm (IP0_REGNUM
,
30333 aarch64_sme_mode_switch_regs return_switch
;
30334 return_switch
.add_call_result (call
);
30335 return_switch
.emit_prologue ();
30336 aarch64_switch_pstate_sm (callee_isa_mode
, AARCH64_ISA_MODE
);
30337 return_switch
.emit_epilogue ();
30338 if (return_guard_label
)
30339 emit_label (return_guard_label
);
30340 auto result_seq
= get_insns ();
30342 emit_insn_after (result_seq
, call
);
30348 const pass_data pass_data_switch_pstate_sm
=
30351 "smstarts", // name
30352 OPTGROUP_NONE
, // optinfo_flags
30354 0, // properties_required
30355 0, // properties_provided
30356 0, // properties_destroyed
30357 0, // todo_flags_start
30358 TODO_df_finish
, // todo_flags_finish
30361 class pass_switch_pstate_sm
: public rtl_opt_pass
30364 pass_switch_pstate_sm (gcc::context
*ctxt
)
30365 : rtl_opt_pass (pass_data_switch_pstate_sm
, ctxt
)
30368 // opt_pass methods:
30369 bool gate (function
*) override final
;
30370 unsigned int execute (function
*) override final
;
30374 pass_switch_pstate_sm::gate (function
*fn
)
30376 return (aarch64_fndecl_pstate_sm (fn
->decl
) != AARCH64_ISA_MODE_SM_OFF
30377 || cfun
->machine
->call_switches_pstate_sm
);
30380 /* Emit any instructions needed to switch PSTATE.SM. */
30382 pass_switch_pstate_sm::execute (function
*fn
)
30386 auto_sbitmap
blocks (last_basic_block_for_fn (cfun
));
30387 bitmap_clear (blocks
);
30388 FOR_EACH_BB_FN (bb
, fn
)
30390 if (has_abnormal_call_or_eh_pred_edge_p (bb
)
30391 && aarch64_switch_pstate_sm_for_landing_pad (bb
))
30392 bitmap_set_bit (blocks
, bb
->index
);
30394 if (cfun
->machine
->call_switches_pstate_sm
)
30397 FOR_BB_INSNS (bb
, insn
)
30398 if (auto *call
= dyn_cast
<rtx_call_insn
*> (insn
))
30399 if (aarch64_switch_pstate_sm_for_call (call
))
30400 bitmap_set_bit (blocks
, bb
->index
);
30403 auto end
= BB_END (bb
);
30405 && find_reg_note (end
, REG_NON_LOCAL_GOTO
, NULL_RTX
)
30406 && aarch64_switch_pstate_sm_for_jump (end
))
30407 bitmap_set_bit (blocks
, bb
->index
);
30409 find_many_sub_basic_blocks (blocks
);
30410 clear_aux_for_blocks ();
30417 make_pass_switch_pstate_sm (gcc::context
*ctxt
)
30419 return new pass_switch_pstate_sm (ctxt
);
30422 /* Parse an implementation-defined system register name of
30423 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30424 Return true if name matched against above pattern, false
30427 aarch64_is_implem_def_reg (const char *regname
)
30430 unsigned name_len
= strlen (regname
);
30431 if (name_len
< 12 || name_len
> 14)
30434 auto cterm_valid_p
= [&]()
30436 bool leading_zero_p
= false;
30440 if (regname
[pos
] != 'c')
30443 while (regname
[pos
] != '_')
30445 if (leading_zero_p
)
30447 if (i
== 0 && regname
[pos
] == '0')
30448 leading_zero_p
= true;
30451 if (!ISDIGIT (regname
[pos
]))
30453 n
[i
++] = regname
[pos
++];
30460 if (regname
[pos
] != 's')
30463 if (regname
[pos
] < '0' || regname
[pos
] > '3')
30466 if (regname
[pos
++] != '_')
30468 if (regname
[pos
] < '0' || regname
[pos
] > '7')
30471 if (regname
[pos
++] != '_')
30473 if (!cterm_valid_p ())
30475 if (regname
[pos
++] != '_')
30477 if (!cterm_valid_p ())
30479 if (regname
[pos
++] != '_')
30481 if (regname
[pos
] < '0' || regname
[pos
] > '7')
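/* Illustrative checks (not wired into the selftests) of the pattern accepted
   above; the helper name is invented for this sketch.  */
static void ATTRIBUTE_UNUSED
aarch64_sketch_implem_def_reg_examples ()
{
  /* Implementation-defined form s<op0>_<op1>_c<CRn>_c<CRm>_<op2>.  */
  gcc_checking_assert (aarch64_is_implem_def_reg ("s3_0_c15_c2_0"));
  /* op0 is restricted to 0-3.  */
  gcc_checking_assert (!aarch64_is_implem_def_reg ("s4_0_c15_c2_0"));
  /* Leading zeroes in the C<n> fields are rejected.  */
  gcc_checking_assert (!aarch64_is_implem_def_reg ("s3_0_c015_c2_0"));
}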
30486 /* Return true if REGNAME matches either a known permitted system
30487 register name, or a generic sysreg specification. For use in
30488 back-end predicate `aarch64_sysreg_string'. */
30490 aarch64_valid_sysreg_name_p (const char *regname
)
30492 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30493 if (sysreg
== NULL
)
30494 return aarch64_is_implem_def_reg (regname
);
30495 if (sysreg
->arch_reqs
)
30496 return bool (aarch64_isa_flags
& sysreg
->arch_reqs
);
30500 /* Return the generic sysreg specification for a valid system register
30501 name, otherwise NULL. WRITE_P is true iff the register is being
30502 written to. IS128OP indicates the requested system register should
30503 be checked for a 128-bit implementation. */
30505 aarch64_retrieve_sysreg (const char *regname
, bool write_p
, bool is128op
)
30507 const sysreg_t
*sysreg
= aarch64_lookup_sysreg_map (regname
);
30508 if (sysreg
== NULL
)
30510 if (aarch64_is_implem_def_reg (regname
))
30515 if (is128op
&& !(sysreg
->properties
& F_REG_128
))
30517 if ((write_p
&& (sysreg
->properties
& F_REG_READ
))
30518 || (!write_p
&& (sysreg
->properties
& F_REG_WRITE
)))
30520 if ((~aarch64_isa_flags
& sysreg
->arch_reqs
) != 0)
30522 return sysreg
->encoding
;
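/* Hedged usage note (user-level, not compiled here): these routines back the
   ACLE system-register builtins, so with arm_acle.h one can write e.g.

     uint64_t v = __arm_rsr64 ("tpidr_el0");
     __arm_wsr64 ("s3_0_c15_c2_0", v);

   where the generic name in the second call is validated by
   aarch64_is_implem_def_reg.  */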
30525 /* Target-specific selftests. */
30529 namespace selftest
{
30531 /* Selftest for the RTL loader.
30532 Verify that the RTL loader copes with a dump from
30533 print_rtx_function. This is essentially just a test that class
30534 function_reader can handle a real dump, but it also verifies
30535 that lookup_reg_by_dump_name correctly handles hard regs.
30536 The presence of hard reg names in the dump means that the test is
30537 target-specific, hence it is in this file. */
30540 aarch64_test_loading_full_dump ()
30542 rtl_dump_test
t (SELFTEST_LOCATION
, locate_file ("aarch64/times-two.rtl"));
30544 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun
->decl
)));
30546 rtx_insn
*insn_1
= get_insn_by_uid (1);
30547 ASSERT_EQ (NOTE
, GET_CODE (insn_1
));
30549 rtx_insn
*insn_15
= get_insn_by_uid (15);
30550 ASSERT_EQ (INSN
, GET_CODE (insn_15
));
30551 ASSERT_EQ (USE
, GET_CODE (PATTERN (insn_15
)));
30553 /* Verify crtl->return_rtx. */
30554 ASSERT_EQ (REG
, GET_CODE (crtl
->return_rtx
));
30555 ASSERT_EQ (0, REGNO (crtl
->return_rtx
));
30556 ASSERT_EQ (SImode
, GET_MODE (crtl
->return_rtx
));
30559 /* Test the fractional_cost class. */
30562 aarch64_test_fractional_cost ()
30564 using cf
= fractional_cost
;
30566 ASSERT_EQ (cf (0, 20), 0);
30568 ASSERT_EQ (cf (4, 2), 2);
30569 ASSERT_EQ (3, cf (9, 3));
30571 ASSERT_NE (cf (5, 2), 2);
30572 ASSERT_NE (3, cf (8, 3));
30574 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30575 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30576 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30578 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30579 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30580 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30581 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30582 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30583 ASSERT_EQ (3 - cf (10, 3), 0);
30585 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30586 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30588 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30589 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30590 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30591 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30592 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30593 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30594 ASSERT_TRUE (cf (239, 240) <= 1);
30595 ASSERT_TRUE (cf (240, 240) <= 1);
30596 ASSERT_FALSE (cf (241, 240) <= 1);
30597 ASSERT_FALSE (2 <= cf (207, 104));
30598 ASSERT_TRUE (2 <= cf (208, 104));
30599 ASSERT_TRUE (2 <= cf (209, 104));
30601 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30602 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30603 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30604 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30605 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30606 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30607 ASSERT_TRUE (cf (239, 240) < 1);
30608 ASSERT_FALSE (cf (240, 240) < 1);
30609 ASSERT_FALSE (cf (241, 240) < 1);
30610 ASSERT_FALSE (2 < cf (207, 104));
30611 ASSERT_FALSE (2 < cf (208, 104));
30612 ASSERT_TRUE (2 < cf (209, 104));
30614 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30615 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30616 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30617 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30618 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30619 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30620 ASSERT_FALSE (cf (239, 240) >= 1);
30621 ASSERT_TRUE (cf (240, 240) >= 1);
30622 ASSERT_TRUE (cf (241, 240) >= 1);
30623 ASSERT_TRUE (2 >= cf (207, 104));
30624 ASSERT_TRUE (2 >= cf (208, 104));
30625 ASSERT_FALSE (2 >= cf (209, 104));
30627 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30628 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30629 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30630 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30631 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30632 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30633 ASSERT_FALSE (cf (239, 240) > 1);
30634 ASSERT_FALSE (cf (240, 240) > 1);
30635 ASSERT_TRUE (cf (241, 240) > 1);
30636 ASSERT_TRUE (2 > cf (207, 104));
30637 ASSERT_FALSE (2 > cf (208, 104));
30638 ASSERT_FALSE (2 > cf (209, 104));
30640 ASSERT_EQ (cf (1, 2).ceil (), 1);
30641 ASSERT_EQ (cf (11, 7).ceil (), 2);
30642 ASSERT_EQ (cf (20, 1).ceil (), 20);
30643 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30644 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30645 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30646 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30647 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30649 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
/* Check whether our system register data, as imported from
   `aarch64-sys-reg.def', has any duplicate entries.  */
30655 aarch64_test_sysreg_encoding_clashes (void)
30657 using dup_instances_t
= hash_map
<nofree_string_hash
,
30658 std::vector
<const sysreg_t
*>>;
30660 dup_instances_t duplicate_instances
;
30662 /* Every time an encoding is established to come up more than once
30663 we add it to a "clash-analysis queue", which is then used to extract
30664 necessary information from our hash map when establishing whether
30665 repeated encodings are valid. */
30667 /* 1) Collect recurrence information. */
30668 for (unsigned i
= 0; i
< ARRAY_SIZE (aarch64_sysregs
); i
++)
30670 const sysreg_t
*reg
= aarch64_sysregs
+ i
;
30672 std::vector
<const sysreg_t
*> *tmp
30673 = &duplicate_instances
.get_or_insert (reg
->encoding
);
30675 tmp
->push_back (reg
);
30678 /* 2) Carry out analysis on collected data. */
30679 for (auto instance
: duplicate_instances
)
30681 unsigned nrep
= instance
.second
.size ();
30683 for (unsigned i
= 0; i
< nrep
; i
++)
30684 for (unsigned j
= i
+ 1; j
< nrep
; j
++)
30686 const sysreg_t
*a
= instance
.second
[i
];
30687 const sysreg_t
*b
= instance
.second
[j
];
30688 ASSERT_TRUE ((a
->properties
!= b
->properties
)
30689 || (a
->arch_reqs
!= b
->arch_reqs
));
30694 /* Run all target-specific selftests. */
30697 aarch64_run_selftests (void)
30699 aarch64_test_loading_full_dump ();
30700 aarch64_test_fractional_cost ();
30701 aarch64_test_sysreg_encoding_clashes ();
30704 } // namespace selftest
30706 #endif /* #if CHECKING_P */
30708 #undef TARGET_STACK_PROTECT_GUARD
30709 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30711 #undef TARGET_ADDRESS_COST
30712 #define TARGET_ADDRESS_COST aarch64_address_cost
/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
30718 #undef TARGET_ALIGN_ANON_BITFIELD
30719 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
30721 #undef TARGET_ASM_ALIGNED_DI_OP
30722 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30724 #undef TARGET_ASM_ALIGNED_HI_OP
30725 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30727 #undef TARGET_ASM_ALIGNED_SI_OP
30728 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30730 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30731 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30732 hook_bool_const_tree_hwi_hwi_const_tree_true
30734 #undef TARGET_ASM_FILE_START
30735 #define TARGET_ASM_FILE_START aarch64_start_file
30737 #undef TARGET_ASM_OUTPUT_MI_THUNK
30738 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30740 #undef TARGET_ASM_SELECT_RTX_SECTION
30741 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30743 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30744 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30746 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30747 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30749 #undef TARGET_BUILD_BUILTIN_VA_LIST
30750 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30752 #undef TARGET_CALLEE_COPIES
30753 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30755 #undef TARGET_FRAME_POINTER_REQUIRED
30756 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30758 #undef TARGET_CAN_ELIMINATE
30759 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30761 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30762 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30763 aarch64_function_attribute_inlinable_p
30765 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30766 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30768 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30769 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30771 #undef TARGET_CAN_INLINE_P
30772 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30774 #undef TARGET_CANNOT_FORCE_CONST_MEM
30775 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30777 #undef TARGET_CASE_VALUES_THRESHOLD
30778 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30780 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30781 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30783 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30784 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30786 /* Only the least significant bit is used for initialization guard
30788 #undef TARGET_CXX_GUARD_MASK_BIT
30789 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
30791 #undef TARGET_C_MODE_FOR_SUFFIX
30792 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30794 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30795 #undef TARGET_DEFAULT_TARGET_FLAGS
30796 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30799 #undef TARGET_CLASS_MAX_NREGS
30800 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30802 #undef TARGET_BUILTIN_DECL
30803 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30805 #undef TARGET_BUILTIN_RECIPROCAL
30806 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30808 #undef TARGET_C_EXCESS_PRECISION
30809 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30811 #undef TARGET_C_BITINT_TYPE_INFO
30812 #define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
30814 #undef TARGET_C_MODE_FOR_FLOATING_TYPE
30815 #define TARGET_C_MODE_FOR_FLOATING_TYPE aarch64_c_mode_for_floating_type
30817 #undef TARGET_EXPAND_BUILTIN
30818 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30820 #undef TARGET_EXPAND_BUILTIN_VA_START
30821 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30823 #undef TARGET_FOLD_BUILTIN
30824 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30826 #undef TARGET_FUNCTION_ARG
30827 #define TARGET_FUNCTION_ARG aarch64_function_arg
30829 #undef TARGET_FUNCTION_ARG_ADVANCE
30830 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30832 #undef TARGET_FUNCTION_ARG_BOUNDARY
30833 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30835 #undef TARGET_FUNCTION_ARG_PADDING
30836 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30838 #undef TARGET_GET_RAW_RESULT_MODE
30839 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30840 #undef TARGET_GET_RAW_ARG_MODE
30841 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30843 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30844 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30846 #undef TARGET_FUNCTION_VALUE
30847 #define TARGET_FUNCTION_VALUE aarch64_function_value
30849 #undef TARGET_FUNCTION_VALUE_REGNO_P
30850 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30852 #undef TARGET_START_CALL_ARGS
30853 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30855 #undef TARGET_END_CALL_ARGS
30856 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30858 #undef TARGET_GIMPLE_FOLD_BUILTIN
30859 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30861 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30862 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30864 #undef TARGET_INIT_BUILTINS
30865 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30867 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30868 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30869 aarch64_ira_change_pseudo_allocno_class
30871 #undef TARGET_LEGITIMATE_ADDRESS_P
30872 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30874 #undef TARGET_LEGITIMATE_CONSTANT_P
30875 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30877 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30878 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30879 aarch64_legitimize_address_displacement
30881 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30882 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30884 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30885 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30886 aarch64_libgcc_floating_mode_supported_p
30888 #undef TARGET_MANGLE_TYPE
30889 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30891 #undef TARGET_INVALID_CONVERSION
30892 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
30894 #undef TARGET_INVALID_UNARY_OP
30895 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
30897 #undef TARGET_INVALID_BINARY_OP
30898 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30900 #undef TARGET_VERIFY_TYPE_CONTEXT
30901 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30903 #undef TARGET_MEMORY_MOVE_COST
30904 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30906 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30907 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30909 #undef TARGET_MUST_PASS_IN_STACK
30910 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30912 /* This target hook should return true if accesses to volatile bitfields
30913 should use the narrowest mode possible. It should return false if these
30914 accesses should use the bitfield container type. */
30915 #undef TARGET_NARROW_VOLATILE_BITFIELD
30916 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
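/* A minimal illustration (not part of the port) of what the choice above
   means: with the hook returning false, a read of the volatile bit-field X
   in the hypothetical example below is performed with the width of its
   declared container type (a 32-bit access) rather than with the narrowest
   mode that covers the field (a byte access), which is consistent with the
   AAPCS64 treatment of volatile bit-fields:

     struct s { volatile unsigned int x : 8; };
     unsigned int get_x (struct s *p) { return p->x; }  */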
30918 #undef TARGET_OPTION_OVERRIDE
30919 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30921 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30922 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30923 aarch64_override_options_after_change
30925 #undef TARGET_OFFLOAD_OPTIONS
30926 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30928 #undef TARGET_OPTION_RESTORE
30929 #define TARGET_OPTION_RESTORE aarch64_option_restore
30931 #undef TARGET_OPTION_PRINT
30932 #define TARGET_OPTION_PRINT aarch64_option_print
30934 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30935 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30937 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30938 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30939 aarch64_option_valid_version_attribute_p
30941 #undef TARGET_SET_CURRENT_FUNCTION
30942 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30944 #undef TARGET_PASS_BY_REFERENCE
30945 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30947 #undef TARGET_PREFERRED_RELOAD_CLASS
30948 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30950 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30951 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30953 #undef TARGET_DWARF_FRAME_REG_MODE
30954 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30956 #undef TARGET_OUTPUT_CFI_DIRECTIVE
30957 #define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive
30959 #undef TARGET_DW_CFI_OPRND1_DESC
30960 #define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc
30962 #undef TARGET_PROMOTED_TYPE
30963 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30965 #undef TARGET_SECONDARY_RELOAD
30966 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30968 #undef TARGET_SECONDARY_MEMORY_NEEDED
30969 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30971 #undef TARGET_SHIFT_TRUNCATION_MASK
30972 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30974 #undef TARGET_SETUP_INCOMING_VARARGS
30975 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30977 #undef TARGET_STRUCT_VALUE_RTX
30978 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30980 #undef TARGET_REGISTER_MOVE_COST
30981 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30983 #undef TARGET_RETURN_IN_MEMORY
30984 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30986 #undef TARGET_RETURN_IN_MSB
30987 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30989 #undef TARGET_RTX_COSTS
30990 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30992 #undef TARGET_INSN_COST
30993 #define TARGET_INSN_COST aarch64_insn_cost
30995 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30996 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30998 #undef TARGET_SCHED_ISSUE_RATE
30999 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
31001 #undef TARGET_SCHED_VARIABLE_ISSUE
31002 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
31004 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
31005 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
31006 aarch64_sched_first_cycle_multipass_dfa_lookahead
31008 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
31009 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
31010 aarch64_first_cycle_multipass_dfa_lookahead_guard
31012 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
31013 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
31014 aarch64_get_separate_components
31016 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
31017 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
31018 aarch64_components_for_bb
31020 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
31021 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
31022 aarch64_disqualify_components
31024 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
31025 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
31026 aarch64_emit_prologue_components
31028 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
31029 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
31030 aarch64_emit_epilogue_components
31032 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
31033 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
31034 aarch64_set_handled_components
31036 #undef TARGET_TRAMPOLINE_INIT
31037 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
31039 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
31040 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
31042 #undef TARGET_VECTOR_MODE_SUPPORTED_P
31043 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
31045 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
31046 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
31048 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
31049 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
31051 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
31052 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
31053 aarch64_builtin_support_vector_misalignment
31055 #undef TARGET_ARRAY_MODE
31056 #define TARGET_ARRAY_MODE aarch64_array_mode
31058 #undef TARGET_ARRAY_MODE_SUPPORTED_P
31059 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
31061 #undef TARGET_VECTORIZE_CREATE_COSTS
31062 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
31064 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
31065 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
31066 aarch64_builtin_vectorization_cost
31068 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
31069 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
31071 #undef TARGET_VECTORIZE_BUILTINS
31072 #define TARGET_VECTORIZE_BUILTINS
31074 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
31075 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
31076 aarch64_autovectorize_vector_modes
31078 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
31079 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
31080 aarch64_atomic_assign_expand_fenv
31082 /* Section anchor support. */
31084 #undef TARGET_MIN_ANCHOR_OFFSET
31085 #define TARGET_MIN_ANCHOR_OFFSET -256
31087 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
31088 byte offset; we can do much more for larger data types, but have no way
31089 to determine the size of the access. We assume accesses are aligned. */
31090 #undef TARGET_MAX_ANCHOR_OFFSET
31091 #define TARGET_MAX_ANCHOR_OFFSET 4095
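/* A rough illustration (not part of the port) of where these limits come
   from: an unscaled signed 9-bit offset (LDUR/STUR) reaches -256..255, and
   the unsigned scaled 12-bit offset reaches 0..4095 when the access is a
   single byte (LDRB/STRB).  Wider accesses scale the 12-bit field by the
   access size and could therefore reach further, but since the anchor code
   does not know the access size it must assume the byte range:

     ldurb  w0, [x1, #-256]   // most negative anchor-relative offset
     ldrb   w0, [x1, #4095]   // largest anchor-relative byte offset  */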
31093 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
31094 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
31095 aarch64_vectorize_preferred_div_as_shifts_over_mult
31097 #undef TARGET_VECTOR_ALIGNMENT
31098 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
31100 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
31101 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
31102 aarch64_vectorize_preferred_vector_alignment
31103 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
31104 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
31105 aarch64_simd_vector_alignment_reachable
31107 /* vec_perm support. */
31109 #undef TARGET_VECTORIZE_VEC_PERM_CONST
31110 #define TARGET_VECTORIZE_VEC_PERM_CONST \
31111 aarch64_vectorize_vec_perm_const
31113 #undef TARGET_VECTORIZE_RELATED_MODE
31114 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
31115 #undef TARGET_VECTORIZE_GET_MASK_MODE
31116 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
31117 #undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
31118 #define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
31119 aarch64_conditional_operation_is_expensive
31120 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
31121 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
31122 aarch64_empty_mask_is_expensive
31123 #undef TARGET_PREFERRED_ELSE_VALUE
31124 #define TARGET_PREFERRED_ELSE_VALUE \
31125 aarch64_preferred_else_value
31127 #undef TARGET_INIT_LIBFUNCS
31128 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
31130 #undef TARGET_FIXED_CONDITION_CODE_REGS
31131 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
31133 #undef TARGET_FLAGS_REGNUM
31134 #define TARGET_FLAGS_REGNUM CC_REGNUM
31136 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
31137 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
31139 #undef TARGET_ASAN_SHADOW_OFFSET
31140 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
31142 #undef TARGET_LEGITIMIZE_ADDRESS
31143 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
31145 #undef TARGET_SCHED_CAN_SPECULATE_INSN
31146 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
31148 #undef TARGET_CAN_USE_DOLOOP_P
31149 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
31151 #undef TARGET_SCHED_ADJUST_PRIORITY
31152 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
31154 #undef TARGET_SCHED_MACRO_FUSION_P
31155 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
31157 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
31158 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
31160 #undef TARGET_SCHED_FUSION_PRIORITY
31161 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
31163 #undef TARGET_UNSPEC_MAY_TRAP_P
31164 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
31166 #undef TARGET_USE_PSEUDO_PIC_REG
31167 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
31169 #undef TARGET_PRINT_OPERAND
31170 #define TARGET_PRINT_OPERAND aarch64_print_operand
31172 #undef TARGET_PRINT_OPERAND_ADDRESS
31173 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
31175 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
31176 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
31178 #undef TARGET_OPTAB_SUPPORTED_P
31179 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
31181 #undef TARGET_OMIT_STRUCT_RETURN_REG
31182 #define TARGET_OMIT_STRUCT_RETURN_REG true
31184 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
31185 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
31186 aarch64_dwarf_poly_indeterminate_value
31188 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
31189 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
31190 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
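/* A schematic sketch only (the exact lowering and descriptor layout are
   owned by the middle end, so the word order below is an assumption):
   because AArch64 code addresses are at least 4-byte aligned and bits 0
   and 1 are reserved, bit 2 can distinguish a pointer to a nested-function
   descriptor from a plain code address.  An indirect call then conceptually
   becomes:

     void
     call_maybe_descriptor (uintptr_t fn)
     {
       void *chain = NULL;
       if (fn & 4)                              // Descriptor, not code.
         {
           void **desc = (void **) (fn & ~(uintptr_t) 4);
           chain = desc[1];                     // Static chain value.
           fn = (uintptr_t) desc[0];            // Real entry point.
         }
       // ...branch to FN with CHAIN in the static chain register...
     }  */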
31192 #undef TARGET_HARD_REGNO_NREGS
31193 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
31194 #undef TARGET_HARD_REGNO_MODE_OK
31195 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
31197 #undef TARGET_MODES_TIEABLE_P
31198 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
31200 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
31201 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
31202 aarch64_hard_regno_call_part_clobbered
31204 #undef TARGET_INSN_CALLEE_ABI
31205 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
31207 #undef TARGET_CONSTANT_ALIGNMENT
31208 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
31210 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
31211 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
31212 aarch64_stack_clash_protection_alloca_probe_range
31214 #undef TARGET_COMPUTE_PRESSURE_CLASSES
31215 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
31217 #undef TARGET_CAN_CHANGE_MODE_CLASS
31218 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
31220 #undef TARGET_SELECT_EARLY_REMAT_MODES
31221 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
31223 #undef TARGET_SPECULATION_SAFE_VALUE
31224 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
31226 #undef TARGET_ESTIMATED_POLY_VALUE
31227 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
31229 #undef TARGET_ATTRIBUTE_TABLE
31230 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
31232 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
31233 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
31234 aarch64_simd_clone_compute_vecsize_and_simdlen
31236 #undef TARGET_SIMD_CLONE_ADJUST
31237 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
31239 #undef TARGET_SIMD_CLONE_USABLE
31240 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
31242 #undef TARGET_COMP_TYPE_ATTRIBUTES
31243 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
31245 #undef TARGET_MERGE_DECL_ATTRIBUTES
31246 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
31248 #undef TARGET_GET_MULTILIB_ABI_NAME
31249 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
31251 #undef TARGET_FNTYPE_ABI
31252 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
31254 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
31255 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
31257 #if CHECKING_P
31258 #undef TARGET_RUN_TARGET_SELFTESTS
31259 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
31260 #endif /* #if CHECKING_P */
31262 #undef TARGET_ASM_POST_CFI_STARTPROC
31263 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
31265 #undef TARGET_STRICT_ARGUMENT_NAMING
31266 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
31268 #undef TARGET_MODE_EMIT
31269 #define TARGET_MODE_EMIT aarch64_mode_emit
31271 #undef TARGET_MODE_NEEDED
31272 #define TARGET_MODE_NEEDED aarch64_mode_needed
31274 #undef TARGET_MODE_AFTER
31275 #define TARGET_MODE_AFTER aarch64_mode_after
31277 #undef TARGET_MODE_CONFLUENCE
31278 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
31280 #undef TARGET_MODE_BACKPROP
31281 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
31283 #undef TARGET_MODE_ENTRY
31284 #define TARGET_MODE_ENTRY aarch64_mode_entry
31286 #undef TARGET_MODE_EXIT
31287 #define TARGET_MODE_EXIT aarch64_mode_exit
31289 #undef TARGET_MODE_EH_HANDLER
31290 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
31292 #undef TARGET_MODE_PRIORITY
31293 #define TARGET_MODE_PRIORITY aarch64_mode_priority
31295 #undef TARGET_MD_ASM_ADJUST
31296 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
31298 #undef TARGET_ASM_FILE_END
31299 #define TARGET_ASM_FILE_END aarch64_asm_file_end
31301 #undef TARGET_ASM_FUNCTION_EPILOGUE
31302 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
31304 #undef TARGET_HAVE_SHADOW_CALL_STACK
31305 #define TARGET_HAVE_SHADOW_CALL_STACK true
31307 #undef TARGET_CONST_ANCHOR
31308 #define TARGET_CONST_ANCHOR 0x1000000
31310 #undef TARGET_EXTRA_LIVE_ON_ENTRY
31311 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
31313 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
31314 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
31316 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
31317 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
31319 #undef TARGET_OPTION_FUNCTION_VERSIONS
31320 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
31322 #undef TARGET_COMPARE_VERSION_PRIORITY
31323 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
31325 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
31326 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
31327 aarch64_generate_version_dispatcher_body
31329 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
31330 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
31331 aarch64_get_function_versions_dispatcher
31333 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
31334 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
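/* TARGET_INITIALIZER (from target-def.h, included above) expands to an
   aggregate initializer that gathers every TARGET_* macro defined in this
   file and supplies the documented default for any hook the port leaves
   untouched.  Target-independent code reaches the back end only through
   this structure, along the lines of:

     machine_mode mode = targetm.libgcc_cmp_return_mode ();
     if (targetm.have_shadow_call_stack)
       ...  */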
31336 struct gcc_target targetm = TARGET_INITIALIZER;
31338 #include "gt-aarch64.h"