//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
13 #include "AArch64Subtarget.h"
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/GlobalValue.h"
27 #include "llvm/Support/AArch64TargetParser.h"
28 #include "llvm/Support/TargetParser.h"
30 using namespace llvm;
32 #define DEBUG_TYPE "aarch64-subtarget"
34 #define GET_SUBTARGETINFO_CTOR
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #include "AArch64GenSubtargetInfo.inc"
static cl::opt<bool>
    EnableEarlyIfConvert("aarch64-early-ifcvt",
                         cl::desc("Enable the early if converter pass"),
                         cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
    UseAddressTopByteIgnored("aarch64-use-tbi",
                             cl::desc("Assume that the top byte of an address "
                                      "is ignored"),
                             cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}
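// Illustrative note (not in the original file): given the defaulting above,
// an unset -mcpu with -mtune=cortex-a57 resolves to CPUString = "generic" and
// TuneCPUString = "cortex-a57"; with both unset, both become "generic" and
// initializeProperties() below takes the `Others` path.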
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}
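// Illustrative note (not in the original file): the *LogAlignment properties
// above are log2 values, so PrefFunctionLogAlignment = 4 requests
// 2^4 = 16-byte function alignment. A minimal sketch of the conversion the
// consumers perform:
//   Align PreferredFnAlign(1ULL << PrefFunctionLogAlignment); // 4 -> Align(16)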
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}
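// Note on the initializer list above (explanatory, not in the original file):
// InstrInfo is deliberately initialized via initializeSubtargetDependencies(),
// which parses the feature string before TLInfo(TM, *this) runs, so the
// target-lowering constructor already sees a fully configured subtarget. This
// relies on members being initialized in declaration order.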
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // The same applies to the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;
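  // Illustrative sketch (not in the original file) of why ADRP cannot
  // materialize a null extern-weak address when code sits above 4GB:
  //   adrp x0, sym            // PC-relative page address, +/- 4GB reach
  //   add  x0, x0, :lo12:sym  // low 12 bits of the address
  // If the PC is above 4GB, no ADRP immediate can reach page 0, so the GOT
  // entry (which can hold 0 for an undefined weak symbol) is used instead.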
  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}
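// Illustrative examples of the classification above (assuming the small code
// model on ELF; not part of the original file):
//   dso_local global   -> MO_NO_FLAG, lowered as ADRP + ADD (direct access).
//   preemptible global -> MO_GOT,     lowered as ADRP + LDR via its GOT slot.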
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}
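// Illustrative IR exercising the NonLazyBind path above (hypothetical input,
// combined with -aarch64-enable-nonlazybind; not part of the original file):
//   declare void @f() nonlazybind
//   define void @g() {
//     call void @f()
//     ret void
//   }
// The call to @f is then emitted as a load of f's GOT entry followed by an
// indirect call, rather than a direct BL.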
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT runs (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}
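// Illustrative note (not in the original file): with TBI, the MMU ignores
// bits 63:56 of a data address during translation, so a tagged pointer and
// its untagged counterpart reach the same memory (hypothetical values):
//   0x00007f0012345678 and 0xab007f0012345678 dereference identically.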
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}
bool AArch64Subtarget::useAA() const { return UseAA; }