//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
13 #include "AArch64Subtarget.h"
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/GlobalValue.h"
27 #include "llvm/Support/AArch64TargetParser.h"
28 #include "llvm/Support/TargetParser.h"
30 using namespace llvm;
32 #define DEBUG_TYPE "aarch64-subtarget"
34 #define GET_SUBTARGETINFO_CTOR
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #include "AArch64GenSubtargetInfo.inc"
static cl::opt<bool>
    EnableEarlyIfConvert("aarch64-early-ifcvt",
                         cl::desc("Enable the early if converter pass"),
                         cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
    UseAddressTopByteIgnored("aarch64-use-tbi",
                             cl::desc("Assume that the top byte of an address "
                                      "is ignored"),
                             cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));
AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}
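// Illustrative note (not in the original file): given the defaulting above,
// an unset -mcpu with -mtune=cortex-a57 resolves to CPUString = "generic" and
// TuneCPUString = "cortex-a57"; with both unset, both become "generic" and
// initializeProperties() below takes the `Others` path.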
void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}
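// Illustrative note (not in the original file): the *LogAlignment properties
// above are log2 values, so PrefFunctionLogAlignment = 4 requests
// 2^4 = 16-byte function alignment. A minimal sketch of the conversion the
// consumers perform:
//   Align PreferredFnAlign(1ULL << PrefFunctionLogAlignment); // 4 -> Align(16)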
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}
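// Note on the initializer list above (explanatory, not in the original file):
// InstrInfo is deliberately initialized via initializeSubtargetDependencies(),
// which parses the feature string before TLInfo(TM, *this) runs, so the
// target-lowering constructor already sees a fully configured subtarget. This
// relies on members being initialized in declaration order.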
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // The same applies to the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;
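  // Illustrative sketch (not in the original file) of why ADRP cannot
  // materialize a null extern-weak address when code sits above 4GB:
  //   adrp x0, sym            // PC-relative page address, +/- 4GB reach
  //   add  x0, x0, :lo12:sym  // low 12 bits of the address
  // If the PC is above 4GB, no ADRP immediate can reach page 0, so the GOT
  // entry (which can hold 0 for an undefined weak symbol) is used instead.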
  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}
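// Illustrative examples of the classification above (assuming the small code
// model on ELF; not part of the original file):
//   dso_local global   -> MO_NO_FLAG, lowered as ADRP + ADD (direct access).
//   preemptible global -> MO_GOT,     lowered as ADRP + LDR via its GOT slot.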
unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}
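// Illustrative IR exercising the NonLazyBind path above (hypothetical input,
// combined with -aarch64-enable-nonlazybind; not part of the original file):
//   declare void @f() nonlazybind
//   define void @g() {
//     call void @f()
//     ret void
//   }
// The call to @f is then emitted as a load of f's GOT entry followed by an
// indirect call, rather than a direct BL.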
void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT runs (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}
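// Illustrative note (not in the original file): with TBI, the MMU ignores
// bits 63:56 of a data address during translation, so a tagged pointer and
// its untagged counterpart reach the same memory (hypothetical values):
//   0x00007f0012345678 and 0xab007f0012345678 dereference identically.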
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}
void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}
bool AArch64Subtarget::useAA() const { return UseAA; }