//===------------------------- LSUnit.h --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

class Scheduler;

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
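///
/// For illustration only, a client such as the LSUnit might link two groups
/// so that the loads in one group wait on the stores of another (the order of
/// calls below is a sketch, not a prescribed protocol):
/// \code
///   MemoryGroup StoreGroup;
///   MemoryGroup LoadGroup;
///   StoreGroup.addInstruction();          // One store tracked by StoreGroup.
///   StoreGroup.addSuccessor(&LoadGroup);  // Loads must wait on that store.
///   LoadGroup.addInstruction();
/// \endcode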
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  SmallVector<MemoryGroup *, 4> Succ;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
  unsigned getNumSuccessors() const { return Succ.size(); }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group) {
    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction);
    Succ.emplace_back(Group);
  }

  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // update the CriticalMemDep.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupIssued(CriticalMemoryInstruction);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify successors that this group has finished execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  unsigned assignLQSlot() { return UsedLQEntries++; }
  unsigned assignSQSlot() { return UsedSQEntries++; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
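  ///
  /// A plausible dispatch-stage sequence might look like this (for
  /// illustration only; the setLSUTokenID setter name on Instruction is
  /// assumed here):
  /// \code
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned TokenID = LSU.dispatch(IR);
  ///     IR.getInstruction()->setLSUTokenID(TokenID);
  ///   }
  /// \endcode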
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  // Instruction executed event handlers.
  virtual void onInstructionExecuted(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2. must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// 'mayLoad' and 'mayStore' instruction with unmodeled side effects. This is
/// obviously inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction with both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction with both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
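
  // For illustration only: a tool might build this unit from a subtarget's
  // scheduling model. `STI` below is assumed to be an MCSubtargetInfo provided
  // by the caller, and the queue sizes are arbitrary example values.
  //
  //   mca::LSUnit LSU(STI.getSchedModel(), /* LQ */ 32, /* SQ */ 20,
  //                   /* AssumeNoAlias */ true);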

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  // FIXME: For simplicity, we optimistically assume a similar behavior for
  // store instructions. In practice, store operations don't tend to leave the
  // store queue until they reach the 'Retired' stage (See PR39830).
  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H