//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {
/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
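///
/// A minimal illustrative sketch of the graph structure (not part of the API
/// contract; groups are normally created and wired by the LSUnit itself):
/// \code
///   MemoryGroup A, B;
///   A.addInstruction();    // A models one memory operation.
///   A.addSuccessor(&B);    // Instructions mapped to B depend on A.
///   B.addInstruction();
///   assert(A.isReady());   // A has no predecessors.
///   assert(B.isWaiting()); // B waits until A starts executing.
/// \endcode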
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  SmallVector<MemoryGroup *, 4> Succ;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
  unsigned getNumSuccessors() const { return Succ.size(); }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group) {
    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction);
    Succ.emplace_back(Group);
  }

  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupIssued(CriticalMemoryInstruction);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify successors that this group has finished execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};
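
// Illustrative walkthrough of the notification protocol (a sketch only; it
// assumes the two-group setup A -> B from the example in the class comment,
// and a hypothetical InstRef `LoadIR` for the instruction modelled by A):
//
//   A.onInstructionIssued(LoadIR);  // A starts executing and, because the
//                                   // whole group is now in flight, it calls
//                                   // B.onGroupIssued(LoadIR) internally.
//   assert(A.isExecuting() && B.isPending());
//
//   A.onInstructionExecuted();      // A completes; B.onGroupExecuted() is
//                                   // called internally.
//   assert(A.isExecuted() && B.isReady());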

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  const unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  const unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
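
  // Illustrative sketch (not the actual llvm-mca dispatch stage): a client is
  // expected to gate dispatch() behind isAvailable(). The helper name
  // `tryDispatch` and the use of a setter for the LSU token are assumptions.
  //
  //   bool tryDispatch(LSUnitBase &LSU, InstRef &IR) {
  //     if (LSU.isAvailable(IR) != LSUnitBase::LSU_AVAILABLE)
  //       return false;                     // Stall: a queue is full.
  //     unsigned Token = LSU.dispatch(IR);  // Allocate LQ/SQ entries.
  //     IR.getInstruction()->setLSUTokenID(Token);
  //     return true;
  //   }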

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }
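
  // Illustrative sketch: once an instruction has been dispatched and its token
  // stored in Instruction::LSUTokenID, a scheduler can classify it by its
  // memory-dependency state. The helper name `classifyMemDepState` is an
  // assumption, not part of this interface.
  //
  //   void classifyMemDepState(const LSUnitBase &LSU, const InstRef &IR) {
  //     if (LSU.isReady(IR)) {
  //       // All memory predecessors have executed; IR can be issued.
  //     } else if (LSU.isPending(IR)) {
  //       // Predecessors are currently executing; the wait time is known.
  //     } else if (LSU.isWaiting(IR)) {
  //       // Blocked on predecessors that have not started executing yet.
  //     }
  //   }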

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();
};
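
// Illustrative sketch of how a simulator might drive an LSUnitBase over a
// cycle. The helper name `notifyCycleEnd` and the event lists are assumptions,
// not part of this interface:
//
//   void notifyCycleEnd(LSUnitBase &LSU, ArrayRef<InstRef> Issued,
//                       ArrayRef<InstRef> Executed, ArrayRef<InstRef> Retired) {
//     for (const InstRef &IR : Issued)
//       LSU.onInstructionIssued(IR);    // The instruction's group starts executing.
//     for (const InstRef &IR : Executed)
//       LSU.onInstructionExecuted(IR);  // May make dependent groups ready.
//     for (const InstRef &IR : Retired)
//       LSU.onInstructionRetired(IR);   // Loads leave the LDQ at retirement.
//     LSU.cycleEvent();                 // Advance critical-dependency counters.
//   }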

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `dispatch()`.
/// To emulate a write-combining memory type, rule 2. must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means, it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means, all the older loads/stores have
/// already been executed.
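///
/// A short usage sketch (illustrative only; `LSU` is assumed to have been
/// created with `AssumeNoAlias == false`, and `OlderStore`/`YoungerLoad` are
/// InstRefs provided by the surrounding simulator):
/// \code
///   unsigned StoreTok = LSU.dispatch(OlderStore);
///   unsigned LoadTok  = LSU.dispatch(YoungerLoad);
///   // Under rule 4. above, the younger load is mapped to a memory group that
///   // depends on the older store's group: it is not reported as ready until
///   // the store has executed. Had `AssumeNoAlias` been true, the load would
///   // have been allowed to pass the store.
/// \endcode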
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  //
  // An instruction that both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that both 'MayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
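
  // Constructor sketch (illustrative; the queue sizes below are arbitrary and
  // `SM` is an MCSchedModel provided by the caller):
  //
  //   LSUnit Unbounded(SM);                   // Unbounded queues; loads may alias stores.
  //   LSUnit Sized(SM, /*LQ=*/32, /*SQ=*/24); // Bounded queues; loads may alias stores.
  //   LSUnit NoAliasLSU(SM, 32, 24, /*AssumeNoAlias=*/true);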

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H