//===------------------------- LSUnit.h --------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_LSUNIT_H
#define LLVM_MCA_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

class Scheduler;

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A MemoryGroup identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
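///
/// For illustration only, a client such as the LSUnit might link two groups
/// so that the loads in one group wait on the stores of another (the order of
/// calls below is a sketch, not a prescribed protocol):
/// \code
///   MemoryGroup StoreGroup;
///   MemoryGroup LoadGroup;
///   StoreGroup.addInstruction();          // One store tracked by StoreGroup.
///   StoreGroup.addSuccessor(&LoadGroup);  // Loads must wait on that store.
///   LoadGroup.addInstruction();
/// \endcode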
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  SmallVector<MemoryGroup *, 4> Succ;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor(), CriticalMemoryInstruction() {}
  MemoryGroup(MemoryGroup &&) = default;

  ArrayRef<MemoryGroup *> getSuccessors() const { return Succ; }
  unsigned getNumSuccessors() const { return Succ.size(); }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group) {
    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction);
    Succ.emplace_back(Group);
  }

  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // update the CriticalMemDep.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupIssued(CriticalMemoryInstruction);
  }

  void onInstructionExecuted() {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (!isExecuted())
      return;

    // Notify successors that this group has finished execution.
    for (MemoryGroup *MG : Succ)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  unsigned assignLQSlot() { return UsedLQEntries++; }
  unsigned assignSQSlot() { return UsedSQEntries++; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with a LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
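  ///
  /// A plausible dispatch-stage sequence might look like this (for
  /// illustration only; the setLSUTokenID setter name on Instruction is
  /// assumed here):
  /// \code
  ///   if (LSU.isAvailable(IR) == LSUnitBase::LSU_AVAILABLE) {
  ///     unsigned TokenID = LSU.dispatch(IR);
  ///     IR.getInstruction()->setLSUTokenID(TokenID);
  ///   }
  /// \endcode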
  virtual unsigned dispatch(const InstRef &IR) = 0;

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  // Instruction executed event handlers.
  virtual void onInstructionExecuted(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2. must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory barrier is a
/// 'mayLoad' and 'mayStore' instruction with unmodeled side effects. This is
/// obviously inaccurate, but this is the best that we can do at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
/// - A younger load cannot pass a load barrier.
/// - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to be a better
  // predictor of load latency than instruction latency. This is particularly
  // true when simulating code with temporal/spatial locality of memory
  // accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In the future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction with both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction with both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0) {}
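
  // For illustration only: a tool might build this unit from a subtarget's
  // scheduling model. `STI` below is assumed to be an MCSubtargetInfo provided
  // by the caller, and the queue sizes are arbitrary example values.
  //
  //   mca::LSUnit LSU(STI.getSchedModel(), /* LQ */ 32, /* SQ */ 20,
  //                   /* AssumeNoAlias */ true);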

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  // FIXME: For simplicity, we optimistically assume a similar behavior for
  // store instructions. In practice, store operations don't tend to leave the
  // store queue until they reach the 'Retired' stage (See PR39830).
  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_LSUNIT_H