llvm/tools/llvm-exegesis/lib/ParallelSnippetGenerator.cpp

   1 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "ParallelSnippetGenerator.h"
  10
  11 #include "BenchmarkRunner.h"
  12 #include "MCInstrDescView.h"
  13 #include "Target.h"
  14
  15 // FIXME: Load constants into registers (e.g. with fld1) to not break
  16 // instructions like x87.
  17
  18 // Ideally we would like the only limitation on executing instructions to be the
  19 // availability of the CPU resources (e.g. execution ports) needed to execute
  20 // them, instead of the availability of their data dependencies.
  21
  22 // To achieve that, one approach is to generate instructions that do not have
  23 // data dependencies between them.
  24 //
  25 // For some instructions, this is trivial:
  26 //    mov rax, qword ptr [rsi]
  27 //    mov rax, qword ptr [rsi]
  28 //    mov rax, qword ptr [rsi]
  29 //    mov rax, qword ptr [rsi]
  30 // For the above snippet, haswell just renames rax four times and executes the
  31 // four instructions two at a time on P23 and P0126.
  32 //
  33 // For some instructions, we just need to make sure that the source is
  34 // different from the destination. For example, IDIV8r reads from GPR and
  35 // writes to AX. We just need to ensure that the Var is assigned a
  36 // register which is different from AX:
  37 //    idiv bx
  38 //    idiv bx
  39 //    idiv bx
  40 //    idiv bx
  41 // The above snippet will be able to fully saturate the ports, while the same
  42 // with ax would issue one uop every `latency(IDIV8r)` cycles.
  43 //
  44 // Some instructions make this harder because they both read and write from
  45 // the same register:
  46 //    inc rax
  47 //    inc rax
  48 //    inc rax
  49 //    inc rax
  50 // This has a data dependency from each instruction to the next, limit the
  51 // number of instructions that can be issued in parallel.
  52 // It turns out that this is not a big issue on recent Intel CPUs because they
  53 // have heuristics to balance port pressure. In the snippet above, subsequent
  54 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
  55 // might end up executing them all on P0 (just because they can), or try
  56 // avoiding P5 because it's usually under high pressure from vector
  57 // instructions.
  58 // This issue is even more important for high-latency instructions because
  59 // they increase the idle time of the CPU, e.g. :
  60 //    imul rax, rbx
  61 //    imul rax, rbx
  62 //    imul rax, rbx
  63 //    imul rax, rbx
  64 //
  65 // To avoid that, we do the renaming statically by generating as many
  66 // independent exclusive assignments as possible (until all possible registers
  67 // are exhausted) e.g.:
  68 //    imul rax, rbx
  69 //    imul rcx, rbx
  70 //    imul rdx, rbx
  71 //    imul r8,  rbx
  72 //
  73 // Some instruction even make the above static renaming impossible because
  74 // they implicitly read and write from the same operand, e.g. ADC16rr reads
  75 // and writes from EFLAGS.
  76 // In that case we just use a greedy register assignment and hope for the
  77 // best.
  78
  79 namespace llvm {
  80 namespace exegesis {
  81
  82 static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
  83   SmallVector<const Variable *, 8> Result;
  84   for (const auto &Var : Instr.Variables)
  85     if (Var.hasTiedOperands())
  86       return true;
  87   return false;
  88 }
  89
  90 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
  91
  92 void ParallelSnippetGenerator::instantiateMemoryOperands(
  93     const unsigned ScratchSpacePointerInReg,
  94     std::vector<InstructionTemplate> &Instructions) const {
  95   if (ScratchSpacePointerInReg == 0)
  96     return; // no memory operands.
  97   const auto &ET = State.getExegesisTarget();
  98   const unsigned MemStep = ET.getMaxMemoryAccessSize();
  99   const size_t OriginalInstructionsSize = Instructions.size();
 100   size_t I = 0;
 101   for (InstructionTemplate &IT : Instructions) {
 102     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
 103     ++I;
 104   }
 105
 106   while (Instructions.size() < kMinNumDifferentAddresses) {
 107     InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
 108     ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
 109     ++I;
 110     Instructions.push_back(std::move(IT));
 111   }
 112   assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
 113          "not enough scratch space");
 114 }
 115
 116 enum class RegRandomizationStrategy : uint8_t {
 117   PickRandomRegs,
 118   SingleStaticRegPerOperand,
 119   SingleStaticReg,
 120
 121   FIRST = PickRandomRegs,
 122   LAST = SingleStaticReg,
 123 };
 124
 125 } // namespace exegesis
 126
 127 template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
 128   static constexpr bool is_iterable = true;
 129 };
 130
 131 namespace exegesis {
 132
 133 const char *getDescription(RegRandomizationStrategy S) {
 134   switch (S) {
 135   case RegRandomizationStrategy::PickRandomRegs:
 136     return "randomizing registers";
 137   case RegRandomizationStrategy::SingleStaticRegPerOperand:
 138     return "one unique register for each position";
 139   case RegRandomizationStrategy::SingleStaticReg:
 140     return "reusing the same register for all positions";
 141   }
 142   llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
 143 }
 144
 145 static std::variant<std::nullopt_t, MCOperand, Register>
 146 generateSingleRegisterForInstrAvoidingDefUseOverlap(
 147     const LLVMState &State, const BitVector &ForbiddenRegisters,
 148     const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
 149     const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
 150     const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
 151     RegRandomizationStrategy S) {
 152   const Instruction &Instr = IT.getInstr();
 153   assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
 154          !IT.getValueFor(Op).isValid());
 155   assert((!Op.isUse() || !Op.isTied()) &&
 156          "Not expecting to see a tied use reg");
 157
 158   if (Op.isUse()) {
 159     switch (S) {
 160     case RegRandomizationStrategy::PickRandomRegs:
 161       break;
 162     case RegRandomizationStrategy::SingleStaticReg:
 163     case RegRandomizationStrategy::SingleStaticRegPerOperand: {
 164       if (!Instructions.empty())
 165         return Instructions.front().getValueFor(Op);
 166       if (S != RegRandomizationStrategy::SingleStaticReg)
 167         break;
 168       BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
 169       const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
 170       if (std::optional<int> CommonBit =
 171               getFirstCommonBit(PossibleRegisters, UseAliases))
 172         return *CommonBit;
 173       break;
 174     }
 175     }
 176   }
 177
 178   BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
 179   remove(PossibleRegisters, ForbiddenRegisters);
 180
 181   if (Op.isDef()) {
 182     remove(PossibleRegisters, ImplicitUseAliases);
 183     const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
 184     remove(PossibleRegisters, UseAliases);
 185   }
 186
 187   if (Op.isUse()) {
 188     remove(PossibleRegisters, ImplicitDefAliases);
 189     // NOTE: in general, using same reg for multiple Use's is fine.
 190     if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
 191       const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
 192       remove(PossibleRegisters, UseAliases);
 193     }
 194   }
 195
 196   bool IsDefWithTiedUse =
 197       Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
 198   if (Op.isUse() || IsDefWithTiedUse) {
 199     // Now, important bit: if we have used some register for def,
 200     // then we can not use that same register for *any* use,
 201     // be it either an untied use, or an use tied to a def.
 202     // But def-ing same regs is fine, as long as there are no uses!
 203     const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
 204     remove(PossibleRegisters, DefsAliases);
 205   }
 206
 207   if (!PossibleRegisters.any())
 208     return std::nullopt;
 209
 210   return randomBit(PossibleRegisters);
 211 }
 212
 213 static std::optional<InstructionTemplate>
 214 generateSingleSnippetForInstrAvoidingDefUseOverlap(
 215     const LLVMState &State, const BitVector &ForbiddenRegisters,
 216     const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
 217     BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
 218     const ArrayRef<InstructionTemplate> Instructions,
 219     RegRandomizationStrategy S) {
 220   const Instruction &Instr = IT.getInstr();
 221   for (const Operand &Op : Instr.Operands) {
 222     if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
 223         IT.getValueFor(Op).isValid())
 224       continue;
 225     assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
 226
 227     std::variant<std::nullopt_t, MCOperand, Register> R =
 228         generateSingleRegisterForInstrAvoidingDefUseOverlap(
 229             State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
 230             Uses, Defs, IT, Op, Instructions, S);
 231
 232     if (std::holds_alternative<std::nullopt_t>(R))
 233       return {};
 234
 235     MCOperand MCOp;
 236     if (std::holds_alternative<MCOperand>(R))
 237       MCOp = std::get<MCOperand>(R);
 238     else {
 239       Register RandomReg = std::get<Register>(R);
 240       if (Op.isDef())
 241         Defs.set(RandomReg);
 242       if (Op.isUse())
 243         Uses.set(RandomReg);
 244       MCOp = MCOperand::createReg(RandomReg);
 245     }
 246     IT.getValueFor(Op) = MCOp;
 247   }
 248   return IT;
 249 }
 250
 251 static std::vector<InstructionTemplate>
 252 generateSnippetForInstrAvoidingDefUseOverlap(
 253     const LLVMState &State, const InstructionTemplate &IT,
 254     RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
 255   // We don't want to accidentally serialize the instruction,
 256   // so we must be sure that we don't pick a def that is an implicit use,
 257   // or a use that is an implicit def, so record implicit regs now.
 258   BitVector ImplicitUses(State.getRegInfo().getNumRegs());
 259   BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
 260   for (const auto &Op : IT.getInstr().Operands) {
 261     if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
 262       assert(Op.isImplicitReg() && "Not an implicit register operand?");
 263       if (Op.isUse())
 264         ImplicitUses.set(Op.getImplicitReg());
 265       else {
 266         assert(Op.isDef() && "Not a use and not a def?");
 267         ImplicitDefs.set(Op.getImplicitReg());
 268       }
 269     }
 270   }
 271   const BitVector ImplicitUseAliases =
 272       getAliasedBits(State.getRegInfo(), ImplicitUses);
 273   const BitVector ImplicitDefAliases =
 274       getAliasedBits(State.getRegInfo(), ImplicitDefs);
 275
 276   BitVector Defs(State.getRegInfo().getNumRegs());
 277   BitVector Uses(State.getRegInfo().getNumRegs());
 278   std::vector<InstructionTemplate> Instructions;
 279
 280   while (true) {
 281     std::optional<InstructionTemplate> TmpIT =
 282         generateSingleSnippetForInstrAvoidingDefUseOverlap(
 283             State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
 284             Uses, Defs, IT, Instructions, S);
 285     if (!TmpIT)
 286       return Instructions;
 287     Instructions.push_back(std::move(*TmpIT));
 288     if (!hasVariablesWithTiedOperands(IT.getInstr()))
 289       return Instructions;
 290     assert(Instructions.size() <= 128 && "Stuck in endless loop?");
 291   }
 292 }
 293
 294 Expected<std::vector<CodeTemplate>>
 295 ParallelSnippetGenerator::generateCodeTemplates(
 296     InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
 297   const Instruction &Instr = Variant.getInstr();
 298   CodeTemplate CT;
 299   CT.ScratchSpacePointerInReg =
 300       Instr.hasMemoryOperands()
 301           ? State.getExegesisTarget().getScratchMemoryRegister(
 302                 State.getTargetMachine().getTargetTriple())
 303           : 0;
 304   const AliasingConfigurations SelfAliasing(Instr, Instr, ForbiddenRegisters);
 305   if (SelfAliasing.empty()) {
 306     CT.Info = "instruction is parallel, repeating a random one.";
 307     CT.Instructions.push_back(std::move(Variant));
 308     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
 309     return getSingleton(std::move(CT));
 310   }
 311   if (SelfAliasing.hasImplicitAliasing()) {
 312     CT.Info = "instruction is serial, repeating a random one.";
 313     CT.Instructions.push_back(std::move(Variant));
 314     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
 315     return getSingleton(std::move(CT));
 316   }
 317   std::vector<CodeTemplate> Result;
 318   bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
 319   // If there are no tied operands, then we don't want to "saturate backedge",
 320   // and the template we will produce will have only a single instruction.
 321   unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
 322     return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
 323            !Op.isTied();
 324   });
 325   SmallVector<RegRandomizationStrategy, 3> Strategies;
 326   if (HasTiedOperands || NumUntiedUseRegs >= 3)
 327     Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
 328   if (NumUntiedUseRegs >= 2)
 329     Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
 330   Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
 331   for (RegRandomizationStrategy S : Strategies) {
 332     CodeTemplate CurrCT = CT.clone();
 333     CurrCT.Info =
 334         Twine("instruction has ")
 335             .concat(HasTiedOperands ? "" : "no ")
 336             .concat("tied variables, avoiding "
 337                     "Read-After-Write issue, picking random def and use "
 338                     "registers not aliasing each other, for uses, ")
 339             .concat(getDescription(S))
 340             .str();
 341     CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
 342         State, Variant, S, ForbiddenRegisters);
 343     if (CurrCT.Instructions.empty())
 344       return make_error<StringError>(
 345           Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
 346           inconvertibleErrorCode());
 347     instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
 348                               CurrCT.Instructions);
 349     Result.push_back(std::move(CurrCT));
 350   }
 351   return Result;
 352 }
 353
 354 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;
 355
 356 } // namespace exegesis
 357 } // namespace llvm