//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
//
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This creates a data dependency from each instruction to the next, limiting
// the number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rdx
//    imul rdi, rsi
//    imul r8,  r10
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
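//
// A sketch of why static renaming cannot help in that case (the register
// choices below are illustrative):
//    adc ax, bx
//    adc cx, dx
//    adc si, di
// Even with disjoint explicit registers, every instruction still reads and
// writes EFLAGS, so the chain stays serialized through the implicit operand.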

namespace exegesis {

static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    if (TiedVariables.size() > 1)
      return llvm::make_error<llvm::StringError>(
          "Infeasible : don't know how to handle several tied variables",
          llvm::inconvertibleErrorCode());
    const Variable *Var = TiedVariables.front();
    const Operand &Op = Instr.getPrimaryOperand(*Var);
    CT.Info = "instruction has tied variables using static renaming.";
    for (const llvm::MCPhysReg Reg :
         Op.getRegisterAliasing().sourceBits().set_bits()) {
      if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
        continue; // Do not use the scratch memory address register.
      InstructionTemplate TmpIT = IT;
      TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
      CT.Instructions.push_back(std::move(TmpIT));
    }
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables, we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
217 "instruction has no tied variables picking Uses different from defs";
218 CT
.Instructions
.push_back(std::move(IT
));
219 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
220 return getSingleton(std::move(CT
));
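
// A hypothetical example of the untied case above (the opcode and registers
// are illustrative, not produced by this code): for BSF32rr, which reads one
// GPR and writes another without tying them, the def loop might pick EDX, and
// the use loop then picks from the registers left after removing reserved
// registers and everything aliasing EDX, e.g. ESI, yielding `bsf edx, esi`.
// Repeating that snippet carries no def-to-use dependency between iterations.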

llvm::Expected<std::vector<BenchmarkMeasure>>
UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();

  std::vector<BenchmarkMeasure> Result;
  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
  // Uops per port.
  for (unsigned ProcResIdx = 1;
       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
    if (!Counters)
      continue; // This processor resource has no pfm counter.
    auto ExpectedCounterValue = Executor.runAndMeasure(Counters);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(BenchmarkMeasure::Create(
        SchedModel.getProcResource(ProcResIdx)->Name, *ExpectedCounterValue));
  }
  // Total micro-ops.
  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(
        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
  }
  return std::move(Result);
}

constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis