//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "PerfHelper.h"
#include "Target.h"
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
//
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
namespace exegesis {
static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

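// Clears in `a` every bit that is set in `b` (i.e. a &= ~b). Used below to
// drop reserved, scratch-aliased or def-aliased registers from a candidate
// register set.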
static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

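// Assigns a distinct scratch-space offset (I * MemStep) to the memory operands
// of every instruction template, duplicating templates if needed so that at
// least kMinNumDifferentAddresses different addresses are used.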
void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

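  // If there are not enough templates to reach kMinNumDifferentAddresses,
  // duplicate the existing ones; every copy keeps getting the next offset so
  // that consecutive accesses touch different addresses.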
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

llvm::Expected<CodeTemplate>
UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
  const Instruction Instr(State.getInstrInfo().get(Opcode), RATC);
  if (Instr.hasMemoryOperands())
    return llvm::make_error<BenchmarkFailure>(
        "Infeasible : has unknown operands");
  const auto &ET = State.getExegesisTarget();
  CodeTemplate CT;

  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return std::move(CT);
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return std::move(CT);
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    if (TiedVariables.size() > 1)
      return llvm::make_error<llvm::StringError>(
          "Infeasible : don't know how to handle several tied variables",
          llvm::inconvertibleErrorCode());
    const Variable *Var = TiedVariables.front();
    assert(Var);
    const Operand &Op = Instr.getPrimaryOperand(*Var);
    assert(Op.isReg());
    CT.Info = "instruction has tied variables using static renaming.";
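    // Static renaming: emit one copy of the instruction per register the tied
    // variable can take, so the copies are independent of each other.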
    for (const llvm::MCPhysReg Reg :
         Op.getRegisterAliasing().sourceBits().set_bits()) {
      if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
        continue; // Do not use the scratch memory address register.
      InstructionTemplate TmpIT = IT;
      TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
      CT.Instructions.push_back(std::move(TmpIT));
    }
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return std::move(CT);
  }
  // No tied variables, we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, RATC.reservedRegisters());
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, RATC.reservedRegisters());
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return std::move(CT);
}

std::vector<BenchmarkMeasure>
UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
                                     ScratchSpace &Scratch) const {
  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();

  const auto RunMeasurement = [&Function,
                               &Scratch](const char *const Counters) {
    // We sum counts when there are several counters for a single ProcRes
    // (e.g. P23 on SandyBridge).
    int64_t CounterValue = 0;
    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
    llvm::StringRef(Counters).split(CounterNames, ',');
    for (const auto &CounterName : CounterNames) {
      pfm::PerfEvent UopPerfEvent(CounterName);
      if (!UopPerfEvent.valid())
        llvm::report_fatal_error(
            llvm::Twine("invalid perf event ").concat(Counters));
      pfm::Counter Counter(UopPerfEvent);
      Scratch.clear();
      Counter.start();
      Function(Scratch.ptr());
      Counter.stop();
      CounterValue += Counter.read();
    }
    return CounterValue;
  };

  std::vector<BenchmarkMeasure> Result;
  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
  // Uops per port.
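  // Note: counters are not read simultaneously; every call to RunMeasurement
  // re-executes the snippet, once per counter name in the group.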
  for (unsigned ProcResIdx = 1;
       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
    if (!Counters)
      continue;
    const double CounterValue = RunMeasurement(Counters);
    Result.push_back(BenchmarkMeasure::Create(
        SchedModel.getProcResource(ProcResIdx)->Name, CounterValue));
  }
  // NumMicroOps.
  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
    const double CounterValue = RunMeasurement(UopsCounter);
    Result.push_back(BenchmarkMeasure::Create("NumMicroOps", CounterValue));
  }
  return Result;
}

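// Out-of-line definition: a static constexpr data member that is odr-used
// still needs a namespace-scope definition before C++17.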
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis