[llvm-exegesis] Provide a way to handle memory instructions.
[llvm-core.git] / tools / llvm-exegesis / lib / Uops.cpp
blob729b4847dbc61be08553e71f83ef0ff7f6e48745
1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
10 #include "Uops.h"
12 #include "Assembler.h"
13 #include "BenchmarkRunner.h"
14 #include "MCInstrDescView.h"
15 #include "PerfHelper.h"
16 #include "Target.h"
18 // FIXME: Load constants into registers (e.g. with fld1) to not break
19 // instructions like x87.
21 // Ideally we would like the only limitation on executing uops to be the issue
22 // ports. Maximizing port pressure increases the likelihood that the load is
23 // distributed evenly across possible ports.
25 // To achieve that, one approach is to generate instructions that do not have
26 // data dependencies between them.
28 // For some instructions, this is trivial:
29 // mov rax, qword ptr [rsi]
30 // mov rax, qword ptr [rsi]
31 // mov rax, qword ptr [rsi]
32 // mov rax, qword ptr [rsi]
33 // For the above snippet, haswell just renames rax four times and executes the
34 // four instructions two at a time on P23 and P0126.
36 // For some instructions, we just need to make sure that the source is
37 // different from the destination. For example, IDIV8r reads from GPR and
38 // writes to AX. We just need to ensure that the Var is assigned a
39 // register which is different from AX:
40 // idiv bx
41 // idiv bx
42 // idiv bx
43 // idiv bx
44 // The above snippet will be able to fully saturate the ports, while the same
45 // with ax would issue one uop every `latency(IDIV8r)` cycles.
47 // Some instructions make this harder because they both read and write from
48 // the same register:
49 // inc rax
50 // inc rax
51 // inc rax
52 // inc rax
53 // This has a data dependency from each instruction to the next, limit the
54 // number of instructions that can be issued in parallel.
55 // It turns out that this is not a big issue on recent Intel CPUs because they
56 // have heuristics to balance port pressure. In the snippet above, subsequent
57 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
58 // might end up executing them all on P0 (just because they can), or try
59 // avoiding P5 because it's usually under high pressure from vector
60 // instructions.
61 // This issue is even more important for high-latency instructions because
62 // they increase the idle time of the CPU, e.g. :
63 // imul rax, rbx
64 // imul rax, rbx
65 // imul rax, rbx
66 // imul rax, rbx
68 // To avoid that, we do the renaming statically by generating as many
69 // independent exclusive assignments as possible (until all possible registers
70 // are exhausted) e.g.:
71 // imul rax, rbx
72 // imul rcx, rbx
73 // imul rdx, rbx
74 // imul r8, rbx
76 // Some instruction even make the above static renaming impossible because
77 // they implicitly read and write from the same operand, e.g. ADC16rr reads
78 // and writes from EFLAGS.
79 // In that case we just use a greedy register assignment and hope for the
80 // best.
82 namespace exegesis {
84 static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
85 return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
88 llvm::Error
89 UopsBenchmarkRunner::isInfeasible(const llvm::MCInstrDesc &MCInstrDesc) const {
90 if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand))
91 return llvm::make_error<BenchmarkFailure>(
92 "Infeasible : has unknown operands");
93 return llvm::Error::success();
96 // Returns whether this Variable ties Use and Def operands together.
97 static bool hasTiedOperands(const Instruction &Instr, const Variable &Var) {
98 bool HasUse = false;
99 bool HasDef = false;
100 for (const unsigned OpIndex : Var.TiedOperands) {
101 const Operand &Op = Instr.Operands[OpIndex];
102 if (Op.IsDef)
103 HasDef = true;
104 else
105 HasUse = true;
107 return HasUse && HasDef;
110 static llvm::SmallVector<const Variable *, 8>
111 getTiedVariables(const Instruction &Instr) {
112 llvm::SmallVector<const Variable *, 8> Result;
113 for (const auto &Var : Instr.Variables)
114 if (hasTiedOperands(Instr, Var))
115 Result.push_back(&Var);
116 return Result;
119 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
120 assert(a.size() == b.size());
121 for (auto I : b.set_bits())
122 a.reset(I);
125 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
127 void UopsBenchmarkRunner::instantiateMemoryOperands(
128 const unsigned ScratchSpaceReg,
129 std::vector<InstructionInstance> &Snippet) const {
130 if (ScratchSpaceReg == 0)
131 return; // no memory operands.
132 const auto &ET = State.getExegesisTarget();
133 const unsigned MemStep = ET.getMaxMemoryAccessSize();
134 const size_t OriginalSnippetSize = Snippet.size();
135 size_t I = 0;
136 for (InstructionInstance &II : Snippet) {
137 ET.fillMemoryOperands(II, ScratchSpaceReg, I * MemStep);
138 ++I;
141 while (Snippet.size() < kMinNumDifferentAddresses) {
142 InstructionInstance II = Snippet[I % OriginalSnippetSize];
143 ET.fillMemoryOperands(II, ScratchSpaceReg, I * MemStep);
144 ++I;
145 Snippet.push_back(std::move(II));
147 assert(I * MemStep < ScratchSpace::kSize && "not enough scratch space");
150 llvm::Expected<SnippetPrototype>
151 UopsBenchmarkRunner::generatePrototype(unsigned Opcode) const {
152 const auto &InstrDesc = State.getInstrInfo().get(Opcode);
153 if (auto E = isInfeasible(InstrDesc))
154 return std::move(E);
155 const Instruction Instr(InstrDesc, RATC);
156 const auto &ET = State.getExegesisTarget();
157 SnippetPrototype Prototype;
159 const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
160 if (Instr.hasMemoryOperands()) {
161 Prototype.ScratchSpaceReg =
162 ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
163 if (Prototype.ScratchSpaceReg == 0)
164 return llvm::make_error<BenchmarkFailure>(
165 "Infeasible : target does not support memory instructions");
166 ScratchSpaceAliasedRegs =
167 &RATC.getRegister(Prototype.ScratchSpaceReg).aliasedBits();
168 // If the instruction implicitly writes to ScratchSpaceReg , abort.
169 // FIXME: We could make a copy of the scratch register.
170 for (const auto &Op : Instr.Operands) {
171 if (Op.IsDef && Op.ImplicitReg &&
172 ScratchSpaceAliasedRegs->test(*Op.ImplicitReg))
173 return llvm::make_error<BenchmarkFailure>(
174 "Infeasible : memory instruction uses scratch memory register");
178 const AliasingConfigurations SelfAliasing(Instr, Instr);
179 InstructionInstance II(Instr);
180 if (SelfAliasing.empty()) {
181 Prototype.Explanation = "instruction is parallel, repeating a random one.";
182 Prototype.Snippet.push_back(std::move(II));
183 instantiateMemoryOperands(Prototype.ScratchSpaceReg, Prototype.Snippet);
184 return std::move(Prototype);
186 if (SelfAliasing.hasImplicitAliasing()) {
187 Prototype.Explanation = "instruction is serial, repeating a random one.";
188 Prototype.Snippet.push_back(std::move(II));
189 instantiateMemoryOperands(Prototype.ScratchSpaceReg, Prototype.Snippet);
190 return std::move(Prototype);
192 const auto TiedVariables = getTiedVariables(Instr);
193 if (!TiedVariables.empty()) {
194 if (TiedVariables.size() > 1)
195 return llvm::make_error<llvm::StringError>(
196 "Infeasible : don't know how to handle several tied variables",
197 llvm::inconvertibleErrorCode());
198 const Variable *Var = TiedVariables.front();
199 assert(Var);
200 assert(!Var->TiedOperands.empty());
201 const Operand &Op = Instr.Operands[Var->TiedOperands.front()];
202 assert(Op.Tracker);
203 Prototype.Explanation =
204 "instruction has tied variables using static renaming.";
205 for (const llvm::MCPhysReg Reg : Op.Tracker->sourceBits().set_bits()) {
206 if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
207 continue; // Do not use the scratch memory address register.
208 InstructionInstance TmpII = II;
209 TmpII.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
210 Prototype.Snippet.push_back(std::move(TmpII));
212 instantiateMemoryOperands(Prototype.ScratchSpaceReg, Prototype.Snippet);
213 return std::move(Prototype);
215 // No tied variables, we pick random values for defs.
216 llvm::BitVector Defs(State.getRegInfo().getNumRegs());
217 for (const auto &Op : Instr.Operands) {
218 if (Op.Tracker && Op.IsExplicit && Op.IsDef && !Op.IsMem) {
219 auto PossibleRegisters = Op.Tracker->sourceBits();
220 remove(PossibleRegisters, RATC.reservedRegisters());
221 // Do not use the scratch memory address register.
222 if (ScratchSpaceAliasedRegs)
223 remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
224 assert(PossibleRegisters.any() && "No register left to choose from");
225 const auto RandomReg = randomBit(PossibleRegisters);
226 Defs.set(RandomReg);
227 II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
230 // And pick random use values that are not reserved and don't alias with defs.
231 const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
232 for (const auto &Op : Instr.Operands) {
233 if (Op.Tracker && Op.IsExplicit && !Op.IsDef && !Op.IsMem) {
234 auto PossibleRegisters = Op.Tracker->sourceBits();
235 remove(PossibleRegisters, RATC.reservedRegisters());
236 // Do not use the scratch memory address register.
237 if (ScratchSpaceAliasedRegs)
238 remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
239 remove(PossibleRegisters, DefAliases);
240 assert(PossibleRegisters.any() && "No register left to choose from");
241 const auto RandomReg = randomBit(PossibleRegisters);
242 II.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
245 Prototype.Explanation =
246 "instruction has no tied variables picking Uses different from defs";
247 Prototype.Snippet.push_back(std::move(II));
248 instantiateMemoryOperands(Prototype.ScratchSpaceReg, Prototype.Snippet);
249 return std::move(Prototype);
252 std::vector<BenchmarkMeasure>
253 UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
254 ScratchSpace &Scratch,
255 const unsigned NumRepetitions) const {
256 const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
258 std::vector<BenchmarkMeasure> Result;
259 for (unsigned ProcResIdx = 1;
260 ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
261 const char *const PfmCounters = SchedModel.getExtraProcessorInfo()
262 .PfmCounters.IssueCounters[ProcResIdx];
263 if (!PfmCounters)
264 continue;
265 // We sum counts when there are several counters for a single ProcRes
266 // (e.g. P23 on SandyBridge).
267 int64_t CounterValue = 0;
268 llvm::SmallVector<llvm::StringRef, 2> CounterNames;
269 llvm::StringRef(PfmCounters).split(CounterNames, ',');
270 for (const auto &CounterName : CounterNames) {
271 pfm::PerfEvent UopPerfEvent(CounterName);
272 if (!UopPerfEvent.valid())
273 llvm::report_fatal_error(
274 llvm::Twine("invalid perf event ").concat(PfmCounters));
275 pfm::Counter Counter(UopPerfEvent);
276 Scratch.clear();
277 Counter.start();
278 Function(Scratch.ptr());
279 Counter.stop();
280 CounterValue += Counter.read();
282 Result.push_back({llvm::itostr(ProcResIdx),
283 static_cast<double>(CounterValue) / NumRepetitions,
284 SchedModel.getProcResource(ProcResIdx)->Name});
286 return Result;
289 constexpr const size_t UopsBenchmarkRunner::kMinNumDifferentAddresses;
291 } // namespace exegesis