// [llvm-exegesis] Reject x86 instructions that use non-uniform memory accesses.
// From llvm-core.git: tools/llvm-exegesis/lib/Uops.cpp
// (blob 50be707feb2b870a70e0ab8ca935e370c1d68c3e)
//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"
// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.
//
// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
//
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted) e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
namespace exegesis {
83 static llvm::SmallVector<const Variable *, 8>
84 getVariablesWithTiedOperands(const Instruction &Instr) {
85 llvm::SmallVector<const Variable *, 8> Result;
86 for (const auto &Var : Instr.Variables)
87 if (Var.hasTiedOperands())
88 Result.push_back(&Var);
89 return Result;
92 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
93 assert(a.size() == b.size());
94 for (auto I : b.set_bits())
95 a.reset(I);
98 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
100 UopsSnippetGenerator::~UopsSnippetGenerator() = default;
102 void UopsSnippetGenerator::instantiateMemoryOperands(
103 const unsigned ScratchSpacePointerInReg,
104 std::vector<InstructionTemplate> &Instructions) const {
105 if (ScratchSpacePointerInReg == 0)
106 return; // no memory operands.
107 const auto &ET = State.getExegesisTarget();
108 const unsigned MemStep = ET.getMaxMemoryAccessSize();
109 const size_t OriginalInstructionsSize = Instructions.size();
110 size_t I = 0;
111 for (InstructionTemplate &IT : Instructions) {
112 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
113 ++I;
116 while (Instructions.size() < kMinNumDifferentAddresses) {
117 InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
118 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
119 ++I;
120 Instructions.push_back(std::move(IT));
122 assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
123 "not enough scratch space");
126 llvm::Expected<std::vector<CodeTemplate>>
127 UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
128 CodeTemplate CT;
129 const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
130 if (Instr.hasMemoryOperands()) {
131 const auto &ET = State.getExegesisTarget();
132 CT.ScratchSpacePointerInReg =
133 ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
134 if (CT.ScratchSpacePointerInReg == 0)
135 return llvm::make_error<BenchmarkFailure>(
136 "Infeasible : target does not support memory instructions");
137 ScratchSpaceAliasedRegs =
138 &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
139 // If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
140 // FIXME: We could make a copy of the scratch register.
141 for (const auto &Op : Instr.Operands) {
142 if (Op.isDef() && Op.isImplicitReg() &&
143 ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
144 return llvm::make_error<BenchmarkFailure>(
145 "Infeasible : memory instruction uses scratch memory register");
149 const AliasingConfigurations SelfAliasing(Instr, Instr);
150 InstructionTemplate IT(Instr);
151 if (SelfAliasing.empty()) {
152 CT.Info = "instruction is parallel, repeating a random one.";
153 CT.Instructions.push_back(std::move(IT));
154 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
155 return getSingleton(std::move(CT));
157 if (SelfAliasing.hasImplicitAliasing()) {
158 CT.Info = "instruction is serial, repeating a random one.";
159 CT.Instructions.push_back(std::move(IT));
160 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
161 return getSingleton(std::move(CT));
163 const auto TiedVariables = getVariablesWithTiedOperands(Instr);
164 if (!TiedVariables.empty()) {
165 if (TiedVariables.size() > 1)
166 return llvm::make_error<llvm::StringError>(
167 "Infeasible : don't know how to handle several tied variables",
168 llvm::inconvertibleErrorCode());
169 const Variable *Var = TiedVariables.front();
170 assert(Var);
171 const Operand &Op = Instr.getPrimaryOperand(*Var);
172 assert(Op.isReg());
173 CT.Info = "instruction has tied variables using static renaming.";
174 for (const llvm::MCPhysReg Reg :
175 Op.getRegisterAliasing().sourceBits().set_bits()) {
176 if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
177 continue; // Do not use the scratch memory address register.
178 InstructionTemplate TmpIT = IT;
179 TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
180 CT.Instructions.push_back(std::move(TmpIT));
182 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
183 return getSingleton(std::move(CT));
185 const auto &ReservedRegisters = State.getRATC().reservedRegisters();
186 // No tied variables, we pick random values for defs.
187 llvm::BitVector Defs(State.getRegInfo().getNumRegs());
188 for (const auto &Op : Instr.Operands) {
189 if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
190 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
191 remove(PossibleRegisters, ReservedRegisters);
192 // Do not use the scratch memory address register.
193 if (ScratchSpaceAliasedRegs)
194 remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
195 assert(PossibleRegisters.any() && "No register left to choose from");
196 const auto RandomReg = randomBit(PossibleRegisters);
197 Defs.set(RandomReg);
198 IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
201 // And pick random use values that are not reserved and don't alias with defs.
202 const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
203 for (const auto &Op : Instr.Operands) {
204 if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
205 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
206 remove(PossibleRegisters, ReservedRegisters);
207 // Do not use the scratch memory address register.
208 if (ScratchSpaceAliasedRegs)
209 remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
210 remove(PossibleRegisters, DefAliases);
211 assert(PossibleRegisters.any() && "No register left to choose from");
212 const auto RandomReg = randomBit(PossibleRegisters);
213 IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
216 CT.Info =
217 "instruction has no tied variables picking Uses different from defs";
218 CT.Instructions.push_back(std::move(IT));
219 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
220 return getSingleton(std::move(CT));
223 llvm::Expected<std::vector<BenchmarkMeasure>>
224 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
225 const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
227 std::vector<BenchmarkMeasure> Result;
228 const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
229 // Uops per port.
230 for (unsigned ProcResIdx = 1;
231 ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
232 const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
233 if (!Counters)
234 continue;
235 auto ExpectedCounterValue = Executor.runAndMeasure(Counters);
236 if (!ExpectedCounterValue)
237 return ExpectedCounterValue.takeError();
238 Result.push_back(BenchmarkMeasure::Create(
239 SchedModel.getProcResource(ProcResIdx)->Name, *ExpectedCounterValue));
241 // NumMicroOps.
242 if (const char *const UopsCounter = PfmCounters.UopsCounter) {
243 auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
244 if (!ExpectedCounterValue)
245 return ExpectedCounterValue.takeError();
246 Result.push_back(
247 BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
249 return std::move(Result);
252 constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
} // namespace exegesis