[InstCombine] Signed saturation patterns
[llvm-core.git] / tools / llvm-exegesis / lib / Uops.cpp
blobdb7dce8971873f36ede50d3aff0254a9b7222525
1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "Uops.h"
11 #include "Assembler.h"
12 #include "BenchmarkRunner.h"
13 #include "MCInstrDescView.h"
14 #include "Target.h"
16 // FIXME: Load constants into registers (e.g. with fld1) to not break
17 // instructions like x87.
19 // Ideally we would like the only limitation on executing uops to be the issue
20 // ports. Maximizing port pressure increases the likelihood that the load is
21 // distributed evenly across possible ports.
23 // To achieve that, one approach is to generate instructions that do not have
24 // data dependencies between them.
26 // For some instructions, this is trivial:
27 // mov rax, qword ptr [rsi]
28 // mov rax, qword ptr [rsi]
29 // mov rax, qword ptr [rsi]
30 // mov rax, qword ptr [rsi]
31 // For the above snippet, haswell just renames rax four times and executes the
32 // four instructions two at a time on P23 and P0126.
34 // For some instructions, we just need to make sure that the source is
35 // different from the destination. For example, IDIV8r reads from GPR and
36 // writes to AX. We just need to ensure that the Var is assigned a
37 // register which is different from AX:
38 // idiv bx
39 // idiv bx
40 // idiv bx
41 // idiv bx
42 // The above snippet will be able to fully saturate the ports, while the same
43 // with ax would issue one uop every `latency(IDIV8r)` cycles.
45 // Some instructions make this harder because they both read and write from
46 // the same register:
47 // inc rax
48 // inc rax
49 // inc rax
50 // inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
53 // It turns out that this is not a big issue on recent Intel CPUs because they
54 // have heuristics to balance port pressure. In the snippet above, subsequent
55 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
56 // might end up executing them all on P0 (just because they can), or try
57 // avoiding P5 because it's usually under high pressure from vector
58 // instructions.
59 // This issue is even more important for high-latency instructions because
60 // they increase the idle time of the CPU, e.g. :
61 // imul rax, rbx
62 // imul rax, rbx
63 // imul rax, rbx
64 // imul rax, rbx
66 // To avoid that, we do the renaming statically by generating as many
67 // independent exclusive assignments as possible (until all possible registers
68 // are exhausted) e.g.:
69 // imul rax, rbx
70 // imul rcx, rbx
71 // imul rdx, rbx
72 // imul r8, rbx
// Some instructions even make the above static renaming impossible because
75 // they implicitly read and write from the same operand, e.g. ADC16rr reads
76 // and writes from EFLAGS.
77 // In that case we just use a greedy register assignment and hope for the
78 // best.
80 namespace llvm {
81 namespace exegesis {
83 static SmallVector<const Variable *, 8>
84 getVariablesWithTiedOperands(const Instruction &Instr) {
85 SmallVector<const Variable *, 8> Result;
86 for (const auto &Var : Instr.Variables)
87 if (Var.hasTiedOperands())
88 Result.push_back(&Var);
89 return Result;
// Out-of-line defaulted destructor definitions.
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;
96 void UopsSnippetGenerator::instantiateMemoryOperands(
97 const unsigned ScratchSpacePointerInReg,
98 std::vector<InstructionTemplate> &Instructions) const {
99 if (ScratchSpacePointerInReg == 0)
100 return; // no memory operands.
101 const auto &ET = State.getExegesisTarget();
102 const unsigned MemStep = ET.getMaxMemoryAccessSize();
103 const size_t OriginalInstructionsSize = Instructions.size();
104 size_t I = 0;
105 for (InstructionTemplate &IT : Instructions) {
106 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
107 ++I;
110 while (Instructions.size() < kMinNumDifferentAddresses) {
111 InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
112 ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
113 ++I;
114 Instructions.push_back(std::move(IT));
116 assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
117 "not enough scratch space");
120 static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
121 const LLVMState &State, const InstructionTemplate &IT,
122 const ArrayRef<const Variable *> TiedVariables,
123 const BitVector &ForbiddenRegisters) {
124 std::vector<InstructionTemplate> Instructions;
125 // Assign registers to variables in a round-robin manner. This is simple but
126 // ensures that the most register-constrained variable does not get starved.
127 std::vector<BitVector> PossibleRegsForVar;
128 for (const Variable *Var : TiedVariables) {
129 assert(Var);
130 const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
131 assert(Op.isReg());
132 BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
133 remove(PossibleRegs, ForbiddenRegisters);
134 PossibleRegsForVar.push_back(std::move(PossibleRegs));
136 SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
137 while (true) {
138 InstructionTemplate TmpIT = IT;
139 // Find a possible register for each variable in turn, marking the
140 // register as taken.
141 for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
142 const int NextPossibleReg =
143 PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
144 if (NextPossibleReg <= 0) {
145 return Instructions;
147 TmpIT.getValueFor(*TiedVariables[VarId]) =
148 MCOperand::createReg(NextPossibleReg);
149 // Bump iterator.
150 Iterators[VarId] = NextPossibleReg;
151 // Prevent other variables from using the register.
152 for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
153 OtherPossibleRegs.reset(NextPossibleReg);
156 Instructions.push_back(std::move(TmpIT));
160 Expected<std::vector<CodeTemplate>> UopsSnippetGenerator::generateCodeTemplates(
161 const Instruction &Instr, const BitVector &ForbiddenRegisters) const {
162 CodeTemplate CT;
163 CT.ScratchSpacePointerInReg =
164 Instr.hasMemoryOperands()
165 ? State.getExegesisTarget().getScratchMemoryRegister(
166 State.getTargetMachine().getTargetTriple())
167 : 0;
168 const AliasingConfigurations SelfAliasing(Instr, Instr);
169 InstructionTemplate IT(Instr);
170 if (SelfAliasing.empty()) {
171 CT.Info = "instruction is parallel, repeating a random one.";
172 CT.Instructions.push_back(std::move(IT));
173 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
174 return getSingleton(std::move(CT));
176 if (SelfAliasing.hasImplicitAliasing()) {
177 CT.Info = "instruction is serial, repeating a random one.";
178 CT.Instructions.push_back(std::move(IT));
179 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
180 return getSingleton(std::move(CT));
182 const auto TiedVariables = getVariablesWithTiedOperands(Instr);
183 if (!TiedVariables.empty()) {
184 CT.Info = "instruction has tied variables, using static renaming.";
185 CT.Instructions = generateSnippetUsingStaticRenaming(
186 State, IT, TiedVariables, ForbiddenRegisters);
187 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
188 return getSingleton(std::move(CT));
190 // No tied variables, we pick random values for defs.
191 BitVector Defs(State.getRegInfo().getNumRegs());
192 for (const auto &Op : Instr.Operands) {
193 if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
194 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
195 // Do not use forbidden registers.
196 remove(PossibleRegisters, ForbiddenRegisters);
197 assert(PossibleRegisters.any() && "No register left to choose from");
198 const auto RandomReg = randomBit(PossibleRegisters);
199 Defs.set(RandomReg);
200 IT.getValueFor(Op) = MCOperand::createReg(RandomReg);
203 // And pick random use values that are not reserved and don't alias with defs.
204 const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
205 for (const auto &Op : Instr.Operands) {
206 if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
207 auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
208 remove(PossibleRegisters, ForbiddenRegisters);
209 remove(PossibleRegisters, DefAliases);
210 assert(PossibleRegisters.any() && "No register left to choose from");
211 const auto RandomReg = randomBit(PossibleRegisters);
212 IT.getValueFor(Op) = MCOperand::createReg(RandomReg);
215 CT.Info =
216 "instruction has no tied variables picking Uses different from defs";
217 CT.Instructions.push_back(std::move(IT));
218 instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
219 return getSingleton(std::move(CT));
222 Expected<std::vector<BenchmarkMeasure>>
223 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
224 std::vector<BenchmarkMeasure> Result;
225 const PfmCountersInfo &PCI = State.getPfmCounters();
226 // Uops per port.
227 for (const auto *IssueCounter = PCI.IssueCounters,
228 *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
229 IssueCounter != IssueCounterEnd; ++IssueCounter) {
230 if (!IssueCounter->Counter)
231 continue;
232 auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
233 if (!ExpectedCounterValue)
234 return ExpectedCounterValue.takeError();
235 Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
236 *ExpectedCounterValue));
238 // NumMicroOps.
239 if (const char *const UopsCounter = PCI.UopsCounter) {
240 auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
241 if (!ExpectedCounterValue)
242 return ExpectedCounterValue.takeError();
243 Result.push_back(
244 BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
246 return std::move(Result);
// Out-of-line definition of the static constexpr member; required for
// ODR-uses in pre-C++17 builds (inline variables).
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
251 } // namespace exegesis
252 } // namespace llvm