//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) so as not to break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
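// (To make the cost concrete, with hypothetical numbers: if latency(IDIV8r)
// were 25 cycles, the serialized version would retire one division every ~25
// cycles instead of one per issue opportunity -- roughly a 25x throughput
// loss.)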
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rdx
//    imul rsi, rdi
//    imul r8, r9
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
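//
// For illustration (registers chosen arbitrarily): even with distinct GPRs,
//    adc ax, bx
//    adc cx, dx
// each adc still reads and writes EFLAGS, so the chain stays serial no
// matter which registers are assigned.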

namespace llvm {
namespace exegesis {

static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}
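
// A usage sketch for remove(): given candidate and reserved register sets,
// remove(PossibleRegisters, ReservedRegisters) (as done in
// generateCodeTemplates below) clears every reserved register from the
// candidate set.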

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}
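
// Worked example for instantiateMemoryOperands (hypothetical numbers): given
// 2 original instructions, MemStep == 64 and kMinNumDifferentAddresses == 6,
// the snippet grows to 6 instructions touching scratch offsets 0, 64, 128,
// 192, 256 and 320, so no two accesses hit the same address.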

static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector *ScratchSpaceAliasedRegs) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
    assert(Op.isReg());
    BitVector PossibleRegs = State.getRATC().emptyRegisters();
    if (ScratchSpaceAliasedRegs) {
      PossibleRegs |= *ScratchSpaceAliasedRegs;
    }
    // PossibleRegs currently holds the registers to exclude; flip to get the
    // candidates, then keep only registers this operand can actually encode.
    PossibleRegs.flip();
    PossibleRegs &= Op.getRegisterAliasing().sourceBits();
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      if (NextPossibleReg <= 0) {
        // Out of registers for this variable: we are done generating
        // independent instances.
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          llvm::MCOperand::createReg(NextPossibleReg);
      // Bump the iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}

llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, IT, TiedVariables, ScratchSpaceAliasedRegs);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables; we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables, picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}

llvm::Expected<std::vector<BenchmarkMeasure>>
UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
  std::vector<BenchmarkMeasure> Result;
  const PfmCountersInfo &PCI = State.getPfmCounters();
  // Uops per port.
  for (const auto *IssueCounter = PCI.IssueCounters,
                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
       IssueCounter != IssueCounterEnd; ++IssueCounter) {
    if (!IssueCounter->Counter)
      continue;
    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
                                              *ExpectedCounterValue));
  }
  // NumMicroOps.
  if (const char *const UopsCounter = PCI.UopsCounter) {
    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(
        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
  }
  return std::move(Result);
}
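
// The vector returned above holds one BenchmarkMeasure per issue-port
// counter, keyed by its ProcRes name (e.g. a hypothetical "HWPort0"), plus
// an aggregate "NumMicroOps" entry when the target defines a uops counter.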

constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm