1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 #include "Assembler.h"
12 #include "BenchmarkRunner.h"
13 #include "MCInstrDescView.h"
16 // FIXME: Load constants into registers (e.g. with fld1) to not break
17 // instructions like x87.
19 // Ideally we would like the only limitation on executing uops to be the issue
20 // ports. Maximizing port pressure increases the likelihood that the load is
21 // distributed evenly across possible ports.
23 // To achieve that, one approach is to generate instructions that do not have
24 // data dependencies between them.
26 // For some instructions, this is trivial:
27 // mov rax, qword ptr [rsi]
28 // mov rax, qword ptr [rsi]
29 // mov rax, qword ptr [rsi]
30 // mov rax, qword ptr [rsi]
31 // For the above snippet, haswell just renames rax four times and executes the
32 // four instructions two at a time on P23 and P0126.
34 // For some instructions, we just need to make sure that the source is
35 // different from the destination. For example, IDIV8r reads from GPR and
36 // writes to AX. We just need to ensure that the Var is assigned a
37 // register which is different from AX:
42 // The above snippet will be able to fully saturate the ports, while the same
43 // with ax would issue one uop every `latency(IDIV8r)` cycles.
45 // Some instructions make this harder because they both read and write from
51 // This has a data dependency from each instruction to the next, limiting the
52 // number of instructions that can be issued in parallel.
53 // It turns out that this is not a big issue on recent Intel CPUs because they
54 // have heuristics to balance port pressure. In the snippet above, subsequent
55 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
56 // might end up executing them all on P0 (just because they can), or try
57 // avoiding P5 because it's usually under high pressure from vector
59 // This issue is even more important for high-latency instructions because
60 // they increase the idle time of the CPU, e.g. :
66 // To avoid that, we do the renaming statically by generating as many
67 // independent exclusive assignments as possible (until all possible registers
68 // are exhausted) e.g.:
74 // Some instructions even make the above static renaming impossible because
75 // they implicitly read and write from the same operand, e.g. ADC16rr reads
76 // and writes from EFLAGS.
77 // In that case we just use a greedy register assignment and hope for the
83 static SmallVector
<const Variable
*, 8>
84 getVariablesWithTiedOperands(const Instruction
&Instr
) {
85 SmallVector
<const Variable
*, 8> Result
;
86 for (const auto &Var
: Instr
.Variables
)
87 if (Var
.hasTiedOperands())
88 Result
.push_back(&Var
);
// Out-of-line defaulted destructor (keeps the destructor's definition in this
// translation unit).
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
// Out-of-line defaulted destructor (keeps the destructor's definition in this
// translation unit).
UopsSnippetGenerator::~UopsSnippetGenerator() = default;
96 void UopsSnippetGenerator::instantiateMemoryOperands(
97 const unsigned ScratchSpacePointerInReg
,
98 std::vector
<InstructionTemplate
> &Instructions
) const {
99 if (ScratchSpacePointerInReg
== 0)
100 return; // no memory operands.
101 const auto &ET
= State
.getExegesisTarget();
102 const unsigned MemStep
= ET
.getMaxMemoryAccessSize();
103 const size_t OriginalInstructionsSize
= Instructions
.size();
105 for (InstructionTemplate
&IT
: Instructions
) {
106 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
110 while (Instructions
.size() < kMinNumDifferentAddresses
) {
111 InstructionTemplate IT
= Instructions
[I
% OriginalInstructionsSize
];
112 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
114 Instructions
.push_back(std::move(IT
));
116 assert(I
* MemStep
< BenchmarkRunner::ScratchSpace::kSize
&&
117 "not enough scratch space");
120 static std::vector
<InstructionTemplate
> generateSnippetUsingStaticRenaming(
121 const LLVMState
&State
, const InstructionTemplate
&IT
,
122 const ArrayRef
<const Variable
*> TiedVariables
,
123 const BitVector
&ForbiddenRegisters
) {
124 std::vector
<InstructionTemplate
> Instructions
;
125 // Assign registers to variables in a round-robin manner. This is simple but
126 // ensures that the most register-constrained variable does not get starved.
127 std::vector
<BitVector
> PossibleRegsForVar
;
128 for (const Variable
*Var
: TiedVariables
) {
130 const Operand
&Op
= IT
.Instr
.getPrimaryOperand(*Var
);
132 BitVector PossibleRegs
= Op
.getRegisterAliasing().sourceBits();
133 remove(PossibleRegs
, ForbiddenRegisters
);
134 PossibleRegsForVar
.push_back(std::move(PossibleRegs
));
136 SmallVector
<int, 2> Iterators(TiedVariables
.size(), 0);
138 InstructionTemplate TmpIT
= IT
;
139 // Find a possible register for each variable in turn, marking the
140 // register as taken.
141 for (size_t VarId
= 0; VarId
< TiedVariables
.size(); ++VarId
) {
142 const int NextPossibleReg
=
143 PossibleRegsForVar
[VarId
].find_next(Iterators
[VarId
]);
144 if (NextPossibleReg
<= 0) {
147 TmpIT
.getValueFor(*TiedVariables
[VarId
]) =
148 MCOperand::createReg(NextPossibleReg
);
150 Iterators
[VarId
] = NextPossibleReg
;
151 // Prevent other variables from using the register.
152 for (BitVector
&OtherPossibleRegs
: PossibleRegsForVar
) {
153 OtherPossibleRegs
.reset(NextPossibleReg
);
156 Instructions
.push_back(std::move(TmpIT
));
160 Expected
<std::vector
<CodeTemplate
>> UopsSnippetGenerator::generateCodeTemplates(
161 const Instruction
&Instr
, const BitVector
&ForbiddenRegisters
) const {
163 CT
.ScratchSpacePointerInReg
=
164 Instr
.hasMemoryOperands()
165 ? State
.getExegesisTarget().getScratchMemoryRegister(
166 State
.getTargetMachine().getTargetTriple())
168 const AliasingConfigurations
SelfAliasing(Instr
, Instr
);
169 InstructionTemplate
IT(Instr
);
170 if (SelfAliasing
.empty()) {
171 CT
.Info
= "instruction is parallel, repeating a random one.";
172 CT
.Instructions
.push_back(std::move(IT
));
173 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
174 return getSingleton(std::move(CT
));
176 if (SelfAliasing
.hasImplicitAliasing()) {
177 CT
.Info
= "instruction is serial, repeating a random one.";
178 CT
.Instructions
.push_back(std::move(IT
));
179 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
180 return getSingleton(std::move(CT
));
182 const auto TiedVariables
= getVariablesWithTiedOperands(Instr
);
183 if (!TiedVariables
.empty()) {
184 CT
.Info
= "instruction has tied variables, using static renaming.";
185 CT
.Instructions
= generateSnippetUsingStaticRenaming(
186 State
, IT
, TiedVariables
, ForbiddenRegisters
);
187 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
188 return getSingleton(std::move(CT
));
190 // No tied variables, we pick random values for defs.
191 BitVector
Defs(State
.getRegInfo().getNumRegs());
192 for (const auto &Op
: Instr
.Operands
) {
193 if (Op
.isReg() && Op
.isExplicit() && Op
.isDef() && !Op
.isMemory()) {
194 auto PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
195 // Do not use forbidden registers.
196 remove(PossibleRegisters
, ForbiddenRegisters
);
197 assert(PossibleRegisters
.any() && "No register left to choose from");
198 const auto RandomReg
= randomBit(PossibleRegisters
);
200 IT
.getValueFor(Op
) = MCOperand::createReg(RandomReg
);
203 // And pick random use values that are not reserved and don't alias with defs.
204 const auto DefAliases
= getAliasedBits(State
.getRegInfo(), Defs
);
205 for (const auto &Op
: Instr
.Operands
) {
206 if (Op
.isReg() && Op
.isExplicit() && Op
.isUse() && !Op
.isMemory()) {
207 auto PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
208 remove(PossibleRegisters
, ForbiddenRegisters
);
209 remove(PossibleRegisters
, DefAliases
);
210 assert(PossibleRegisters
.any() && "No register left to choose from");
211 const auto RandomReg
= randomBit(PossibleRegisters
);
212 IT
.getValueFor(Op
) = MCOperand::createReg(RandomReg
);
216 "instruction has no tied variables picking Uses different from defs";
217 CT
.Instructions
.push_back(std::move(IT
));
218 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
219 return getSingleton(std::move(CT
));
222 Expected
<std::vector
<BenchmarkMeasure
>>
223 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor
&Executor
) const {
224 std::vector
<BenchmarkMeasure
> Result
;
225 const PfmCountersInfo
&PCI
= State
.getPfmCounters();
227 for (const auto *IssueCounter
= PCI
.IssueCounters
,
228 *IssueCounterEnd
= PCI
.IssueCounters
+ PCI
.NumIssueCounters
;
229 IssueCounter
!= IssueCounterEnd
; ++IssueCounter
) {
230 if (!IssueCounter
->Counter
)
232 auto ExpectedCounterValue
= Executor
.runAndMeasure(IssueCounter
->Counter
);
233 if (!ExpectedCounterValue
)
234 return ExpectedCounterValue
.takeError();
235 Result
.push_back(BenchmarkMeasure::Create(IssueCounter
->ProcResName
,
236 *ExpectedCounterValue
));
239 if (const char *const UopsCounter
= PCI
.UopsCounter
) {
240 auto ExpectedCounterValue
= Executor
.runAndMeasure(UopsCounter
);
241 if (!ExpectedCounterValue
)
242 return ExpectedCounterValue
.takeError();
244 BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue
));
246 return std::move(Result
);
// Out-of-class definition of the static constexpr member, required when the
// member is ODR-used (pre-C++17 inline-variable rules).
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
251 } // namespace exegesis