1 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "ParallelSnippetGenerator.h"
11 #include "BenchmarkRunner.h"
12 #include "MCInstrDescView.h"
15 // FIXME: Load constants into registers (e.g. with fld1) to not break
16 // instructions like x87.
18 // Ideally we would like the only limitation on executing instructions to be the
19 // availability of the CPU resources (e.g. execution ports) needed to execute
20 // them, instead of the availability of their data dependencies.
22 // To achieve that, one approach is to generate instructions that do not have
23 // data dependencies between them.
25 // For some instructions, this is trivial:
26 // mov rax, qword ptr [rsi]
27 // mov rax, qword ptr [rsi]
28 // mov rax, qword ptr [rsi]
29 // mov rax, qword ptr [rsi]
30 // For the above snippet, haswell just renames rax four times and executes the
31 // four instructions two at a time on P23 and P0126.
33 // For some instructions, we just need to make sure that the source is
34 // different from the destination. For example, IDIV8r reads from GPR and
35 // writes to AX. We just need to ensure that the Var is assigned a
36 // register which is different from AX:
41 // The above snippet will be able to fully saturate the ports, while the same
42 // with ax would issue one uop every `latency(IDIV8r)` cycles.
44 // Some instructions make this harder because they both read and write from
// This has a data dependency from each instruction to the next, limiting the
51 // number of instructions that can be issued in parallel.
52 // It turns out that this is not a big issue on recent Intel CPUs because they
53 // have heuristics to balance port pressure. In the snippet above, subsequent
54 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
55 // might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
58 // This issue is even more important for high-latency instructions because
59 // they increase the idle time of the CPU, e.g. :
65 // To avoid that, we do the renaming statically by generating as many
66 // independent exclusive assignments as possible (until all possible registers
67 // are exhausted) e.g.:
// Some instructions even make the above static renaming impossible because
74 // they implicitly read and write from the same operand, e.g. ADC16rr reads
75 // and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
82 static SmallVector
<const Variable
*, 8>
83 getVariablesWithTiedOperands(const Instruction
&Instr
) {
84 SmallVector
<const Variable
*, 8> Result
;
85 for (const auto &Var
: Instr
.Variables
)
86 if (Var
.hasTiedOperands())
87 Result
.push_back(&Var
);
91 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
93 void ParallelSnippetGenerator::instantiateMemoryOperands(
94 const unsigned ScratchSpacePointerInReg
,
95 std::vector
<InstructionTemplate
> &Instructions
) const {
96 if (ScratchSpacePointerInReg
== 0)
97 return; // no memory operands.
98 const auto &ET
= State
.getExegesisTarget();
99 const unsigned MemStep
= ET
.getMaxMemoryAccessSize();
100 const size_t OriginalInstructionsSize
= Instructions
.size();
102 for (InstructionTemplate
&IT
: Instructions
) {
103 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
107 while (Instructions
.size() < kMinNumDifferentAddresses
) {
108 InstructionTemplate IT
= Instructions
[I
% OriginalInstructionsSize
];
109 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
111 Instructions
.push_back(std::move(IT
));
113 assert(I
* MemStep
< BenchmarkRunner::ScratchSpace::kSize
&&
114 "not enough scratch space");
117 static std::vector
<InstructionTemplate
> generateSnippetUsingStaticRenaming(
118 const LLVMState
&State
, const InstructionTemplate
&IT
,
119 const ArrayRef
<const Variable
*> TiedVariables
,
120 const BitVector
&ForbiddenRegisters
) {
121 std::vector
<InstructionTemplate
> Instructions
;
122 // Assign registers to variables in a round-robin manner. This is simple but
123 // ensures that the most register-constrained variable does not get starved.
124 std::vector
<BitVector
> PossibleRegsForVar
;
125 for (const Variable
*Var
: TiedVariables
) {
127 const Operand
&Op
= IT
.getInstr().getPrimaryOperand(*Var
);
129 BitVector PossibleRegs
= Op
.getRegisterAliasing().sourceBits();
130 remove(PossibleRegs
, ForbiddenRegisters
);
131 PossibleRegsForVar
.push_back(std::move(PossibleRegs
));
133 SmallVector
<int, 2> Iterators(TiedVariables
.size(), 0);
135 InstructionTemplate TmpIT
= IT
;
136 // Find a possible register for each variable in turn, marking the
137 // register as taken.
138 for (size_t VarId
= 0; VarId
< TiedVariables
.size(); ++VarId
) {
139 const int NextPossibleReg
=
140 PossibleRegsForVar
[VarId
].find_next(Iterators
[VarId
]);
141 if (NextPossibleReg
<= 0) {
144 TmpIT
.getValueFor(*TiedVariables
[VarId
]) =
145 MCOperand::createReg(NextPossibleReg
);
147 Iterators
[VarId
] = NextPossibleReg
;
148 // Prevent other variables from using the register.
149 for (BitVector
&OtherPossibleRegs
: PossibleRegsForVar
) {
150 OtherPossibleRegs
.reset(NextPossibleReg
);
153 Instructions
.push_back(std::move(TmpIT
));
157 Expected
<std::vector
<CodeTemplate
>>
158 ParallelSnippetGenerator::generateCodeTemplates(
159 InstructionTemplate Variant
, const BitVector
&ForbiddenRegisters
) const {
160 const Instruction
&Instr
= Variant
.getInstr();
162 CT
.ScratchSpacePointerInReg
=
163 Instr
.hasMemoryOperands()
164 ? State
.getExegesisTarget().getScratchMemoryRegister(
165 State
.getTargetMachine().getTargetTriple())
167 const AliasingConfigurations
SelfAliasing(Instr
, Instr
);
168 if (SelfAliasing
.empty()) {
169 CT
.Info
= "instruction is parallel, repeating a random one.";
170 CT
.Instructions
.push_back(std::move(Variant
));
171 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
172 return getSingleton(std::move(CT
));
174 if (SelfAliasing
.hasImplicitAliasing()) {
175 CT
.Info
= "instruction is serial, repeating a random one.";
176 CT
.Instructions
.push_back(std::move(Variant
));
177 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
178 return getSingleton(std::move(CT
));
180 const auto TiedVariables
= getVariablesWithTiedOperands(Instr
);
181 if (!TiedVariables
.empty()) {
182 CT
.Info
= "instruction has tied variables, using static renaming.";
183 CT
.Instructions
= generateSnippetUsingStaticRenaming(
184 State
, Variant
, TiedVariables
, ForbiddenRegisters
);
185 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
186 return getSingleton(std::move(CT
));
188 // No tied variables, we pick random values for defs.
190 // We don't want to accidentally serialize the instruction,
191 // so we must be sure that we don't pick a def that is an implicit use,
192 // or a use that is an implicit def, so record implicit regs now.
193 BitVector
ImplicitUses(State
.getRegInfo().getNumRegs());
194 BitVector
ImplicitDefs(State
.getRegInfo().getNumRegs());
195 for (const auto &Op
: Instr
.Operands
) {
196 if (Op
.isReg() && Op
.isImplicit() && !Op
.isMemory()) {
197 assert(Op
.isImplicitReg() && "Not an implicit register operand?");
199 ImplicitUses
.set(Op
.getImplicitReg());
201 assert(Op
.isDef() && "Not a use and not a def?");
202 ImplicitDefs
.set(Op
.getImplicitReg());
206 const auto ImplicitUseAliases
=
207 getAliasedBits(State
.getRegInfo(), ImplicitUses
);
208 const auto ImplicitDefAliases
=
209 getAliasedBits(State
.getRegInfo(), ImplicitDefs
);
210 BitVector
Defs(State
.getRegInfo().getNumRegs());
211 for (const auto &Op
: Instr
.Operands
) {
212 if (Op
.isReg() && Op
.isExplicit() && Op
.isDef() && !Op
.isMemory()) {
213 auto PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
214 // Do not use forbidden registers and regs that are implicitly used.
215 // Note that we don't try to avoid using implicit defs explicitly.
216 remove(PossibleRegisters
, ForbiddenRegisters
);
217 remove(PossibleRegisters
, ImplicitUseAliases
);
218 if (!PossibleRegisters
.any())
219 return make_error
<StringError
>(
220 Twine("no available registers:\ncandidates:\n")
221 .concat(debugString(State
.getRegInfo(),
222 Op
.getRegisterAliasing().sourceBits()))
223 .concat("\nforbidden:\n")
224 .concat(debugString(State
.getRegInfo(), ForbiddenRegisters
))
225 .concat("\nimplicit use:\n")
226 .concat(debugString(State
.getRegInfo(), ImplicitUseAliases
)),
227 inconvertibleErrorCode());
228 const auto RandomReg
= randomBit(PossibleRegisters
);
230 Variant
.getValueFor(Op
) = MCOperand::createReg(RandomReg
);
233 // And pick random use values that are not reserved and don't alias with defs.
234 // Note that we don't try to avoid using implicit uses explicitly.
235 const auto DefAliases
= getAliasedBits(State
.getRegInfo(), Defs
);
236 for (const auto &Op
: Instr
.Operands
) {
237 if (Op
.isReg() && Op
.isExplicit() && Op
.isUse() && !Op
.isMemory()) {
238 auto PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
239 remove(PossibleRegisters
, ForbiddenRegisters
);
240 remove(PossibleRegisters
, DefAliases
);
241 remove(PossibleRegisters
, ImplicitDefAliases
);
242 assert(PossibleRegisters
.any() && "No register left to choose from");
243 const auto RandomReg
= randomBit(PossibleRegisters
);
244 Variant
.getValueFor(Op
) = MCOperand::createReg(RandomReg
);
248 "instruction has no tied variables picking Uses different from defs";
249 CT
.Instructions
.push_back(std::move(Variant
));
250 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
251 return getSingleton(std::move(CT
));
254 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses
;
256 } // namespace exegesis