1 //===-- ParallelSnippetGenerator.cpp ----------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "ParallelSnippetGenerator.h"
11 #include "BenchmarkRunner.h"
12 #include "MCInstrDescView.h"
15 // FIXME: Load constants into registers (e.g. with fld1) to not break
16 // instructions like x87.
18 // Ideally we would like the only limitation on executing instructions to be the
19 // availability of the CPU resources (e.g. execution ports) needed to execute
20 // them, instead of the availability of their data dependencies.
22 // To achieve that, one approach is to generate instructions that do not have
23 // data dependencies between them.
25 // For some instructions, this is trivial:
26 // mov rax, qword ptr [rsi]
27 // mov rax, qword ptr [rsi]
28 // mov rax, qword ptr [rsi]
29 // mov rax, qword ptr [rsi]
30 // For the above snippet, haswell just renames rax four times and executes the
31 // four instructions two at a time on P23 and P0126.
33 // For some instructions, we just need to make sure that the source is
34 // different from the destination. For example, IDIV8r reads from GPR and
35 // writes to AX. We just need to ensure that the Var is assigned a
36 // register which is different from AX:
41 // The above snippet will be able to fully saturate the ports, while the same
42 // with ax would issue one uop every `latency(IDIV8r)` cycles.
44 // Some instructions make this harder because they both read and write from
50 // This has a data dependency from each instruction to the next, limit the
51 // number of instructions that can be issued in parallel.
52 // It turns out that this is not a big issue on recent Intel CPUs because they
53 // have heuristics to balance port pressure. In the snippet above, subsequent
54 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
55 // might end up executing them all on P0 (just because they can), or try
56 // avoiding P5 because it's usually under high pressure from vector
58 // This issue is even more important for high-latency instructions because
59 // they increase the idle time of the CPU, e.g. :
65 // To avoid that, we do the renaming statically by generating as many
66 // independent exclusive assignments as possible (until all possible registers
67 // are exhausted) e.g.:
73 // Some instruction even make the above static renaming impossible because
74 // they implicitly read and write from the same operand, e.g. ADC16rr reads
75 // and writes from EFLAGS.
76 // In that case we just use a greedy register assignment and hope for the
82 static bool hasVariablesWithTiedOperands(const Instruction
&Instr
) {
83 SmallVector
<const Variable
*, 8> Result
;
84 for (const auto &Var
: Instr
.Variables
)
85 if (Var
.hasTiedOperands())
90 ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
92 void ParallelSnippetGenerator::instantiateMemoryOperands(
93 const unsigned ScratchSpacePointerInReg
,
94 std::vector
<InstructionTemplate
> &Instructions
) const {
95 if (ScratchSpacePointerInReg
== 0)
96 return; // no memory operands.
97 const auto &ET
= State
.getExegesisTarget();
98 const unsigned MemStep
= ET
.getMaxMemoryAccessSize();
99 const size_t OriginalInstructionsSize
= Instructions
.size();
101 for (InstructionTemplate
&IT
: Instructions
) {
102 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
106 while (Instructions
.size() < kMinNumDifferentAddresses
) {
107 InstructionTemplate IT
= Instructions
[I
% OriginalInstructionsSize
];
108 ET
.fillMemoryOperands(IT
, ScratchSpacePointerInReg
, I
* MemStep
);
110 Instructions
.push_back(std::move(IT
));
112 assert(I
* MemStep
< BenchmarkRunner::ScratchSpace::kSize
&&
113 "not enough scratch space");
116 enum class RegRandomizationStrategy
: uint8_t {
118 SingleStaticRegPerOperand
,
121 FIRST
= PickRandomRegs
,
122 LAST
= SingleStaticReg
,
125 } // namespace exegesis
127 template <> struct enum_iteration_traits
<exegesis::RegRandomizationStrategy
> {
128 static constexpr bool is_iterable
= true;
133 const char *getDescription(RegRandomizationStrategy S
) {
135 case RegRandomizationStrategy::PickRandomRegs
:
136 return "randomizing registers";
137 case RegRandomizationStrategy::SingleStaticRegPerOperand
:
138 return "one unique register for each position";
139 case RegRandomizationStrategy::SingleStaticReg
:
140 return "reusing the same register for all positions";
142 llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
145 static std::variant
<std::nullopt_t
, MCOperand
, Register
>
146 generateSingleRegisterForInstrAvoidingDefUseOverlap(
147 const LLVMState
&State
, const BitVector
&ForbiddenRegisters
,
148 const BitVector
&ImplicitUseAliases
, const BitVector
&ImplicitDefAliases
,
149 const BitVector
&Uses
, const BitVector
&Defs
, const InstructionTemplate
&IT
,
150 const Operand
&Op
, const ArrayRef
<InstructionTemplate
> Instructions
,
151 RegRandomizationStrategy S
) {
152 const Instruction
&Instr
= IT
.getInstr();
153 assert(Op
.isReg() && Op
.isExplicit() && !Op
.isMemory() &&
154 !IT
.getValueFor(Op
).isValid());
155 assert((!Op
.isUse() || !Op
.isTied()) &&
156 "Not expecting to see a tied use reg");
160 case RegRandomizationStrategy::PickRandomRegs
:
162 case RegRandomizationStrategy::SingleStaticReg
:
163 case RegRandomizationStrategy::SingleStaticRegPerOperand
: {
164 if (!Instructions
.empty())
165 return Instructions
.front().getValueFor(Op
);
166 if (S
!= RegRandomizationStrategy::SingleStaticReg
)
168 BitVector PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
169 const BitVector UseAliases
= getAliasedBits(State
.getRegInfo(), Uses
);
170 if (std::optional
<int> CommonBit
=
171 getFirstCommonBit(PossibleRegisters
, UseAliases
))
178 BitVector PossibleRegisters
= Op
.getRegisterAliasing().sourceBits();
179 remove(PossibleRegisters
, ForbiddenRegisters
);
182 remove(PossibleRegisters
, ImplicitUseAliases
);
183 const BitVector UseAliases
= getAliasedBits(State
.getRegInfo(), Uses
);
184 remove(PossibleRegisters
, UseAliases
);
188 remove(PossibleRegisters
, ImplicitDefAliases
);
189 // NOTE: in general, using same reg for multiple Use's is fine.
190 if (S
== RegRandomizationStrategy::SingleStaticRegPerOperand
) {
191 const BitVector UseAliases
= getAliasedBits(State
.getRegInfo(), Uses
);
192 remove(PossibleRegisters
, UseAliases
);
196 bool IsDefWithTiedUse
=
197 Instr
.Variables
[Op
.getVariableIndex()].hasTiedOperands();
198 if (Op
.isUse() || IsDefWithTiedUse
) {
199 // Now, important bit: if we have used some register for def,
200 // then we can not use that same register for *any* use,
201 // be it either an untied use, or an use tied to a def.
202 // But def-ing same regs is fine, as long as there are no uses!
203 const BitVector DefsAliases
= getAliasedBits(State
.getRegInfo(), Defs
);
204 remove(PossibleRegisters
, DefsAliases
);
207 if (!PossibleRegisters
.any())
210 return randomBit(PossibleRegisters
);
213 static std::optional
<InstructionTemplate
>
214 generateSingleSnippetForInstrAvoidingDefUseOverlap(
215 const LLVMState
&State
, const BitVector
&ForbiddenRegisters
,
216 const BitVector
&ImplicitUseAliases
, const BitVector
&ImplicitDefAliases
,
217 BitVector
&Uses
, BitVector
&Defs
, InstructionTemplate IT
,
218 const ArrayRef
<InstructionTemplate
> Instructions
,
219 RegRandomizationStrategy S
) {
220 const Instruction
&Instr
= IT
.getInstr();
221 for (const Operand
&Op
: Instr
.Operands
) {
222 if (!Op
.isReg() || !Op
.isExplicit() || Op
.isMemory() ||
223 IT
.getValueFor(Op
).isValid())
225 assert((!Op
.isUse() || !Op
.isTied()) && "Will not get tied uses.");
227 std::variant
<std::nullopt_t
, MCOperand
, Register
> R
=
228 generateSingleRegisterForInstrAvoidingDefUseOverlap(
229 State
, ForbiddenRegisters
, ImplicitUseAliases
, ImplicitDefAliases
,
230 Uses
, Defs
, IT
, Op
, Instructions
, S
);
232 if (std::holds_alternative
<std::nullopt_t
>(R
))
236 if (std::holds_alternative
<MCOperand
>(R
))
237 MCOp
= std::get
<MCOperand
>(R
);
239 Register RandomReg
= std::get
<Register
>(R
);
244 MCOp
= MCOperand::createReg(RandomReg
);
246 IT
.getValueFor(Op
) = MCOp
;
251 static std::vector
<InstructionTemplate
>
252 generateSnippetForInstrAvoidingDefUseOverlap(
253 const LLVMState
&State
, const InstructionTemplate
&IT
,
254 RegRandomizationStrategy S
, const BitVector
&ForbiddenRegisters
) {
255 // We don't want to accidentally serialize the instruction,
256 // so we must be sure that we don't pick a def that is an implicit use,
257 // or a use that is an implicit def, so record implicit regs now.
258 BitVector
ImplicitUses(State
.getRegInfo().getNumRegs());
259 BitVector
ImplicitDefs(State
.getRegInfo().getNumRegs());
260 for (const auto &Op
: IT
.getInstr().Operands
) {
261 if (Op
.isReg() && Op
.isImplicit() && !Op
.isMemory()) {
262 assert(Op
.isImplicitReg() && "Not an implicit register operand?");
264 ImplicitUses
.set(Op
.getImplicitReg());
266 assert(Op
.isDef() && "Not a use and not a def?");
267 ImplicitDefs
.set(Op
.getImplicitReg());
271 const BitVector ImplicitUseAliases
=
272 getAliasedBits(State
.getRegInfo(), ImplicitUses
);
273 const BitVector ImplicitDefAliases
=
274 getAliasedBits(State
.getRegInfo(), ImplicitDefs
);
276 BitVector
Defs(State
.getRegInfo().getNumRegs());
277 BitVector
Uses(State
.getRegInfo().getNumRegs());
278 std::vector
<InstructionTemplate
> Instructions
;
281 std::optional
<InstructionTemplate
> TmpIT
=
282 generateSingleSnippetForInstrAvoidingDefUseOverlap(
283 State
, ForbiddenRegisters
, ImplicitUseAliases
, ImplicitDefAliases
,
284 Uses
, Defs
, IT
, Instructions
, S
);
287 Instructions
.push_back(std::move(*TmpIT
));
288 if (!hasVariablesWithTiedOperands(IT
.getInstr()))
290 assert(Instructions
.size() <= 128 && "Stuck in endless loop?");
294 Expected
<std::vector
<CodeTemplate
>>
295 ParallelSnippetGenerator::generateCodeTemplates(
296 InstructionTemplate Variant
, const BitVector
&ForbiddenRegisters
) const {
297 const Instruction
&Instr
= Variant
.getInstr();
299 CT
.ScratchSpacePointerInReg
=
300 Instr
.hasMemoryOperands()
301 ? State
.getExegesisTarget().getScratchMemoryRegister(
302 State
.getTargetMachine().getTargetTriple())
304 const AliasingConfigurations
SelfAliasing(Instr
, Instr
, ForbiddenRegisters
);
305 if (SelfAliasing
.empty()) {
306 CT
.Info
= "instruction is parallel, repeating a random one.";
307 CT
.Instructions
.push_back(std::move(Variant
));
308 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
309 return getSingleton(std::move(CT
));
311 if (SelfAliasing
.hasImplicitAliasing()) {
312 CT
.Info
= "instruction is serial, repeating a random one.";
313 CT
.Instructions
.push_back(std::move(Variant
));
314 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
315 return getSingleton(std::move(CT
));
317 std::vector
<CodeTemplate
> Result
;
318 bool HasTiedOperands
= hasVariablesWithTiedOperands(Instr
);
319 // If there are no tied operands, then we don't want to "saturate backedge",
320 // and the template we will produce will have only a single instruction.
321 unsigned NumUntiedUseRegs
= count_if(Instr
.Operands
, [](const Operand
&Op
) {
322 return Op
.isReg() && Op
.isExplicit() && !Op
.isMemory() && Op
.isUse() &&
325 SmallVector
<RegRandomizationStrategy
, 3> Strategies
;
326 if (HasTiedOperands
|| NumUntiedUseRegs
>= 3)
327 Strategies
.push_back(RegRandomizationStrategy::PickRandomRegs
);
328 if (NumUntiedUseRegs
>= 2)
329 Strategies
.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand
);
330 Strategies
.push_back(RegRandomizationStrategy::SingleStaticReg
);
331 for (RegRandomizationStrategy S
: Strategies
) {
332 CodeTemplate CurrCT
= CT
.clone();
334 Twine("instruction has ")
335 .concat(HasTiedOperands
? "" : "no ")
336 .concat("tied variables, avoiding "
337 "Read-After-Write issue, picking random def and use "
338 "registers not aliasing each other, for uses, ")
339 .concat(getDescription(S
))
341 CurrCT
.Instructions
= generateSnippetForInstrAvoidingDefUseOverlap(
342 State
, Variant
, S
, ForbiddenRegisters
);
343 if (CurrCT
.Instructions
.empty())
344 return make_error
<StringError
>(
345 Twine("Failed to produce any snippet via: ").concat(CurrCT
.Info
),
346 inconvertibleErrorCode());
347 instantiateMemoryOperands(CurrCT
.ScratchSpacePointerInReg
,
348 CurrCT
.Instructions
);
349 Result
.push_back(std::move(CurrCT
));
354 constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses
;
356 } // namespace exegesis