//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.
//
// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//    idiv bx
//    idiv bx
//    idiv bx
//    idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This creates a data dependency from each instruction to the next, limiting
// the number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rdx
//    imul rdi, rsi
//    imul r8,  r10
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
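//
// A sketch of why static renaming cannot help in that case (the register
// choices below are illustrative):
//    adc ax, bx
//    adc cx, dx
//    adc si, di
// Even with disjoint explicit registers, every instruction still reads and
// writes EFLAGS, so the chain stays serialized through the implicit operand.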

namespace exegesis {

static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    if (TiedVariables.size() > 1)
      return llvm::make_error<llvm::StringError>(
          "Infeasible : don't know how to handle several tied variables",
          llvm::inconvertibleErrorCode());
    const Variable *Var = TiedVariables.front();
    const Operand &Op = Instr.getPrimaryOperand(*Var);
    CT.Info = "instruction has tied variables using static renaming.";
    for (const llvm::MCPhysReg Reg :
         Op.getRegisterAliasing().sourceBits().set_bits()) {
      if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
        continue; // Do not use the scratch memory address register.
      InstructionTemplate TmpIT = IT;
      TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
      CT.Instructions.push_back(std::move(TmpIT));
    }
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables, we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
217 "instruction has no tied variables picking Uses different from defs";
218 CT
.Instructions
.push_back(std::move(IT
));
219 instantiateMemoryOperands(CT
.ScratchSpacePointerInReg
, CT
.Instructions
);
220 return getSingleton(std::move(CT
));
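
// A hypothetical example of the untied case above (the opcode and registers
// are illustrative, not produced by this code): for BSF32rr, which reads one
// GPR and writes another without tying them, the def loop might pick EDX, and
// the use loop then picks from the registers left after removing reserved
// registers and everything aliasing EDX, e.g. ESI, yielding `bsf edx, esi`.
// Repeating that snippet carries no def-to-use dependency between iterations.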

llvm::Expected<std::vector<BenchmarkMeasure>>
UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();

  std::vector<BenchmarkMeasure> Result;
  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
  // Uops per port.
  for (unsigned ProcResIdx = 1;
       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
    if (!Counters)
      continue; // This processor resource has no pfm counter.
    auto ExpectedCounterValue = Executor.runAndMeasure(Counters);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(BenchmarkMeasure::Create(
        SchedModel.getProcResource(ProcResIdx)->Name, *ExpectedCounterValue));
  }
  // Total micro-ops.
  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(
        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
  }
  return std::move(Result);
}

constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis