//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "PerfHelper.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
//    mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX, e.g.:
//    idiv bl
//    idiv bl
//    idiv bl
//    idiv bl
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register, e.g.:
//    inc rax
//    inc rax
//    inc rax
//    inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//    imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//    imul rax, rbx
//    imul rcx, rbx
//    imul rdx, rbx
//    imul r8,  rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.
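//
// For illustration only (this exact snippet is not necessarily what gets
// generated): even with distinct explicit registers, every ADC reads and
// writes EFLAGS, so the chain below stays serialized through the flags:
//    adc ax, bx
//    adc cx, dx
//    adc si, di
// Greedy assignment only spreads the explicit register pressure, not the
// implicit flags dependency.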

namespace exegesis {

static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}
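
// Clears in `a` every bit that is set in `b` (i.e. a &= ~b). Used below to
// prune candidate registers.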
static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  // Duplicate the original instructions with fresh offsets until the snippet
  // touches at least kMinNumDifferentAddresses distinct addresses.
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    if (TiedVariables.size() > 1)
      return llvm::make_error<llvm::StringError>(
          "Infeasible : don't know how to handle several tied variables",
          llvm::inconvertibleErrorCode());
    const Variable *Var = TiedVariables.front();
    const Operand &Op = Instr.getPrimaryOperand(*Var);
    CT.Info = "instruction has tied variables using static renaming.";
    for (const llvm::MCPhysReg Reg :
         Op.getRegisterAliasing().sourceBits().set_bits()) {
      if (ScratchSpaceAliasedRegs && ScratchSpaceAliasedRegs->test(Reg))
        continue; // Do not use the scratch memory address register.
      InstructionTemplate TmpIT = IT;
      TmpIT.getValueFor(*Var) = llvm::MCOperand::createReg(Reg);
      CT.Instructions.push_back(std::move(TmpIT));
    }
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables: we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}
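
// To summarize, generateCodeTemplates above handles four cases:
//  - no self-aliasing: repeat a single instruction, it is fully parallel;
//  - implicit self-aliasing: repeat a single instruction, it is inherently
//    serial;
//  - exactly one tied variable: statically rename it across all allowed
//    registers;
//  - otherwise: pick random defs, then random uses that do not alias the defs.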

std::vector<BenchmarkMeasure>
UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
                                     ScratchSpace &Scratch) const {
  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();

  const auto RunMeasurement = [&Function,
                               &Scratch](const char *const Counters) {
    // We sum counts when there are several counters for a single ProcRes
    // (e.g. P23 on SandyBridge).
    int64_t CounterValue = 0;
    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
    llvm::StringRef(Counters).split(CounterNames, ',');
    for (const auto &CounterName : CounterNames) {
      pfm::PerfEvent UopPerfEvent(CounterName);
      if (!UopPerfEvent.valid())
        llvm::report_fatal_error(
            llvm::Twine("invalid perf event ").concat(Counters));
      pfm::Counter Counter(UopPerfEvent);
      Scratch.clear();
      Counter.start();
      Function(Scratch.ptr());
      Counter.stop();
      CounterValue += Counter.read();
    }
    return CounterValue;
  };
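
  // Each measurement below re-runs the benchmarked function with the relevant
  // counter(s) programmed: one value per issue-port resource, plus an overall
  // micro-op count when available.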
  std::vector<BenchmarkMeasure> Result;
  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
  // Uops per port.
  for (unsigned ProcResIdx = 1;
       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
    if (!Counters)
      continue;
    const double CounterValue = RunMeasurement(Counters);
    Result.push_back(BenchmarkMeasure::Create(
        SchedModel.getProcResource(ProcResIdx)->Name, CounterValue));
  }
  // NumMicroOps.
  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
    const double CounterValue = RunMeasurement(UopsCounter);
    Result.push_back(BenchmarkMeasure::Create("NumMicroOps", CounterValue));
  }
  return Result;
}
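
// Out-of-line definition for the odr-used constexpr static member (required
// under pre-C++17 rules).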
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis