1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
12 #include "Assembler.h"
13 #include "BenchmarkRunner.h"
14 #include "MCInstrDescView.h"
15 #include "PerfHelper.h"
18 // FIXME: Load constants into registers (e.g. with fld1) to not break
19 // instructions like x87.
21 // Ideally we would like the only limitation on executing uops to be the issue
22 // ports. Maximizing port pressure increases the likelihood that the load is
23 // distributed evenly across possible ports.
25 // To achieve that, one approach is to generate instructions that do not have
26 // data dependencies between them.
28 // For some instructions, this is trivial:
29 // mov rax, qword ptr [rsi]
30 // mov rax, qword ptr [rsi]
31 // mov rax, qword ptr [rsi]
32 // mov rax, qword ptr [rsi]
33 // For the above snippet, haswell just renames rax four times and executes the
34 // four instructions two at a time on P23 and P0126.
36 // For some instructions, we just need to make sure that the source is
37 // different from the destination. For example, IDIV8r reads from GPR and
38 // writes to AX. We just need to ensure that the Var is assigned a
39 // register which is different from AX:
44 // The above snippet will be able to fully saturate the ports, while the same
45 // with ax would issue one uop every `latency(IDIV8r)` cycles.
47 // Some instructions make this harder because they both read and write from
53 // This has a data dependency from each instruction to the next, limit the
54 // number of instructions that can be issued in parallel.
55 // It turns out that this is not a big issue on recent Intel CPUs because they
56 // have heuristics to balance port pressure. In the snippet above, subsequent
57 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
58 // might end up executing them all on P0 (just because they can), or try
59 // avoiding P5 because it's usually under high pressure from vector
61 // This issue is even more important for high-latency instructions because
62 // they increase the idle time of the CPU, e.g. :
68 // To avoid that, we do the renaming statically by generating as many
69 // independent exclusive assignments as possible (until all possible registers
70 // are exhausted) e.g.:
76 // Some instruction even make the above static renaming impossible because
77 // they implicitly read and write from the same operand, e.g. ADC16rr reads
78 // and writes from EFLAGS.
79 // In that case we just use a greedy register assignment and hope for the
84 static bool hasUnknownOperand(const llvm::MCOperandInfo
&OpInfo
) {
85 return OpInfo
.OperandType
== llvm::MCOI::OPERAND_UNKNOWN
;
89 UopsBenchmarkRunner::isInfeasible(const llvm::MCInstrDesc
&MCInstrDesc
) const {
90 if (llvm::any_of(MCInstrDesc
.operands(), hasUnknownOperand
))
91 return llvm::make_error
<BenchmarkFailure
>(
92 "Infeasible : has unknown operands");
93 return llvm::Error::success();
96 // Returns whether this Variable ties Use and Def operands together.
97 static bool hasTiedOperands(const Instruction
&Instr
, const Variable
&Var
) {
100 for (const unsigned OpIndex
: Var
.TiedOperands
) {
101 const Operand
&Op
= Instr
.Operands
[OpIndex
];
107 return HasUse
&& HasDef
;
110 static llvm::SmallVector
<const Variable
*, 8>
111 getTiedVariables(const Instruction
&Instr
) {
112 llvm::SmallVector
<const Variable
*, 8> Result
;
113 for (const auto &Var
: Instr
.Variables
)
114 if (hasTiedOperands(Instr
, Var
))
115 Result
.push_back(&Var
);
119 static void remove(llvm::BitVector
&a
, const llvm::BitVector
&b
) {
120 assert(a
.size() == b
.size());
121 for (auto I
: b
.set_bits())
125 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
127 void UopsBenchmarkRunner::instantiateMemoryOperands(
128 const unsigned ScratchSpaceReg
,
129 std::vector
<InstructionInstance
> &Snippet
) const {
130 if (ScratchSpaceReg
== 0)
131 return; // no memory operands.
132 const auto &ET
= State
.getExegesisTarget();
133 const unsigned MemStep
= ET
.getMaxMemoryAccessSize();
134 const size_t OriginalSnippetSize
= Snippet
.size();
136 for (InstructionInstance
&II
: Snippet
) {
137 ET
.fillMemoryOperands(II
, ScratchSpaceReg
, I
* MemStep
);
141 while (Snippet
.size() < kMinNumDifferentAddresses
) {
142 InstructionInstance II
= Snippet
[I
% OriginalSnippetSize
];
143 ET
.fillMemoryOperands(II
, ScratchSpaceReg
, I
* MemStep
);
145 Snippet
.push_back(std::move(II
));
147 assert(I
* MemStep
< ScratchSpace::kSize
&& "not enough scratch space");
150 llvm::Expected
<SnippetPrototype
>
151 UopsBenchmarkRunner::generatePrototype(unsigned Opcode
) const {
152 const auto &InstrDesc
= State
.getInstrInfo().get(Opcode
);
153 if (auto E
= isInfeasible(InstrDesc
))
155 const Instruction
Instr(InstrDesc
, RATC
);
156 const auto &ET
= State
.getExegesisTarget();
157 SnippetPrototype Prototype
;
159 const llvm::BitVector
*ScratchSpaceAliasedRegs
= nullptr;
160 if (Instr
.hasMemoryOperands()) {
161 Prototype
.ScratchSpaceReg
=
162 ET
.getScratchMemoryRegister(State
.getTargetMachine().getTargetTriple());
163 if (Prototype
.ScratchSpaceReg
== 0)
164 return llvm::make_error
<BenchmarkFailure
>(
165 "Infeasible : target does not support memory instructions");
166 ScratchSpaceAliasedRegs
=
167 &RATC
.getRegister(Prototype
.ScratchSpaceReg
).aliasedBits();
168 // If the instruction implicitly writes to ScratchSpaceReg , abort.
169 // FIXME: We could make a copy of the scratch register.
170 for (const auto &Op
: Instr
.Operands
) {
171 if (Op
.IsDef
&& Op
.ImplicitReg
&&
172 ScratchSpaceAliasedRegs
->test(*Op
.ImplicitReg
))
173 return llvm::make_error
<BenchmarkFailure
>(
174 "Infeasible : memory instruction uses scratch memory register");
178 const AliasingConfigurations
SelfAliasing(Instr
, Instr
);
179 InstructionInstance
II(Instr
);
180 if (SelfAliasing
.empty()) {
181 Prototype
.Explanation
= "instruction is parallel, repeating a random one.";
182 Prototype
.Snippet
.push_back(std::move(II
));
183 instantiateMemoryOperands(Prototype
.ScratchSpaceReg
, Prototype
.Snippet
);
184 return std::move(Prototype
);
186 if (SelfAliasing
.hasImplicitAliasing()) {
187 Prototype
.Explanation
= "instruction is serial, repeating a random one.";
188 Prototype
.Snippet
.push_back(std::move(II
));
189 instantiateMemoryOperands(Prototype
.ScratchSpaceReg
, Prototype
.Snippet
);
190 return std::move(Prototype
);
192 const auto TiedVariables
= getTiedVariables(Instr
);
193 if (!TiedVariables
.empty()) {
194 if (TiedVariables
.size() > 1)
195 return llvm::make_error
<llvm::StringError
>(
196 "Infeasible : don't know how to handle several tied variables",
197 llvm::inconvertibleErrorCode());
198 const Variable
*Var
= TiedVariables
.front();
200 assert(!Var
->TiedOperands
.empty());
201 const Operand
&Op
= Instr
.Operands
[Var
->TiedOperands
.front()];
203 Prototype
.Explanation
=
204 "instruction has tied variables using static renaming.";
205 for (const llvm::MCPhysReg Reg
: Op
.Tracker
->sourceBits().set_bits()) {
206 if (ScratchSpaceAliasedRegs
&& ScratchSpaceAliasedRegs
->test(Reg
))
207 continue; // Do not use the scratch memory address register.
208 InstructionInstance TmpII
= II
;
209 TmpII
.getValueFor(*Var
) = llvm::MCOperand::createReg(Reg
);
210 Prototype
.Snippet
.push_back(std::move(TmpII
));
212 instantiateMemoryOperands(Prototype
.ScratchSpaceReg
, Prototype
.Snippet
);
213 return std::move(Prototype
);
215 // No tied variables, we pick random values for defs.
216 llvm::BitVector
Defs(State
.getRegInfo().getNumRegs());
217 for (const auto &Op
: Instr
.Operands
) {
218 if (Op
.Tracker
&& Op
.IsExplicit
&& Op
.IsDef
&& !Op
.IsMem
) {
219 auto PossibleRegisters
= Op
.Tracker
->sourceBits();
220 remove(PossibleRegisters
, RATC
.reservedRegisters());
221 // Do not use the scratch memory address register.
222 if (ScratchSpaceAliasedRegs
)
223 remove(PossibleRegisters
, *ScratchSpaceAliasedRegs
);
224 assert(PossibleRegisters
.any() && "No register left to choose from");
225 const auto RandomReg
= randomBit(PossibleRegisters
);
227 II
.getValueFor(Op
) = llvm::MCOperand::createReg(RandomReg
);
230 // And pick random use values that are not reserved and don't alias with defs.
231 const auto DefAliases
= getAliasedBits(State
.getRegInfo(), Defs
);
232 for (const auto &Op
: Instr
.Operands
) {
233 if (Op
.Tracker
&& Op
.IsExplicit
&& !Op
.IsDef
&& !Op
.IsMem
) {
234 auto PossibleRegisters
= Op
.Tracker
->sourceBits();
235 remove(PossibleRegisters
, RATC
.reservedRegisters());
236 // Do not use the scratch memory address register.
237 if (ScratchSpaceAliasedRegs
)
238 remove(PossibleRegisters
, *ScratchSpaceAliasedRegs
);
239 remove(PossibleRegisters
, DefAliases
);
240 assert(PossibleRegisters
.any() && "No register left to choose from");
241 const auto RandomReg
= randomBit(PossibleRegisters
);
242 II
.getValueFor(Op
) = llvm::MCOperand::createReg(RandomReg
);
245 Prototype
.Explanation
=
246 "instruction has no tied variables picking Uses different from defs";
247 Prototype
.Snippet
.push_back(std::move(II
));
248 instantiateMemoryOperands(Prototype
.ScratchSpaceReg
, Prototype
.Snippet
);
249 return std::move(Prototype
);
252 std::vector
<BenchmarkMeasure
>
253 UopsBenchmarkRunner::runMeasurements(const ExecutableFunction
&Function
,
254 ScratchSpace
&Scratch
,
255 const unsigned NumRepetitions
) const {
256 const auto &SchedModel
= State
.getSubtargetInfo().getSchedModel();
258 std::vector
<BenchmarkMeasure
> Result
;
259 for (unsigned ProcResIdx
= 1;
260 ProcResIdx
< SchedModel
.getNumProcResourceKinds(); ++ProcResIdx
) {
261 const char *const PfmCounters
= SchedModel
.getExtraProcessorInfo()
262 .PfmCounters
.IssueCounters
[ProcResIdx
];
265 // We sum counts when there are several counters for a single ProcRes
266 // (e.g. P23 on SandyBridge).
267 int64_t CounterValue
= 0;
268 llvm::SmallVector
<llvm::StringRef
, 2> CounterNames
;
269 llvm::StringRef(PfmCounters
).split(CounterNames
, ',');
270 for (const auto &CounterName
: CounterNames
) {
271 pfm::PerfEvent
UopPerfEvent(CounterName
);
272 if (!UopPerfEvent
.valid())
273 llvm::report_fatal_error(
274 llvm::Twine("invalid perf event ").concat(PfmCounters
));
275 pfm::Counter
Counter(UopPerfEvent
);
278 Function(Scratch
.ptr());
280 CounterValue
+= Counter
.read();
282 Result
.push_back({llvm::itostr(ProcResIdx
),
283 static_cast<double>(CounterValue
) / NumRepetitions
,
284 SchedModel
.getProcResource(ProcResIdx
)->Name
});
289 constexpr const size_t UopsBenchmarkRunner::kMinNumDifferentAddresses
;
291 } // namespace exegesis