[llvm-exegesis] Add ability to assign perf counters to specific PID
llvm/tools/llvm-exegesis/lib/X86/Target.cpp
//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../Target.h"

#include "../Error.h"
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/TargetParser/Host.h"

#include <memory>
#include <string>
#include <vector>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#include <intrin.h>
#endif
#if defined(_MSC_VER) && defined(_M_X64)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif

namespace llvm {
namespace exegesis {

// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
// - A small value is preferred, but too low a value could result in
//   throttling.
// - A prime number is preferred to avoid always skipping certain blocks.

static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));
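
// Example (a typical invocation; flags other than -x86-lbr-sample-period are
// the usual llvm-exegesis options):
//   llvm-exegesis -mode=latency -opcode-name=ADD64rr -repetition-mode=loop \
//       -x86-lbr-sample-period=199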

static cl::opt<bool>
    DisableUpperSSERegisters("x86-disable-upper-sse-registers",
                             cl::desc("Disable XMM8-XMM15 register usage"),
                             cl::cat(BenchmarkOptions), cl::init(false));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::Pseudo:
  case X86II::RawFrm:
  case X86II::AddCCFrm:
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::MRMXrCC:
  case X86II::MRMr0:
  case X86II::MRMXr:
  case X86II::MRM0r:
  case X86II::MRM1r:
  case X86II::MRM2r:
  case X86II::MRM3r:
  case X86II::MRM4r:
  case X86II::MRM5r:
  case X86II::MRM6r:
  case X86II::MRM7r:
  case X86II::MRM0X:
  case X86II::MRM1X:
  case X86II::MRM2X:
  case X86II::MRM3X:
  case X86II::MRM4X:
  case X86II::MRM5X:
  case X86II::MRM6X:
  case X86II::MRM7X:
  case X86II::MRM_C0:
  case X86II::MRM_C1:
  case X86II::MRM_C2:
  case X86II::MRM_C3:
  case X86II::MRM_C4:
  case X86II::MRM_C5:
  case X86II::MRM_C6:
  case X86II::MRM_C7:
  case X86II::MRM_C8:
  case X86II::MRM_C9:
  case X86II::MRM_CA:
  case X86II::MRM_CB:
  case X86II::MRM_CC:
  case X86II::MRM_CD:
  case X86II::MRM_CE:
  case X86II::MRM_CF:
  case X86II::MRM_D0:
  case X86II::MRM_D1:
  case X86II::MRM_D2:
  case X86II::MRM_D3:
  case X86II::MRM_D4:
  case X86II::MRM_D5:
  case X86II::MRM_D6:
  case X86II::MRM_D7:
  case X86II::MRM_D8:
  case X86II::MRM_D9:
  case X86II::MRM_DA:
  case X86II::MRM_DB:
  case X86II::MRM_DC:
  case X86II::MRM_DD:
  case X86II::MRM_DE:
  case X86II::MRM_DF:
  case X86II::MRM_E0:
  case X86II::MRM_E1:
  case X86II::MRM_E2:
  case X86II::MRM_E3:
  case X86II::MRM_E4:
  case X86II::MRM_E5:
  case X86II::MRM_E6:
  case X86II::MRM_E7:
  case X86II::MRM_E8:
  case X86II::MRM_E9:
  case X86II::MRM_EA:
  case X86II::MRM_EB:
  case X86II::MRM_EC:
  case X86II::MRM_ED:
  case X86II::MRM_EE:
  case X86II::MRM_EF:
  case X86II::MRM_F0:
  case X86II::MRM_F1:
  case X86II::MRM_F2:
  case X86II::MRM_F3:
  case X86II::MRM_F4:
  case X86II::MRM_F5:
  case X86II::MRM_F6:
  case X86II::MRM_F7:
  case X86II::MRM_F8:
  case X86II::MRM_F9:
  case X86II::MRM_FA:
  case X86II::MRM_FB:
  case X86II::MRM_FC:
  case X86II::MRM_FD:
  case X86II::MRM_FE:
  case X86II::MRM_FF:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
  case X86II::MRMXmCC:
  case X86II::MRMXm:
  case X86II::MRM0m:
  case X86II::MRM1m:
  case X86II::MRM2m:
  case X86II::MRM3m:
  case X86II::MRM4m:
  case X86II::MRM5m:
  case X86II::MRM6m:
  case X86II::MRM7m:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::LFS16rm:
  case X86::LFS32rm:
  case X86::LFS64rm:
  case X86::LGS16rm:
  case X86::LGS32rm:
  case X86::LGS64rm:
  case X86::LSS16rm:
  case X86::LSS32rm:
  case X86::LSS64rm:
  case X86::SYSENTER:
  case X86::WRFSBASE:
  case X86::WRFSBASE64:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the addressing base and index registers and narrows the set of candidate
// LEA destination registers.
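// Each generated CodeTemplate is tagged with a config string describing the
// explored addressing mode, e.g. "42(%RAX, %RCX, 4)" for disp 42, base RAX,
// index RCX, scale 4 (see the formatv() call below).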
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid LEA");

  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);

  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));

          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));

          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }
  return std::move(Result);
}

namespace {
class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }

  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

namespace {
class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};
} // namespace

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);

  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }

  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    // - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (either do not touch the stack or push as much as they pop).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  default:
    llvm_unreachable("Unknown FP Type!");
  }
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates an instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      // Address = RSP
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      // Immediate.
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      // Address = RSP
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
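//
// Typical use (see setRegTo() below):
//   ConstantInliner CI(Value);
//   return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);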
namespace {
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
} // namespace

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  if (Reg != X86::ST0)
    add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          // Address = RSP
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}
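
// Writes Constant_ into the freshly allocated stack area, four bytes at a
// time, then two, then one for any remainder.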
void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

namespace {
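
// Saves the host's FP/SSE state (FXSAVE64) and EFLAGS on construction and
// restores them on destruction, so whatever runs in between cannot leave the
// harness with clobbered floating-point state or flags.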
class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#if defined(_MSC_VER) && defined(_M_X64)
    _fxsave64(FPState);
    Eflags = __readeflags();
#elif defined(__GNUC__) && defined(__x86_64__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }

  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#if defined(_MSC_VER) && defined(_M_X64)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
#elif defined(__GNUC__) && defined(__x86_64__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }

private:
#if defined(__x86_64__) || defined(_M_X64)
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
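
  // Creates the perf counter used for measurements. ProcessID is forwarded to
  // the generic ExegesisTarget::createCounter() so the counter can be set up
  // for a specific process; the LBR path below does not use it.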
  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State,
                const pid_t ProcessID) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // LBR requires HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, and __linux__
      // (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State, ProcessID);
  }

private:
  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    if (DisableUpperSSERegisters)
      return ArrayRef(kUnavailableRegistersSSE,
                      sizeof(kUnavailableRegistersSSE) /
                          sizeof(kUnavailableRegistersSSE[0]));

    return ArrayRef(kUnavailableRegisters, std::size(kUnavailableRegisters));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) && \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // FIXME: Fix this.
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only run the check on Intel machines: the counter uses some
    // Intel-specific magic and could otherwise get confused and think an AMD
    // machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    report_fatal_error("Running X86 exegesis on unsupported target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

  static const unsigned kUnavailableRegisters[4];
  static const unsigned kUnavailableRegistersSSE[12];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
// decoder load.
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
    X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they cause fewer
// conflicts.
constexpr const unsigned kLoopCounterReg = X86::R8;

} // namespace

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}
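
// Scratch memory is addressed through RDI on SysV targets and RCX on Windows;
// these are the first integer-argument registers of the respective calling
// conventions, so the harness can pass the scratch buffer address in them.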
unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}
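
// Emits the loop backedge used by loop repetition mode: add $-1 to the loop
// counter register (R8, see kLoopCounterReg) and jump back to the target block
// while it is non-zero (JNE).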
void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
  if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
      X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
    switch (Value.getBitWidth()) {
    case 8:
      if (STI.getFeatureBits()[X86::FeatureDQI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
      }
      [[fallthrough]];
    case 16:
      if (STI.getFeatureBits()[X86::FeatureAVX512]) {
        ConstantInliner CI(Value.zextOrTrunc(16));
        return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
      }
      break;
    case 32:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
      }
      break;
    case 64:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
      }
      break;
    }
  }
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// Instructions can have variable operands, and we may want to see how
// different operand values affect performance. So for each operand position,
// precompute all the possible choices we might care about, and greedily
// generate all the possible combinations of choices.
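// For example, for a condition-code operand (e.g. the cc in CMOVcc) we
// enumerate every valid condition code (COND_O through LAST_VALID_COND),
// capped at MaxConfigsPerOpcode combinations.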
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });
  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis
} // namespace llvm