//===-- Target.cpp ----------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../ParallelSnippetGenerator.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86Counter.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/TargetParser/Host.h"
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h>
#endif

#if defined(_MSC_VER) && defined(_M_X64)
#include <float.h> // For _clearfp in ~X86SavedState().
#endif
// If a positive value is specified, we are going to use the LBR in
// latency-mode.
//
// Note:
//  - A small value is preferred, but too low a value could result in
//    throttling.
//  - A prime number is preferred to avoid always skipping certain blocks.
static cl::opt<unsigned> LbrSamplingPeriod(
    "x86-lbr-sample-period",
    cl::desc("The sample period (nbranches/sample), used for LBR sampling"),
    cl::cat(BenchmarkOptions), cl::init(0));
static cl::opt<bool>
    DisableUpperSSERegisters("x86-disable-upper-sse-registers",
                             cl::desc("Disable XMM8-XMM15 register usage"),
                             cl::cat(BenchmarkOptions), cl::init(false));

// FIXME: Validate that repetition-mode is loop if LBR is requested.

// Returns a non-null reason if we cannot handle the memory references in this
// instruction.
static const char *isInvalidMemoryInstr(const Instruction &Instr) {
  switch (Instr.Description.TSFlags & X86II::FormMask) {
  default:
    return "Unknown FormMask value";
  // These have no memory access.
  case X86II::PrefixByte:
  case X86II::MRMDestReg:
  case X86II::MRMSrcReg:
  case X86II::MRMSrcReg4VOp3:
  case X86II::MRMSrcRegOp4:
  case X86II::MRMSrcRegCC:
  case X86II::RawFrmImm8:
    return nullptr;
  case X86II::AddRegFrm:
    return (Instr.Description.Opcode == X86::POP16r ||
            Instr.Description.Opcode == X86::POP32r ||
            Instr.Description.Opcode == X86::PUSH16r ||
            Instr.Description.Opcode == X86::PUSH32r)
               ? "unsupported opcode: unsupported memory access"
               : nullptr;
  // These access memory and are handled.
  case X86II::MRMDestMem:
  case X86II::MRMSrcMem:
  case X86II::MRMSrcMem4VOp3:
  case X86II::MRMSrcMemOp4:
  case X86II::MRMSrcMemCC:
    return nullptr;
  // These access memory and are not handled yet.
  case X86II::RawFrmImm16:
  case X86II::RawFrmMemOffs:
  case X86II::RawFrmSrc:
  case X86II::RawFrmDst:
  case X86II::RawFrmDstSrc:
    return "unsupported opcode: non uniform memory access";
  }
}

// If the opcode is invalid, returns a pointer to a character literal indicating
// the reason. nullptr indicates a valid opcode.
static const char *isInvalidOpcode(const Instruction &Instr) {
  const auto OpcodeName = Instr.Name;
  if ((Instr.Description.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return "unsupported opcode: pseudo instruction";
  if ((OpcodeName.startswith("POP") && !OpcodeName.startswith("POPCNT")) ||
      OpcodeName.startswith("PUSH") || OpcodeName.startswith("ADJCALLSTACK") ||
      OpcodeName.startswith("LEAVE"))
    return "unsupported opcode: Push/Pop/AdjCallStack/Leave";
  switch (Instr.Description.Opcode) {
  case X86::WRFSBASE64:
    return "unsupported opcode";
  default:
    break;
  }
  if (const auto reason = isInvalidMemoryInstr(Instr))
    return reason;
  // We do not handle instructions with OPERAND_PCREL.
  for (const Operand &Op : Instr.Operands)
    if (Op.isExplicit() &&
        Op.getExplicitOperandInfo().OperandType == MCOI::OPERAND_PCREL)
      return "unsupported opcode: PC relative operand";
  // We do not handle second-form X87 instructions. We only handle first-form
  // ones (_Fp), see comment in X86InstrFPStack.td.
  for (const Operand &Op : Instr.Operands)
    if (Op.isReg() && Op.isExplicit() &&
        Op.getExplicitOperandInfo().RegClass == X86::RSTRegClassID)
      return "unsupported second-form X87 instruction";
  return nullptr;
}

static unsigned getX86FPFlags(const Instruction &Instr) {
  return Instr.Description.TSFlags & X86II::FPTypeMask;
}

// Helper to fill a memory operand with a value.
static void setMemOp(InstructionTemplate &IT, int OpIdx,
                     const MCOperand &OpVal) {
  const auto Op = IT.getInstr().Operands[OpIdx];
  assert(Op.isExplicit() && "invalid memory pattern");
  IT.getValueFor(Op) = OpVal;
}

// Common (latency, uops) code for LEA templates. `RestrictDestRegs` is given
// the chosen base and index registers and restricts the set of candidate
// destination registers accordingly.
static Expected<std::vector<CodeTemplate>> generateLEATemplatesCommon(
    const Instruction &Instr, const BitVector &ForbiddenRegisters,
    const LLVMState &State, const SnippetGenerator::Options &Opts,
    std::function<void(unsigned, unsigned, BitVector &CandidateDestRegs)>
        RestrictDestRegs) {
  assert(Instr.Operands.size() == 6 && "invalid LEA");
  assert(X86II::getMemoryOperandNo(Instr.Description.TSFlags) == 1 &&
         "invalid memory operand no");
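  // LEA's operand list is the destination register followed by the standard
  // five memory operands {BaseReg, ScaleAmt, IndexReg, Disp, SegmentReg},
  // hence the operand indices used below.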
  constexpr const int kDestOp = 0;
  constexpr const int kBaseOp = 1;
  constexpr const int kIndexOp = 3;
  auto PossibleDestRegs =
      Instr.Operands[kDestOp].getRegisterAliasing().sourceBits();
  remove(PossibleDestRegs, ForbiddenRegisters);
  auto PossibleBaseRegs =
      Instr.Operands[kBaseOp].getRegisterAliasing().sourceBits();
  remove(PossibleBaseRegs, ForbiddenRegisters);
  auto PossibleIndexRegs =
      Instr.Operands[kIndexOp].getRegisterAliasing().sourceBits();
  remove(PossibleIndexRegs, ForbiddenRegisters);
  const auto &RegInfo = State.getRegInfo();
  std::vector<CodeTemplate> Result;
  for (const unsigned BaseReg : PossibleBaseRegs.set_bits()) {
    for (const unsigned IndexReg : PossibleIndexRegs.set_bits()) {
      for (int LogScale = 0; LogScale <= 3; ++LogScale) {
        // FIXME: Add an option for controlling how we explore immediates.
        for (const int Disp : {0, 42}) {
          InstructionTemplate IT(&Instr);
          const int64_t Scale = 1ull << LogScale;
          setMemOp(IT, 1, MCOperand::createReg(BaseReg));
          setMemOp(IT, 2, MCOperand::createImm(Scale));
          setMemOp(IT, 3, MCOperand::createReg(IndexReg));
          setMemOp(IT, 4, MCOperand::createImm(Disp));
          // SegmentReg must be 0 for LEA.
          setMemOp(IT, 5, MCOperand::createReg(0));
          // Output reg candidates are selected by the caller.
          auto PossibleDestRegsNow = PossibleDestRegs;
          RestrictDestRegs(BaseReg, IndexReg, PossibleDestRegsNow);
          assert(PossibleDestRegsNow.set_bits().begin() !=
                     PossibleDestRegsNow.set_bits().end() &&
                 "no remaining registers");
          setMemOp(
              IT, 0,
              MCOperand::createReg(*PossibleDestRegsNow.set_bits().begin()));
          CodeTemplate CT;
          CT.Instructions.push_back(std::move(IT));
          CT.Config = formatv("{3}(%{0}, %{1}, {2})", RegInfo.getName(BaseReg),
                              RegInfo.getName(IndexReg), Scale, Disp)
                          .str();
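          // The Config string reads like an AT&T-syntax memory operand, e.g.
          // "42(%RBX, %RCX, 4)" for Disp=42, BaseReg=RBX, IndexReg=RCX and
          // Scale=4, which identifies the explored addressing mode.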
          Result.push_back(std::move(CT));
          if (Result.size() >= Opts.MaxConfigsPerOpcode)
            return std::move(Result);
        }
      }
    }
  }
  return std::move(Result);
}

class X86SerialSnippetGenerator : public SerialSnippetGenerator {
public:
  using SerialSnippetGenerator::SerialSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

Expected<std::vector<CodeTemplate>>
X86SerialSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);
  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // We just select a destination register that aliases the base
          // register.
          CandidateDestRegs &=
              State.getRATC().getRegister(BaseReg).aliasedBits();
        });
  }
  if (Instr.hasMemoryOperands())
    return make_error<Failure>(
        "unsupported memory operand in latency measurements");
  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return SerialSnippetGenerator::generateCodeTemplates(Variant,
                                                         ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
  case X86II::CompareFP:
  case X86II::CondMovFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  }
  llvm_unreachable("Unknown FP Type!");
}

class X86ParallelSnippetGenerator : public ParallelSnippetGenerator {
public:
  using ParallelSnippetGenerator::ParallelSnippetGenerator;

  Expected<std::vector<CodeTemplate>>
  generateCodeTemplates(InstructionTemplate Variant,
                        const BitVector &ForbiddenRegisters) const override;
};

Expected<std::vector<CodeTemplate>>
X86ParallelSnippetGenerator::generateCodeTemplates(
    InstructionTemplate Variant, const BitVector &ForbiddenRegisters) const {
  const Instruction &Instr = Variant.getInstr();

  if (const auto reason = isInvalidOpcode(Instr))
    return make_error<Failure>(reason);
  // LEA gets special attention.
  const auto Opcode = Instr.Description.getOpcode();
  if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) {
    return generateLEATemplatesCommon(
        Instr, ForbiddenRegisters, State, Opts,
        [this](unsigned BaseReg, unsigned IndexReg,
               BitVector &CandidateDestRegs) {
          // Any destination register that is not used for addressing is fine.
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(BaseReg).aliasedBits());
          remove(CandidateDestRegs,
                 State.getRATC().getRegister(IndexReg).aliasedBits());
        });
  }
  switch (getX86FPFlags(Instr)) {
  case X86II::NotFP:
    return ParallelSnippetGenerator::generateCodeTemplates(Variant,
                                                           ForbiddenRegisters);
  case X86II::ZeroArgFP:
  case X86II::OneArgFP:
  case X86II::SpecialFP:
    return make_error<Failure>("Unsupported x87 Instruction");
  case X86II::OneArgFPRW:
  case X86II::TwoArgFP:
    // These are instructions like
    //  - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
    //  - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
    // They are intrinsically serial and do not modify the state of the stack.
    // We generate the same code for latency and uops.
    return generateSelfAliasingCodeTemplates(Variant, ForbiddenRegisters);
  case X86II::CompareFP:
  case X86II::CondMovFP:
    // We can compute uops for any FP instruction that does not grow or shrink
    // the stack (it either does not touch the stack or pushes as much as it
    // pops).
    return generateUnconstrainedCodeTemplates(
        Variant, "instruction does not grow/shrink the FP stack");
  }
  llvm_unreachable("Unknown FP Type!");
}

static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
  switch (RegBitWidth) {
  case 8:
    return X86::MOV8ri;
  case 16:
    return X86::MOV16ri;
  case 32:
    return X86::MOV32ri;
  case 64:
    return X86::MOV64ri;
  }
  llvm_unreachable("Invalid Value Width");
}

// Generates instruction to load an immediate value into a register.
static MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
                            const APInt &Value) {
  if (Value.getBitWidth() > RegBitWidth)
    llvm_unreachable("Value must fit in the Register");
  return MCInstBuilder(getLoadImmediateOpcode(RegBitWidth))
      .addReg(Reg)
      .addImm(Value.getZExtValue());
}

// Allocates scratch memory on the stack.
static MCInst allocateStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::SUB64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}

// Fills scratch memory at offset `OffsetBytes` with value `Imm`.
static MCInst fillStackSpace(unsigned MovOpcode, unsigned OffsetBytes,
                             uint64_t Imm) {
  return MCInstBuilder(MovOpcode)
      .addReg(X86::RSP)    // BaseReg
      .addImm(1)           // ScaleAmt
      .addReg(0)           // IndexReg
      .addImm(OffsetBytes) // Disp
      .addReg(0)           // Segment
      .addImm(Imm);
}

// Loads scratch memory into register `Reg` using opcode `RMOpcode`.
static MCInst loadToReg(unsigned Reg, unsigned RMOpcode) {
  return MCInstBuilder(RMOpcode)
      .addReg(Reg)
      .addReg(X86::RSP) // BaseReg
      .addImm(1)        // ScaleAmt
      .addReg(0)        // IndexReg
      .addImm(0)        // Disp
      .addReg(0);       // Segment
}

// Releases scratch memory.
static MCInst releaseStackSpace(unsigned Bytes) {
  return MCInstBuilder(X86::ADD64ri8)
      .addReg(X86::RSP)
      .addReg(X86::RSP)
      .addImm(Bytes);
}
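
// Note: SUB64ri8/ADD64ri8 encode a sign-extended 8-bit immediate, which is
// enough for the small scratch areas (at most a few dozen bytes) allocated in
// this file.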

// Reserves some space on the stack, fills it with the content of the provided
// constant, and provides methods to load the stack value into a register.
struct ConstantInliner {
  explicit ConstantInliner(const APInt &Constant) : Constant_(Constant) {}

  std::vector<MCInst> loadAndFinalize(unsigned Reg, unsigned RegBitWidth,
                                      unsigned Opcode);

  std::vector<MCInst> loadX87STAndFinalize(unsigned Reg);

  std::vector<MCInst> loadX87FPAndFinalize(unsigned Reg);

  std::vector<MCInst> popFlagAndFinalize();

  std::vector<MCInst> loadImplicitRegAndFinalize(unsigned Opcode,
                                                 unsigned Value);

private:
  ConstantInliner &add(const MCInst &Inst) {
    Instructions.push_back(Inst);
    return *this;
  }

  void initStack(unsigned Bytes);

  static constexpr const unsigned kF80Bytes = 10; // 80 bits.

  APInt Constant_;
  std::vector<MCInst> Instructions;
};
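
// Typical usage: construct a ConstantInliner from the desired bit pattern and
// call one of the *AndFinalize() helpers; the returned MCInst sequence
// materializes the constant in stack scratch space, loads it into the target
// register, and releases the stack space again.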

std::vector<MCInst> ConstantInliner::loadAndFinalize(unsigned Reg,
                                                     unsigned RegBitWidth,
                                                     unsigned Opcode) {
  assert((RegBitWidth & 7) == 0 && "RegBitWidth must be a multiple of 8 bits");
  initStack(RegBitWidth / 8);
  add(loadToReg(Reg, Opcode));
  add(releaseStackSpace(RegBitWidth / 8));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87STAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_F80m)
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(MCInstBuilder(X86::ST_Frr).addReg(Reg));
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::loadX87FPAndFinalize(unsigned Reg) {
  initStack(kF80Bytes);
  add(MCInstBuilder(X86::LD_Fp80m)
          .addReg(Reg)
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(kF80Bytes));
  return std::move(Instructions);
}

std::vector<MCInst> ConstantInliner::popFlagAndFinalize() {
  initStack(8);
  add(MCInstBuilder(X86::POPF64));
  return std::move(Instructions);
}

std::vector<MCInst>
ConstantInliner::loadImplicitRegAndFinalize(unsigned Opcode, unsigned Value) {
  add(allocateStackSpace(4));
  add(fillStackSpace(X86::MOV32mi, 0, Value)); // Mask all FP exceptions
  add(MCInstBuilder(Opcode)
          .addReg(X86::RSP) // BaseReg
          .addImm(1)        // ScaleAmt
          .addReg(0)        // IndexReg
          .addImm(0)        // Disp
          .addReg(0));      // Segment
  add(releaseStackSpace(4));
  return std::move(Instructions);
}

void ConstantInliner::initStack(unsigned Bytes) {
  assert(Constant_.getBitWidth() <= Bytes * 8 &&
         "Value does not have the correct size");
  const APInt WideConstant = Constant_.getBitWidth() < Bytes * 8
                                 ? Constant_.sext(Bytes * 8)
                                 : Constant_;
  add(allocateStackSpace(Bytes));
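  // Fill the reserved slot with the constant using the widest stores first:
  // 4-byte MOV32mi chunks, then a 2-byte and a 1-byte store for any remaining
  // tail bytes.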
  size_t ByteOffset = 0;
  for (; Bytes - ByteOffset >= 4; ByteOffset += 4)
    add(fillStackSpace(
        X86::MOV32mi, ByteOffset,
        WideConstant.extractBits(32, ByteOffset * 8).getZExtValue()));
  if (Bytes - ByteOffset >= 2) {
    add(fillStackSpace(
        X86::MOV16mi, ByteOffset,
        WideConstant.extractBits(16, ByteOffset * 8).getZExtValue()));
    ByteOffset += 2;
  }
  if (Bytes - ByteOffset >= 1)
    add(fillStackSpace(
        X86::MOV8mi, ByteOffset,
        WideConstant.extractBits(8, ByteOffset * 8).getZExtValue()));
}

#include "X86GenExegesis.inc"

class X86SavedState : public ExegesisTarget::SavedState {
public:
  X86SavedState() {
#if defined(_MSC_VER) && defined(_M_X64)
    _fxsave64(FPState);
    Eflags = __readeflags();
#elif defined(__GNUC__) && defined(__x86_64__)
    __builtin_ia32_fxsave64(FPState);
    Eflags = __builtin_ia32_readeflags_u64();
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }
  ~X86SavedState() {
    // Restoring the X87 state does not flush pending exceptions; make sure
    // these exceptions are flushed now.
#if defined(_MSC_VER) && defined(_M_X64)
    _clearfp();
    _fxrstor64(FPState);
    __writeeflags(Eflags);
#elif defined(__GNUC__) && defined(__x86_64__)
    asm volatile("fwait");
    __builtin_ia32_fxrstor64(FPState);
    __builtin_ia32_writeeflags_u64(Eflags);
#else
    report_fatal_error("X86 exegesis running on unsupported target");
#endif
  }
private:
#if defined(__x86_64__) || defined(_M_X64)
  alignas(16) char FPState[512];
  uint64_t Eflags;
#endif
};

class ExegesisX86Target : public ExegesisTarget {
public:
  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}

  Expected<std::unique_ptr<pfm::Counter>>
  createCounter(StringRef CounterName, const LLVMState &State,
                const pid_t ProcessID) const override {
    // If LbrSamplingPeriod was provided, then ignore the
    // CounterName because we only have one for LBR.
    if (LbrSamplingPeriod > 0) {
      // Can't use LBR without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, or without
      // __linux__ (for now).
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \
    defined(__linux__)
      return std::make_unique<X86LbrCounter>(
          X86LbrPerfEvent(LbrSamplingPeriod));
#else
      return llvm::make_error<llvm::StringError>(
          "LBR counter requested without HAVE_LIBPFM, LIBPFM_HAS_FIELD_CYCLES, "
          "or running on Linux.",
          llvm::errc::invalid_argument);
#endif
    }
    return ExegesisTarget::createCounter(CounterName, State, ProcessID);
  }

  void addTargetSpecificPasses(PassManagerBase &PM) const override;

  unsigned getScratchMemoryRegister(const Triple &TT) const override;

  unsigned getLoopCounterRegister(const Triple &) const override;

  unsigned getMaxMemoryAccessSize() const override { return 64; }

  Error randomizeTargetMCOperand(const Instruction &Instr, const Variable &Var,
                                 MCOperand &AssignedValue,
                                 const BitVector &ForbiddenRegs) const override;

  void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                          unsigned Offset) const override;

  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                   MachineBasicBlock &TargetMBB,
                                   const MCInstrInfo &MII) const override;

  std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                               const APInt &Value) const override;

  ArrayRef<unsigned> getUnavailableRegisters() const override {
    if (DisableUpperSSERegisters)
      return ArrayRef(kUnavailableRegistersSSE,
                      sizeof(kUnavailableRegistersSSE) /
                          sizeof(kUnavailableRegistersSSE[0]));

    return ArrayRef(kUnavailableRegisters, std::size(kUnavailableRegisters));
  }

  bool allowAsBackToBack(const Instruction &Instr) const override {
    const unsigned Opcode = Instr.Description.Opcode;
    return !isInvalidOpcode(Instr) && Opcode != X86::LEA64r &&
           Opcode != X86::LEA64_32r && Opcode != X86::LEA16r;
  }

  std::vector<InstructionTemplate>
  generateInstructionVariants(const Instruction &Instr,
                              unsigned MaxConfigsPerOpcode) const override;

  std::unique_ptr<SnippetGenerator> createSerialSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86SerialSnippetGenerator>(State, Opts);
  }

  std::unique_ptr<SnippetGenerator> createParallelSnippetGenerator(
      const LLVMState &State,
      const SnippetGenerator::Options &Opts) const override {
    return std::make_unique<X86ParallelSnippetGenerator>(State, Opts);
  }

  bool matchesArch(Triple::ArchType Arch) const override {
    return Arch == Triple::x86_64 || Arch == Triple::x86;
  }

  Error checkFeatureSupport() const override {
    // LBR is the only feature we conditionally support now.
    // So if LBR is not requested, then we should be able to run the benchmarks.
    if (LbrSamplingPeriod == 0)
      return Error::success();

#if defined(__linux__) && defined(HAVE_LIBPFM) && \
    defined(LIBPFM_HAS_FIELD_CYCLES)
    // https://bugs.llvm.org/show_bug.cgi?id=48918
    // For now, only do the check if we see an Intel machine because
    // the counter uses some Intel-specific magic and it could
    // be confused and think an AMD machine actually has LBR support.
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
    defined(_M_X64)
    using namespace sys::detail::x86;

    if (getVendorSignature() == VendorSignatures::GENUINE_INTEL)
      // If the kernel supports it, the hardware still may not have it.
      return X86LbrCounter::checkLbrSupport();
#else
    report_fatal_error("Running X86 exegesis on unsupported target");
#endif
#endif
    return llvm::make_error<llvm::StringError>(
        "LBR not supported on this kernel and/or platform",
        llvm::errc::not_supported);
  }

  std::unique_ptr<SavedState> withSavedState() const override {
    return std::make_unique<X86SavedState>();
  }

private:
  static const unsigned kUnavailableRegisters[4];
  static const unsigned kUnavailableRegistersSSE[12];
};

// We disable a few registers that cannot be encoded on instructions with a REX
// prefix.
const unsigned ExegesisX86Target::kUnavailableRegisters[4] = {X86::AH, X86::BH,
                                                              X86::CH, X86::DH};

// Optionally, also disable the upper (x86_64) SSE registers to reduce frontend
// decoder load.
const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
    X86::AH,    X86::BH,    X86::CH,    X86::DH,    X86::XMM8,  X86::XMM9,
    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, X86::XMM14, X86::XMM15};

// We're using one of R8-R15 because these registers are never hardcoded in
// instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have fewer
// constraints.
constexpr const unsigned kLoopCounterReg = X86::R8;

void ExegesisX86Target::addTargetSpecificPasses(PassManagerBase &PM) const {
  // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
  PM.add(createX86FloatingPointStackifierPass());
}

unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    // FIXME: This would require popping from the stack, so we would have to
    // add some additional setup code.
    return 0;
  }
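  // RCX and RDI are the first integer argument registers of the Microsoft x64
  // and SysV ABIs, respectively.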
  return TT.isOSWindows() ? X86::RCX : X86::RDI;
}

unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
  if (!TT.isArch64Bit()) {
    return 0;
  }
  return kLoopCounterReg;
}

Error ExegesisX86Target::randomizeTargetMCOperand(
    const Instruction &Instr, const Variable &Var, MCOperand &AssignedValue,
    const BitVector &ForbiddenRegs) const {
  const Operand &Op = Instr.getPrimaryOperand(Var);
  switch (Op.getExplicitOperandInfo().OperandType) {
  case X86::OperandType::OPERAND_ROUNDING_CONTROL:
    AssignedValue =
        MCOperand::createImm(randomIndex(X86::STATIC_ROUNDING::TO_ZERO));
    return Error::success();
  default:
    break;
  }
  return make_error<Failure>(
      Twine("unimplemented operand type ")
          .concat(Twine(Op.getExplicitOperandInfo().OperandType)));
}

void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
                                           unsigned Reg,
                                           unsigned Offset) const {
  assert(!isInvalidMemoryInstr(IT.getInstr()) &&
         "fillMemoryOperands requires a valid memory instruction");
  int MemOpIdx = X86II::getMemoryOperandNo(IT.getInstr().Description.TSFlags);
  assert(MemOpIdx >= 0 && "invalid memory operand index");
  // getMemoryOperandNo() ignores tied operands, so we have to add them back.
  MemOpIdx += X86II::getOperandBias(IT.getInstr().Description);
  setMemOp(IT, MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
  setMemOp(IT, MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
  setMemOp(IT, MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
  setMemOp(IT, MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
  setMemOp(IT, MemOpIdx + 4, MCOperand::createReg(0));      // Segment
}

void ExegesisX86Target::decrementLoopCounterAndJump(
    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
    const MCInstrInfo &MII) const {
  BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
      .addDef(kLoopCounterReg)
      .addUse(kLoopCounterReg)
      .addImm(-1);
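  // The ADD above updates EFLAGS, so the conditional jump below keeps
  // branching back to the loop body until the counter reaches zero.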
  BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
      .addMBB(&TargetMBB)
      .addImm(X86::COND_NE);
}

std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI,
                                                unsigned Reg,
                                                const APInt &Value) const {
  if (X86::GR8RegClass.contains(Reg))
    return {loadImmediate(Reg, 8, Value)};
  if (X86::GR16RegClass.contains(Reg))
    return {loadImmediate(Reg, 16, Value)};
  if (X86::GR32RegClass.contains(Reg))
    return {loadImmediate(Reg, 32, Value)};
  if (X86::GR64RegClass.contains(Reg))
    return {loadImmediate(Reg, 64, Value)};
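  // Wider and special registers cannot be set with a single move-immediate;
  // the cases below go through scratch stack memory via ConstantInliner.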
  if (X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) ||
      X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg)) {
    switch (Value.getBitWidth()) {
    case 8:
      if (STI.getFeatureBits()[X86::FeatureDQI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVBkm);
      }
      break;
    case 16:
      if (STI.getFeatureBits()[X86::FeatureAVX512]) {
        ConstantInliner CI(Value.zextOrTrunc(16));
        return CI.loadAndFinalize(Reg, 16, X86::KMOVWkm);
      }
      break;
    case 32:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVDkm);
      }
      break;
    case 64:
      if (STI.getFeatureBits()[X86::FeatureBWI]) {
        ConstantInliner CI(Value);
        return CI.loadAndFinalize(Reg, Value.getBitWidth(), X86::KMOVQkm);
      }
      break;
    }
  }
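  // If none of the mask-register cases above matched (e.g. the required
  // AVX-512 feature is unavailable), fall through: none of the checks below
  // match a mask register, so we end up in the "not yet implemented" case.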
  ConstantInliner CI(Value);
  if (X86::VR64RegClass.contains(Reg))
    return CI.loadAndFinalize(Reg, 64, X86::MMX_MOVQ64rm);
  if (X86::VR128XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQU32Z128rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 128, X86::VMOVDQUrm);
    return CI.loadAndFinalize(Reg, 128, X86::MOVDQUrm);
  }
  if (X86::VR256XRegClass.contains(Reg)) {
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQU32Z256rm);
    if (STI.getFeatureBits()[X86::FeatureAVX])
      return CI.loadAndFinalize(Reg, 256, X86::VMOVDQUYrm);
  }
  if (X86::VR512RegClass.contains(Reg))
    if (STI.getFeatureBits()[X86::FeatureAVX512])
      return CI.loadAndFinalize(Reg, 512, X86::VMOVDQU32Zrm);
  if (X86::RSTRegClass.contains(Reg)) {
    return CI.loadX87STAndFinalize(Reg);
  }
  if (X86::RFP32RegClass.contains(Reg) || X86::RFP64RegClass.contains(Reg) ||
      X86::RFP80RegClass.contains(Reg)) {
    return CI.loadX87FPAndFinalize(Reg);
  }
  if (Reg == X86::EFLAGS)
    return CI.popFlagAndFinalize();
  if (Reg == X86::MXCSR)
    return CI.loadImplicitRegAndFinalize(
        STI.getFeatureBits()[X86::FeatureAVX] ? X86::VLDMXCSR : X86::LDMXCSR,
        0x1f80);
  if (Reg == X86::FPCW)
    return CI.loadImplicitRegAndFinalize(X86::FLDCW16m, 0x37f);
  return {}; // Not yet implemented.
}

// An instruction can have some variable operands, and we may want to see how
// different operands affect performance. So for each operand position,
// precompute all the possible choices we might care about,
// and greedily generate all the possible combinations of choices.
std::vector<InstructionTemplate> ExegesisX86Target::generateInstructionVariants(
    const Instruction &Instr, unsigned MaxConfigsPerOpcode) const {
  bool Exploration = false;
  SmallVector<SmallVector<MCOperand, 1>, 4> VariableChoices;
  VariableChoices.resize(Instr.Variables.size());
  for (auto I : llvm::zip(Instr.Variables, VariableChoices)) {
    const Variable &Var = std::get<0>(I);
    SmallVectorImpl<MCOperand> &Choices = std::get<1>(I);

    switch (Instr.getPrimaryOperand(Var).getExplicitOperandInfo().OperandType) {
    default:
      // We don't wish to explicitly explore this variable.
      Choices.emplace_back(); // But add invalid MCOperand to simplify logic.
      continue;
    case X86::OperandType::OPERAND_COND_CODE: {
      Exploration = true;
      auto CondCodes = enum_seq_inclusive(X86::CondCode::COND_O,
                                          X86::CondCode::LAST_VALID_COND,
                                          force_iteration_on_noniterable_enum);
      Choices.reserve(CondCodes.size());
      for (int CondCode : CondCodes)
        Choices.emplace_back(MCOperand::createImm(CondCode));
      break;
    }
    }
  }

  // If we don't wish to explore any variables, defer to the baseline method.
  if (!Exploration)
    return ExegesisTarget::generateInstructionVariants(Instr,
                                                       MaxConfigsPerOpcode);

  std::vector<InstructionTemplate> Variants;
  size_t NumVariants;
  CombinationGenerator<MCOperand, decltype(VariableChoices)::value_type, 4> G(
      VariableChoices);
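  // G enumerates the cross product of the per-variable choice lists; each
  // combination assigns one MCOperand to every variable of the instruction.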

  // How many operand combinations can we produce, within the limit?
  NumVariants = std::min(G.numCombinations(), (size_t)MaxConfigsPerOpcode);
  // And actually produce all the wanted operand combinations.
  Variants.reserve(NumVariants);
  G.generate([&](ArrayRef<MCOperand> State) -> bool {
    Variants.emplace_back(&Instr);
    Variants.back().setVariableValues(State);
    // Did we run out of space for variants?
    return Variants.size() >= NumVariants;
  });
  assert(Variants.size() == NumVariants &&
         Variants.size() <= MaxConfigsPerOpcode &&
         "Should not produce too many variants");
  return Variants;
}

static ExegesisTarget *getTheExegesisX86Target() {
  static ExegesisX86Target Target;
  return &Target;
}

void InitializeX86ExegesisTarget() {
  ExegesisTarget::registerTarget(getTheExegesisX86Target());
}

} // namespace exegesis