1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines a DAG pattern matching instruction selector for X86,
11 // converting from a legalized dag to a X86 dag.
13 //===----------------------------------------------------------------------===//
16 #include "X86MachineFunctionInfo.h"
17 #include "X86RegisterInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/MachineFrameInfo.h"
22 #include "llvm/CodeGen/MachineFunction.h"
23 #include "llvm/CodeGen/SelectionDAGISel.h"
24 #include "llvm/Config/llvm-config.h"
25 #include "llvm/IR/ConstantRange.h"
26 #include "llvm/IR/Function.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/Intrinsics.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Support/MathExtras.h"
34 #include "llvm/Support/raw_ostream.h"
35 #include "llvm/Target/TargetMachine.h"
36 #include "llvm/Target/TargetOptions.h"
40 #define DEBUG_TYPE "x86-isel"
42 STATISTIC(NumLoadMoved
, "Number of loads moved below TokenFactor");
44 static cl::opt
<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
45 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
48 //===----------------------------------------------------------------------===//
49 // Pattern Matcher Implementation
50 //===----------------------------------------------------------------------===//
53 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
54 /// numbers for the leaves of the matched tree.
55 struct X86ISelAddressMode
{
61 // This is really a union, discriminated by BaseType!
69 const GlobalValue
*GV
;
71 const BlockAddress
*BlockAddr
;
75 unsigned Align
; // CP alignment.
76 unsigned char SymbolFlags
; // X86II::MO_*
79 : BaseType(RegBase
), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81 MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG
) {}
83 bool hasSymbolicDisplacement() const {
84 return GV
!= nullptr || CP
!= nullptr || ES
!= nullptr ||
85 MCSym
!= nullptr || JT
!= -1 || BlockAddr
!= nullptr;
88 bool hasBaseOrIndexReg() const {
89 return BaseType
== FrameIndexBase
||
90 IndexReg
.getNode() != nullptr || Base_Reg
.getNode() != nullptr;
93 /// Return true if this addressing mode is already RIP-relative.
94 bool isRIPRelative() const {
95 if (BaseType
!= RegBase
) return false;
96 if (RegisterSDNode
*RegNode
=
97 dyn_cast_or_null
<RegisterSDNode
>(Base_Reg
.getNode()))
98 return RegNode
->getReg() == X86::RIP
;
102 void setBaseReg(SDValue Reg
) {
107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108 void dump(SelectionDAG
*DAG
= nullptr) {
109 dbgs() << "X86ISelAddressMode " << this << '\n';
110 dbgs() << "Base_Reg ";
111 if (Base_Reg
.getNode())
112 Base_Reg
.getNode()->dump(DAG
);
115 if (BaseType
== FrameIndexBase
)
116 dbgs() << " Base.FrameIndex " << Base_FrameIndex
<< '\n';
117 dbgs() << " Scale " << Scale
<< '\n'
119 if (IndexReg
.getNode())
120 IndexReg
.getNode()->dump(DAG
);
123 dbgs() << " Disp " << Disp
<< '\n'
145 dbgs() << " JT" << JT
<< " Align" << Align
<< '\n';
152 //===--------------------------------------------------------------------===//
153 /// ISel - X86-specific code to select X86 machine instructions for
154 /// SelectionDAG operations.
156 class X86DAGToDAGISel final
: public SelectionDAGISel
{
157 /// Keep a pointer to the X86Subtarget around so that we can
158 /// make the right decision when generating code for different targets.
159 const X86Subtarget
*Subtarget
;
161 /// If true, selector should try to optimize for code size instead of
165 /// If true, selector should try to optimize for minimum code size.
169 explicit X86DAGToDAGISel(X86TargetMachine
&tm
, CodeGenOpt::Level OptLevel
)
170 : SelectionDAGISel(tm
, OptLevel
), OptForSize(false),
171 OptForMinSize(false) {}
173 StringRef
getPassName() const override
{
174 return "X86 DAG->DAG Instruction Selection";
177 bool runOnMachineFunction(MachineFunction
&MF
) override
{
178 // Reset the subtarget each time through.
179 Subtarget
= &MF
.getSubtarget
<X86Subtarget
>();
180 SelectionDAGISel::runOnMachineFunction(MF
);
184 void EmitFunctionEntryCode() override
;
186 bool IsProfitableToFold(SDValue N
, SDNode
*U
, SDNode
*Root
) const override
;
188 void PreprocessISelDAG() override
;
189 void PostprocessISelDAG() override
;
191 // Include the pieces autogenerated from the target description.
192 #include "X86GenDAGISel.inc"
195 void Select(SDNode
*N
) override
;
197 bool foldOffsetIntoAddress(uint64_t Offset
, X86ISelAddressMode
&AM
);
198 bool matchLoadInAddress(LoadSDNode
*N
, X86ISelAddressMode
&AM
);
199 bool matchWrapper(SDValue N
, X86ISelAddressMode
&AM
);
200 bool matchAddress(SDValue N
, X86ISelAddressMode
&AM
);
201 bool matchVectorAddress(SDValue N
, X86ISelAddressMode
&AM
);
202 bool matchAdd(SDValue N
, X86ISelAddressMode
&AM
, unsigned Depth
);
203 bool matchAddressRecursively(SDValue N
, X86ISelAddressMode
&AM
,
205 bool matchAddressBase(SDValue N
, X86ISelAddressMode
&AM
);
206 bool selectAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
207 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
209 bool selectVectorAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
210 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
212 bool selectMOV64Imm32(SDValue N
, SDValue
&Imm
);
213 bool selectLEAAddr(SDValue N
, SDValue
&Base
,
214 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
216 bool selectLEA64_32Addr(SDValue N
, SDValue
&Base
,
217 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
219 bool selectTLSADDRAddr(SDValue N
, SDValue
&Base
,
220 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
222 bool selectScalarSSELoad(SDNode
*Root
, SDNode
*Parent
, SDValue N
,
223 SDValue
&Base
, SDValue
&Scale
,
224 SDValue
&Index
, SDValue
&Disp
,
226 SDValue
&NodeWithChain
);
227 bool selectRelocImm(SDValue N
, SDValue
&Op
);
229 bool tryFoldLoad(SDNode
*Root
, SDNode
*P
, SDValue N
,
230 SDValue
&Base
, SDValue
&Scale
,
231 SDValue
&Index
, SDValue
&Disp
,
234 // Convenience method where P is also root.
235 bool tryFoldLoad(SDNode
*P
, SDValue N
,
236 SDValue
&Base
, SDValue
&Scale
,
237 SDValue
&Index
, SDValue
&Disp
,
239 return tryFoldLoad(P
, P
, N
, Base
, Scale
, Index
, Disp
, Segment
);
242 // Try to fold a vector load. This makes sure the load isn't non-temporal.
243 bool tryFoldVecLoad(SDNode
*Root
, SDNode
*P
, SDValue N
,
244 SDValue
&Base
, SDValue
&Scale
,
245 SDValue
&Index
, SDValue
&Disp
,
248 /// Implement addressing mode selection for inline asm expressions.
249 bool SelectInlineAsmMemoryOperand(const SDValue
&Op
,
250 unsigned ConstraintID
,
251 std::vector
<SDValue
> &OutOps
) override
;
253 void emitSpecialCodeForMain();
255 inline void getAddressOperands(X86ISelAddressMode
&AM
, const SDLoc
&DL
,
256 SDValue
&Base
, SDValue
&Scale
,
257 SDValue
&Index
, SDValue
&Disp
,
259 Base
= (AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
260 ? CurDAG
->getTargetFrameIndex(
262 TLI
->getPointerTy(CurDAG
->getDataLayout()))
264 Scale
= getI8Imm(AM
.Scale
, DL
);
266 // These are 32-bit even in 64-bit mode since RIP-relative offset
269 Disp
= CurDAG
->getTargetGlobalAddress(AM
.GV
, SDLoc(),
273 Disp
= CurDAG
->getTargetConstantPool(AM
.CP
, MVT::i32
,
274 AM
.Align
, AM
.Disp
, AM
.SymbolFlags
);
276 assert(!AM
.Disp
&& "Non-zero displacement is ignored with ES.");
277 Disp
= CurDAG
->getTargetExternalSymbol(AM
.ES
, MVT::i32
, AM
.SymbolFlags
);
278 } else if (AM
.MCSym
) {
279 assert(!AM
.Disp
&& "Non-zero displacement is ignored with MCSym.");
280 assert(AM
.SymbolFlags
== 0 && "oo");
281 Disp
= CurDAG
->getMCSymbol(AM
.MCSym
, MVT::i32
);
282 } else if (AM
.JT
!= -1) {
283 assert(!AM
.Disp
&& "Non-zero displacement is ignored with JT.");
284 Disp
= CurDAG
->getTargetJumpTable(AM
.JT
, MVT::i32
, AM
.SymbolFlags
);
285 } else if (AM
.BlockAddr
)
286 Disp
= CurDAG
->getTargetBlockAddress(AM
.BlockAddr
, MVT::i32
, AM
.Disp
,
289 Disp
= CurDAG
->getTargetConstant(AM
.Disp
, DL
, MVT::i32
);
291 if (AM
.Segment
.getNode())
292 Segment
= AM
.Segment
;
294 Segment
= CurDAG
->getRegister(0, MVT::i32
);
297 // Utility function to determine whether we should avoid selecting
298 // immediate forms of instructions for better code size or not.
299 // At a high level, we'd like to avoid such instructions when
300 // we have similar constants used within the same basic block
301 // that can be kept in a register.
303 bool shouldAvoidImmediateInstFormsForSize(SDNode
*N
) const {
304 uint32_t UseCount
= 0;
306 // Do not want to hoist if we're not optimizing for size.
307 // TODO: We'd like to remove this restriction.
308 // See the comment in X86InstrInfo.td for more info.
312 // Walk all the users of the immediate.
313 for (SDNode::use_iterator UI
= N
->use_begin(),
314 UE
= N
->use_end(); (UI
!= UE
) && (UseCount
< 2); ++UI
) {
318 // This user is already selected. Count it as a legitimate use and
320 if (User
->isMachineOpcode()) {
325 // We want to count stores of immediates as real uses.
326 if (User
->getOpcode() == ISD::STORE
&&
327 User
->getOperand(1).getNode() == N
) {
332 // We don't currently match users that have > 2 operands (except
333 // for stores, which are handled above)
334 // Those instruction won't match in ISEL, for now, and would
335 // be counted incorrectly.
336 // This may change in the future as we add additional instruction
338 if (User
->getNumOperands() != 2)
341 // Immediates that are used for offsets as part of stack
342 // manipulation should be left alone. These are typically
343 // used to indicate SP offsets for argument passing and
344 // will get pulled into stores/pushes (implicitly).
345 if (User
->getOpcode() == X86ISD::ADD
||
346 User
->getOpcode() == ISD::ADD
||
347 User
->getOpcode() == X86ISD::SUB
||
348 User
->getOpcode() == ISD::SUB
) {
350 // Find the other operand of the add/sub.
351 SDValue OtherOp
= User
->getOperand(0);
352 if (OtherOp
.getNode() == N
)
353 OtherOp
= User
->getOperand(1);
355 // Don't count if the other operand is SP.
356 RegisterSDNode
*RegNode
;
357 if (OtherOp
->getOpcode() == ISD::CopyFromReg
&&
358 (RegNode
= dyn_cast_or_null
<RegisterSDNode
>(
359 OtherOp
->getOperand(1).getNode())))
360 if ((RegNode
->getReg() == X86::ESP
) ||
361 (RegNode
->getReg() == X86::RSP
))
365 // ... otherwise, count this and move on.
369 // If we have more than 1 use, then recommend for hoisting.
370 return (UseCount
> 1);
373 /// Return a target constant with the specified value of type i8.
374 inline SDValue
getI8Imm(unsigned Imm
, const SDLoc
&DL
) {
375 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i8
);
378 /// Return a target constant with the specified value, of type i32.
379 inline SDValue
getI32Imm(unsigned Imm
, const SDLoc
&DL
) {
380 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i32
);
383 /// Return a target constant with the specified value, of type i64.
384 inline SDValue
getI64Imm(uint64_t Imm
, const SDLoc
&DL
) {
385 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i64
);
388 SDValue
getExtractVEXTRACTImmediate(SDNode
*N
, unsigned VecWidth
,
390 assert((VecWidth
== 128 || VecWidth
== 256) && "Unexpected vector width");
391 uint64_t Index
= N
->getConstantOperandVal(1);
392 MVT VecVT
= N
->getOperand(0).getSimpleValueType();
393 return getI8Imm((Index
* VecVT
.getScalarSizeInBits()) / VecWidth
, DL
);
396 SDValue
getInsertVINSERTImmediate(SDNode
*N
, unsigned VecWidth
,
398 assert((VecWidth
== 128 || VecWidth
== 256) && "Unexpected vector width");
399 uint64_t Index
= N
->getConstantOperandVal(2);
400 MVT VecVT
= N
->getSimpleValueType(0);
401 return getI8Imm((Index
* VecVT
.getScalarSizeInBits()) / VecWidth
, DL
);
404 /// Return an SDNode that returns the value of the global base register.
405 /// Output instructions required to initialize the global base register,
407 SDNode
*getGlobalBaseReg();
409 /// Return a reference to the TargetMachine, casted to the target-specific
411 const X86TargetMachine
&getTargetMachine() const {
412 return static_cast<const X86TargetMachine
&>(TM
);
415 /// Return a reference to the TargetInstrInfo, casted to the target-specific
417 const X86InstrInfo
*getInstrInfo() const {
418 return Subtarget
->getInstrInfo();
421 /// Address-mode matching performs shift-of-and to and-of-shift
422 /// reassociation in order to expose more scaled addressing
424 bool ComplexPatternFuncMutatesDAG() const override
{
428 bool isSExtAbsoluteSymbolRef(unsigned Width
, SDNode
*N
) const;
430 /// Returns whether this is a relocatable immediate in the range
431 /// [-2^Width .. 2^Width-1].
432 template <unsigned Width
> bool isSExtRelocImm(SDNode
*N
) const {
433 if (auto *CN
= dyn_cast
<ConstantSDNode
>(N
))
434 return isInt
<Width
>(CN
->getSExtValue());
435 return isSExtAbsoluteSymbolRef(Width
, N
);
438 // Indicates we should prefer to use a non-temporal load for this load.
439 bool useNonTemporalLoad(LoadSDNode
*N
) const {
440 if (!N
->isNonTemporal())
443 unsigned StoreSize
= N
->getMemoryVT().getStoreSize();
445 if (N
->getAlignment() < StoreSize
)
449 default: llvm_unreachable("Unsupported store size");
451 return Subtarget
->hasSSE41();
453 return Subtarget
->hasAVX2();
455 return Subtarget
->hasAVX512();
459 bool foldLoadStoreIntoMemOperand(SDNode
*Node
);
460 bool matchBEXTRFromAnd(SDNode
*Node
);
461 bool shrinkAndImmediate(SDNode
*N
);
462 bool isMaskZeroExtended(SDNode
*N
) const;
463 bool tryShiftAmountMod(SDNode
*N
);
465 MachineSDNode
*emitPCMPISTR(unsigned ROpc
, unsigned MOpc
, bool MayFoldLoad
,
466 const SDLoc
&dl
, MVT VT
, SDNode
*Node
);
467 MachineSDNode
*emitPCMPESTR(unsigned ROpc
, unsigned MOpc
, bool MayFoldLoad
,
468 const SDLoc
&dl
, MVT VT
, SDNode
*Node
,
474 // Returns true if this masked compare can be implemented legally with this
476 static bool isLegalMaskCompare(SDNode
*N
, const X86Subtarget
*Subtarget
) {
477 unsigned Opcode
= N
->getOpcode();
478 if (Opcode
== X86ISD::CMPM
|| Opcode
== ISD::SETCC
||
479 Opcode
== X86ISD::CMPM_RND
|| Opcode
== X86ISD::VFPCLASS
) {
480 // We can get 256-bit 8 element types here without VLX being enabled. When
481 // this happens we will use 512-bit operations and the mask will not be
483 EVT OpVT
= N
->getOperand(0).getValueType();
484 if (OpVT
.is256BitVector() || OpVT
.is128BitVector())
485 return Subtarget
->hasVLX();
489 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
490 if (Opcode
== X86ISD::VFPCLASSS
|| Opcode
== X86ISD::FSETCCM
||
491 Opcode
== X86ISD::FSETCCM_RND
)
497 // Returns true if we can assume the writer of the mask has zero extended it
499 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode
*N
) const {
500 // If this is an AND, check if we have a compare on either side. As long as
501 // one side guarantees the mask is zero extended, the AND will preserve those
503 if (N
->getOpcode() == ISD::AND
)
504 return isLegalMaskCompare(N
->getOperand(0).getNode(), Subtarget
) ||
505 isLegalMaskCompare(N
->getOperand(1).getNode(), Subtarget
);
507 return isLegalMaskCompare(N
, Subtarget
);
511 X86DAGToDAGISel::IsProfitableToFold(SDValue N
, SDNode
*U
, SDNode
*Root
) const {
512 if (OptLevel
== CodeGenOpt::None
) return false;
517 if (N
.getOpcode() != ISD::LOAD
)
520 // If N is a load, do additional profitability checks.
522 switch (U
->getOpcode()) {
536 SDValue Op1
= U
->getOperand(1);
538 // If the other operand is a 8-bit immediate we should fold the immediate
539 // instead. This reduces code size.
541 // movl 4(%esp), %eax
545 // addl 4(%esp), %eax
546 // The former is 2 bytes shorter. In case where the increment is 1, then
547 // the saving can be 4 bytes (by using incl %eax).
548 if (ConstantSDNode
*Imm
= dyn_cast
<ConstantSDNode
>(Op1
)) {
549 if (Imm
->getAPIntValue().isSignedIntN(8))
552 // If this is a 64-bit AND with an immediate that fits in 32-bits,
553 // prefer using the smaller and over folding the load. This is needed to
554 // make sure immediates created by shrinkAndImmediate are always folded.
555 // Ideally we would narrow the load during DAG combine and get the
556 // best of both worlds.
557 if (U
->getOpcode() == ISD::AND
&&
558 Imm
->getAPIntValue().getBitWidth() == 64 &&
559 Imm
->getAPIntValue().isIntN(32))
563 // If the other operand is a TLS address, we should fold it instead.
566 // leal i@NTPOFF(%eax), %eax
568 // movl $i@NTPOFF, %eax
570 // if the block also has an access to a second TLS address this will save
572 // FIXME: This is probably also true for non-TLS addresses.
573 if (Op1
.getOpcode() == X86ISD::Wrapper
) {
574 SDValue Val
= Op1
.getOperand(0);
575 if (Val
.getOpcode() == ISD::TargetGlobalTLSAddress
)
579 // Don't fold load if this matches the BTS/BTR/BTC patterns.
580 // BTS: (or X, (shl 1, n))
581 // BTR: (and X, (rotl -2, n))
582 // BTC: (xor X, (shl 1, n))
583 if (U
->getOpcode() == ISD::OR
|| U
->getOpcode() == ISD::XOR
) {
584 if (U
->getOperand(0).getOpcode() == ISD::SHL
&&
585 isOneConstant(U
->getOperand(0).getOperand(0)))
588 if (U
->getOperand(1).getOpcode() == ISD::SHL
&&
589 isOneConstant(U
->getOperand(1).getOperand(0)))
592 if (U
->getOpcode() == ISD::AND
) {
593 SDValue U0
= U
->getOperand(0);
594 SDValue U1
= U
->getOperand(1);
595 if (U0
.getOpcode() == ISD::ROTL
) {
596 auto *C
= dyn_cast
<ConstantSDNode
>(U0
.getOperand(0));
597 if (C
&& C
->getSExtValue() == -2)
601 if (U1
.getOpcode() == ISD::ROTL
) {
602 auto *C
= dyn_cast
<ConstantSDNode
>(U1
.getOperand(0));
603 if (C
&& C
->getSExtValue() == -2)
613 // Don't fold a load into a shift by immediate. The BMI2 instructions
614 // support folding a load, but not an immediate. The legacy instructions
615 // support folding an immediate, but can't fold a load. Folding an
616 // immediate is preferable to folding a load.
617 if (isa
<ConstantSDNode
>(U
->getOperand(1)))
624 // Prevent folding a load if this can implemented with an insert_subreg or
625 // a move that implicitly zeroes.
626 if (Root
->getOpcode() == ISD::INSERT_SUBVECTOR
&&
627 isNullConstant(Root
->getOperand(2)) &&
628 (Root
->getOperand(0).isUndef() ||
629 ISD::isBuildVectorAllZeros(Root
->getOperand(0).getNode())))
635 /// Replace the original chain operand of the call with
636 /// load's chain operand and move load below the call's chain operand.
637 static void moveBelowOrigChain(SelectionDAG
*CurDAG
, SDValue Load
,
638 SDValue Call
, SDValue OrigChain
) {
639 SmallVector
<SDValue
, 8> Ops
;
640 SDValue Chain
= OrigChain
.getOperand(0);
641 if (Chain
.getNode() == Load
.getNode())
642 Ops
.push_back(Load
.getOperand(0));
644 assert(Chain
.getOpcode() == ISD::TokenFactor
&&
645 "Unexpected chain operand");
646 for (unsigned i
= 0, e
= Chain
.getNumOperands(); i
!= e
; ++i
)
647 if (Chain
.getOperand(i
).getNode() == Load
.getNode())
648 Ops
.push_back(Load
.getOperand(0));
650 Ops
.push_back(Chain
.getOperand(i
));
652 CurDAG
->getNode(ISD::TokenFactor
, SDLoc(Load
), MVT::Other
, Ops
);
654 Ops
.push_back(NewChain
);
656 Ops
.append(OrigChain
->op_begin() + 1, OrigChain
->op_end());
657 CurDAG
->UpdateNodeOperands(OrigChain
.getNode(), Ops
);
658 CurDAG
->UpdateNodeOperands(Load
.getNode(), Call
.getOperand(0),
659 Load
.getOperand(1), Load
.getOperand(2));
662 Ops
.push_back(SDValue(Load
.getNode(), 1));
663 Ops
.append(Call
->op_begin() + 1, Call
->op_end());
664 CurDAG
->UpdateNodeOperands(Call
.getNode(), Ops
);
667 /// Return true if call address is a load and it can be
668 /// moved below CALLSEQ_START and the chains leading up to the call.
669 /// Return the CALLSEQ_START by reference as a second output.
670 /// In the case of a tail call, there isn't a callseq node between the call
671 /// chain and the load.
672 static bool isCalleeLoad(SDValue Callee
, SDValue
&Chain
, bool HasCallSeq
) {
673 // The transformation is somewhat dangerous if the call's chain was glued to
674 // the call. After MoveBelowOrigChain the load is moved between the call and
675 // the chain, this can create a cycle if the load is not folded. So it is
676 // *really* important that we are sure the load will be folded.
677 if (Callee
.getNode() == Chain
.getNode() || !Callee
.hasOneUse())
679 LoadSDNode
*LD
= dyn_cast
<LoadSDNode
>(Callee
.getNode());
682 LD
->getAddressingMode() != ISD::UNINDEXED
||
683 LD
->getExtensionType() != ISD::NON_EXTLOAD
)
686 // Now let's find the callseq_start.
687 while (HasCallSeq
&& Chain
.getOpcode() != ISD::CALLSEQ_START
) {
688 if (!Chain
.hasOneUse())
690 Chain
= Chain
.getOperand(0);
693 if (!Chain
.getNumOperands())
695 // Since we are not checking for AA here, conservatively abort if the chain
696 // writes to memory. It's not safe to move the callee (a load) across a store.
697 if (isa
<MemSDNode
>(Chain
.getNode()) &&
698 cast
<MemSDNode
>(Chain
.getNode())->writeMem())
700 if (Chain
.getOperand(0).getNode() == Callee
.getNode())
702 if (Chain
.getOperand(0).getOpcode() == ISD::TokenFactor
&&
703 Callee
.getValue(1).isOperandOf(Chain
.getOperand(0).getNode()) &&
704 Callee
.getValue(1).hasOneUse())
709 void X86DAGToDAGISel::PreprocessISelDAG() {
710 // OptFor[Min]Size are used in pattern predicates that isel is matching.
711 OptForSize
= MF
->getFunction().optForSize();
712 OptForMinSize
= MF
->getFunction().optForMinSize();
713 assert((!OptForMinSize
|| OptForSize
) && "OptForMinSize implies OptForSize");
715 for (SelectionDAG::allnodes_iterator I
= CurDAG
->allnodes_begin(),
716 E
= CurDAG
->allnodes_end(); I
!= E
; ) {
717 SDNode
*N
= &*I
++; // Preincrement iterator to avoid invalidation issues.
719 // If this is a target specific AND node with no flag usages, turn it back
720 // into ISD::AND to enable test instruction matching.
721 if (N
->getOpcode() == X86ISD::AND
&& !N
->hasAnyUseOfValue(1)) {
722 SDValue Res
= CurDAG
->getNode(ISD::AND
, SDLoc(N
), N
->getValueType(0),
723 N
->getOperand(0), N
->getOperand(1));
725 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Res
);
727 CurDAG
->DeleteNode(N
);
731 if (OptLevel
!= CodeGenOpt::None
&&
732 // Only do this when the target can fold the load into the call or
734 !Subtarget
->useRetpolineIndirectCalls() &&
735 ((N
->getOpcode() == X86ISD::CALL
&& !Subtarget
->slowTwoMemOps()) ||
736 (N
->getOpcode() == X86ISD::TC_RETURN
&&
737 (Subtarget
->is64Bit() ||
738 !getTargetMachine().isPositionIndependent())))) {
739 /// Also try moving call address load from outside callseq_start to just
740 /// before the call to allow it to be folded.
758 bool HasCallSeq
= N
->getOpcode() == X86ISD::CALL
;
759 SDValue Chain
= N
->getOperand(0);
760 SDValue Load
= N
->getOperand(1);
761 if (!isCalleeLoad(Load
, Chain
, HasCallSeq
))
763 moveBelowOrigChain(CurDAG
, Load
, SDValue(N
, 0), Chain
);
768 // Lower fpround and fpextend nodes that target the FP stack to be store and
769 // load to the stack. This is a gross hack. We would like to simply mark
770 // these as being illegal, but when we do that, legalize produces these when
771 // it expands calls, then expands these in the same legalize pass. We would
772 // like dag combine to be able to hack on these between the call expansion
773 // and the node legalization. As such this pass basically does "really
774 // late" legalization of these inline with the X86 isel pass.
775 // FIXME: This should only happen when not compiled with -O0.
776 if (N
->getOpcode() != ISD::FP_ROUND
&& N
->getOpcode() != ISD::FP_EXTEND
)
779 MVT SrcVT
= N
->getOperand(0).getSimpleValueType();
780 MVT DstVT
= N
->getSimpleValueType(0);
782 // If any of the sources are vectors, no fp stack involved.
783 if (SrcVT
.isVector() || DstVT
.isVector())
786 // If the source and destination are SSE registers, then this is a legal
787 // conversion that should not be lowered.
788 const X86TargetLowering
*X86Lowering
=
789 static_cast<const X86TargetLowering
*>(TLI
);
790 bool SrcIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(SrcVT
);
791 bool DstIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(DstVT
);
792 if (SrcIsSSE
&& DstIsSSE
)
795 if (!SrcIsSSE
&& !DstIsSSE
) {
796 // If this is an FPStack extension, it is a noop.
797 if (N
->getOpcode() == ISD::FP_EXTEND
)
799 // If this is a value-preserving FPStack truncation, it is a noop.
800 if (N
->getConstantOperandVal(1))
804 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
805 // FPStack has extload and truncstore. SSE can fold direct loads into other
806 // operations. Based on this, decide what we want to do.
808 if (N
->getOpcode() == ISD::FP_ROUND
)
809 MemVT
= DstVT
; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
811 MemVT
= SrcIsSSE
? SrcVT
: DstVT
;
813 SDValue MemTmp
= CurDAG
->CreateStackTemporary(MemVT
);
816 // FIXME: optimize the case where the src/dest is a load or store?
818 CurDAG
->getTruncStore(CurDAG
->getEntryNode(), dl
, N
->getOperand(0),
819 MemTmp
, MachinePointerInfo(), MemVT
);
820 SDValue Result
= CurDAG
->getExtLoad(ISD::EXTLOAD
, dl
, DstVT
, Store
, MemTmp
,
821 MachinePointerInfo(), MemVT
);
823 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
824 // extload we created. This will cause general havok on the dag because
825 // anything below the conversion could be folded into other existing nodes.
826 // To avoid invalidating 'I', back it up to the convert node.
828 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Result
);
830 // Now that we did that, the node is dead. Increment the iterator to the
831 // next node to process, then delete N.
833 CurDAG
->DeleteNode(N
);
838 void X86DAGToDAGISel::PostprocessISelDAG() {
839 // Skip peepholes at -O0.
840 if (TM
.getOptLevel() == CodeGenOpt::None
)
843 // Attempt to remove vectors moves that were inserted to zero upper bits.
845 SelectionDAG::allnodes_iterator
Position(CurDAG
->getRoot().getNode());
848 while (Position
!= CurDAG
->allnodes_begin()) {
849 SDNode
*N
= &*--Position
;
850 // Skip dead nodes and any non-machine opcodes.
851 if (N
->use_empty() || !N
->isMachineOpcode())
854 if (N
->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG
)
857 unsigned SubRegIdx
= N
->getConstantOperandVal(2);
858 if (SubRegIdx
!= X86::sub_xmm
&& SubRegIdx
!= X86::sub_ymm
)
861 SDValue Move
= N
->getOperand(1);
862 if (!Move
.isMachineOpcode())
865 // Make sure its one of the move opcodes we recognize.
866 switch (Move
.getMachineOpcode()) {
869 case X86::VMOVAPDrr
: case X86::VMOVUPDrr
:
870 case X86::VMOVAPSrr
: case X86::VMOVUPSrr
:
871 case X86::VMOVDQArr
: case X86::VMOVDQUrr
:
872 case X86::VMOVAPDYrr
: case X86::VMOVUPDYrr
:
873 case X86::VMOVAPSYrr
: case X86::VMOVUPSYrr
:
874 case X86::VMOVDQAYrr
: case X86::VMOVDQUYrr
:
875 case X86::VMOVAPDZ128rr
: case X86::VMOVUPDZ128rr
:
876 case X86::VMOVAPSZ128rr
: case X86::VMOVUPSZ128rr
:
877 case X86::VMOVDQA32Z128rr
: case X86::VMOVDQU32Z128rr
:
878 case X86::VMOVDQA64Z128rr
: case X86::VMOVDQU64Z128rr
:
879 case X86::VMOVAPDZ256rr
: case X86::VMOVUPDZ256rr
:
880 case X86::VMOVAPSZ256rr
: case X86::VMOVUPSZ256rr
:
881 case X86::VMOVDQA32Z256rr
: case X86::VMOVDQU32Z256rr
:
882 case X86::VMOVDQA64Z256rr
: case X86::VMOVDQU64Z256rr
:
886 SDValue In
= Move
.getOperand(0);
887 if (!In
.isMachineOpcode() ||
888 In
.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END
)
891 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
892 // the SHA instructions which use a legacy encoding.
893 uint64_t TSFlags
= getInstrInfo()->get(In
.getMachineOpcode()).TSFlags
;
894 if ((TSFlags
& X86II::EncodingMask
) != X86II::VEX
&&
895 (TSFlags
& X86II::EncodingMask
) != X86II::EVEX
&&
896 (TSFlags
& X86II::EncodingMask
) != X86II::XOP
)
899 // Producing instruction is another vector instruction. We can drop the
901 CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0), In
, N
->getOperand(2));
903 // If the move is now dead, delete it.
904 if (Move
.getNode()->use_empty())
905 CurDAG
->RemoveDeadNode(Move
.getNode());
910 /// Emit any code that needs to be executed only in the main function.
911 void X86DAGToDAGISel::emitSpecialCodeForMain() {
912 if (Subtarget
->isTargetCygMing()) {
913 TargetLowering::ArgListTy Args
;
914 auto &DL
= CurDAG
->getDataLayout();
916 TargetLowering::CallLoweringInfo
CLI(*CurDAG
);
917 CLI
.setChain(CurDAG
->getRoot())
918 .setCallee(CallingConv::C
, Type::getVoidTy(*CurDAG
->getContext()),
919 CurDAG
->getExternalSymbol("__main", TLI
->getPointerTy(DL
)),
921 const TargetLowering
&TLI
= CurDAG
->getTargetLoweringInfo();
922 std::pair
<SDValue
, SDValue
> Result
= TLI
.LowerCallTo(CLI
);
923 CurDAG
->setRoot(Result
.second
);
927 void X86DAGToDAGISel::EmitFunctionEntryCode() {
928 // If this is main, emit special code for main.
929 const Function
&F
= MF
->getFunction();
930 if (F
.hasExternalLinkage() && F
.getName() == "main")
931 emitSpecialCodeForMain();
934 static bool isDispSafeForFrameIndex(int64_t Val
) {
935 // On 64-bit platforms, we can run into an issue where a frame index
936 // includes a displacement that, when added to the explicit displacement,
937 // will overflow the displacement field. Assuming that the frame index
938 // displacement fits into a 31-bit integer (which is only slightly more
939 // aggressive than the current fundamental assumption that it fits into
940 // a 32-bit integer), a 31-bit disp should always be safe.
941 return isInt
<31>(Val
);
944 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset
,
945 X86ISelAddressMode
&AM
) {
946 // If there's no offset to fold, we don't need to do any work.
950 // Cannot combine ExternalSymbol displacements with integer offsets.
951 if (AM
.ES
|| AM
.MCSym
)
954 int64_t Val
= AM
.Disp
+ Offset
;
955 CodeModel::Model M
= TM
.getCodeModel();
956 if (Subtarget
->is64Bit()) {
957 if (!X86::isOffsetSuitableForCodeModel(Val
, M
,
958 AM
.hasSymbolicDisplacement()))
960 // In addition to the checks required for a register base, check that
961 // we do not try to use an unsafe Disp with a frame index.
962 if (AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
&&
963 !isDispSafeForFrameIndex(Val
))
971 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode
*N
, X86ISelAddressMode
&AM
){
972 SDValue Address
= N
->getOperand(1);
974 // load gs:0 -> GS segment register.
975 // load fs:0 -> FS segment register.
977 // This optimization is valid because the GNU TLS model defines that
978 // gs:0 (or fs:0 on X86-64) contains its own address.
979 // For more information see http://people.redhat.com/drepper/tls.pdf
980 if (ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Address
))
981 if (C
->getSExtValue() == 0 && AM
.Segment
.getNode() == nullptr &&
982 (Subtarget
->isTargetGlibc() || Subtarget
->isTargetAndroid() ||
983 Subtarget
->isTargetFuchsia()))
984 switch (N
->getPointerInfo().getAddrSpace()) {
986 AM
.Segment
= CurDAG
->getRegister(X86::GS
, MVT::i16
);
989 AM
.Segment
= CurDAG
->getRegister(X86::FS
, MVT::i16
);
991 // Address space 258 is not handled here, because it is not used to
992 // address TLS areas.
998 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
999 /// mode. These wrap things that will resolve down into a symbol reference.
1000 /// If no match is possible, this returns true, otherwise it returns false.
1001 bool X86DAGToDAGISel::matchWrapper(SDValue N
, X86ISelAddressMode
&AM
) {
1002 // If the addressing mode already has a symbol as the displacement, we can
1003 // never match another symbol.
1004 if (AM
.hasSymbolicDisplacement())
1007 bool IsRIPRel
= N
.getOpcode() == X86ISD::WrapperRIP
;
1009 // We can't use an addressing mode in the 64-bit large code model. In the
1010 // medium code model, we use can use an mode when RIP wrappers are present.
1011 // That signifies access to globals that are known to be "near", such as the
1013 CodeModel::Model M
= TM
.getCodeModel();
1014 if (Subtarget
->is64Bit() &&
1015 (M
== CodeModel::Large
|| (M
== CodeModel::Medium
&& !IsRIPRel
)))
1018 // Base and index reg must be 0 in order to use %rip as base.
1019 if (IsRIPRel
&& AM
.hasBaseOrIndexReg())
1022 // Make a local copy in case we can't do this fold.
1023 X86ISelAddressMode Backup
= AM
;
1026 SDValue N0
= N
.getOperand(0);
1027 if (GlobalAddressSDNode
*G
= dyn_cast
<GlobalAddressSDNode
>(N0
)) {
1028 AM
.GV
= G
->getGlobal();
1029 AM
.SymbolFlags
= G
->getTargetFlags();
1030 Offset
= G
->getOffset();
1031 } else if (ConstantPoolSDNode
*CP
= dyn_cast
<ConstantPoolSDNode
>(N0
)) {
1032 AM
.CP
= CP
->getConstVal();
1033 AM
.Align
= CP
->getAlignment();
1034 AM
.SymbolFlags
= CP
->getTargetFlags();
1035 Offset
= CP
->getOffset();
1036 } else if (ExternalSymbolSDNode
*S
= dyn_cast
<ExternalSymbolSDNode
>(N0
)) {
1037 AM
.ES
= S
->getSymbol();
1038 AM
.SymbolFlags
= S
->getTargetFlags();
1039 } else if (auto *S
= dyn_cast
<MCSymbolSDNode
>(N0
)) {
1040 AM
.MCSym
= S
->getMCSymbol();
1041 } else if (JumpTableSDNode
*J
= dyn_cast
<JumpTableSDNode
>(N0
)) {
1042 AM
.JT
= J
->getIndex();
1043 AM
.SymbolFlags
= J
->getTargetFlags();
1044 } else if (BlockAddressSDNode
*BA
= dyn_cast
<BlockAddressSDNode
>(N0
)) {
1045 AM
.BlockAddr
= BA
->getBlockAddress();
1046 AM
.SymbolFlags
= BA
->getTargetFlags();
1047 Offset
= BA
->getOffset();
1049 llvm_unreachable("Unhandled symbol reference node.");
1051 if (foldOffsetIntoAddress(Offset
, AM
)) {
1057 AM
.setBaseReg(CurDAG
->getRegister(X86::RIP
, MVT::i64
));
1059 // Commit the changes now that we know this fold is safe.
1063 /// Add the specified node to the specified addressing mode, returning true if
1064 /// it cannot be done. This just pattern matches for the addressing mode.
1065 bool X86DAGToDAGISel::matchAddress(SDValue N
, X86ISelAddressMode
&AM
) {
1066 if (matchAddressRecursively(N
, AM
, 0))
1069 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1070 // a smaller encoding and avoids a scaled-index.
1071 if (AM
.Scale
== 2 &&
1072 AM
.BaseType
== X86ISelAddressMode::RegBase
&&
1073 AM
.Base_Reg
.getNode() == nullptr) {
1074 AM
.Base_Reg
= AM
.IndexReg
;
1078 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1079 // because it has a smaller encoding.
1080 // TODO: Which other code models can use this?
1081 if (TM
.getCodeModel() == CodeModel::Small
&&
1082 Subtarget
->is64Bit() &&
1084 AM
.BaseType
== X86ISelAddressMode::RegBase
&&
1085 AM
.Base_Reg
.getNode() == nullptr &&
1086 AM
.IndexReg
.getNode() == nullptr &&
1087 AM
.SymbolFlags
== X86II::MO_NO_FLAG
&&
1088 AM
.hasSymbolicDisplacement())
1089 AM
.Base_Reg
= CurDAG
->getRegister(X86::RIP
, MVT::i64
);
1094 bool X86DAGToDAGISel::matchAdd(SDValue N
, X86ISelAddressMode
&AM
,
1096 // Add an artificial use to this node so that we can keep track of
1097 // it if it gets CSE'd with a different node.
1098 HandleSDNode
Handle(N
);
1100 X86ISelAddressMode Backup
= AM
;
1101 if (!matchAddressRecursively(N
.getOperand(0), AM
, Depth
+1) &&
1102 !matchAddressRecursively(Handle
.getValue().getOperand(1), AM
, Depth
+1))
1106 // Try again after commuting the operands.
1107 if (!matchAddressRecursively(Handle
.getValue().getOperand(1), AM
, Depth
+1) &&
1108 !matchAddressRecursively(Handle
.getValue().getOperand(0), AM
, Depth
+1))
1112 // If we couldn't fold both operands into the address at the same time,
1113 // see if we can just put each operand into a register and fold at least
1115 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&&
1116 !AM
.Base_Reg
.getNode() &&
1117 !AM
.IndexReg
.getNode()) {
1118 N
= Handle
.getValue();
1119 AM
.Base_Reg
= N
.getOperand(0);
1120 AM
.IndexReg
= N
.getOperand(1);
1124 N
= Handle
.getValue();
1128 // Insert a node into the DAG at least before the Pos node's position. This
1129 // will reposition the node as needed, and will assign it a node ID that is <=
1130 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1131 // IDs! The selection DAG must no longer depend on their uniqueness when this
1133 static void insertDAGNode(SelectionDAG
&DAG
, SDValue Pos
, SDValue N
) {
1134 if (N
->getNodeId() == -1 ||
1135 (SelectionDAGISel::getUninvalidatedNodeId(N
.getNode()) >
1136 SelectionDAGISel::getUninvalidatedNodeId(Pos
.getNode()))) {
1137 DAG
.RepositionNode(Pos
->getIterator(), N
.getNode());
1138 // Mark Node as invalid for pruning as after this it may be a successor to a
1139 // selected node but otherwise be in the same position of Pos.
1140 // Conservatively mark it with the same -abs(Id) to assure node id
1141 // invariant is preserved.
1142 N
->setNodeId(Pos
->getNodeId());
1143 SelectionDAGISel::InvalidateNodeId(N
.getNode());
1147 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1148 // safe. This allows us to convert the shift and and into an h-register
1149 // extract and a scaled index. Returns false if the simplification is
1151 static bool foldMaskAndShiftToExtract(SelectionDAG
&DAG
, SDValue N
,
1153 SDValue Shift
, SDValue X
,
1154 X86ISelAddressMode
&AM
) {
1155 if (Shift
.getOpcode() != ISD::SRL
||
1156 !isa
<ConstantSDNode
>(Shift
.getOperand(1)) ||
1160 int ScaleLog
= 8 - Shift
.getConstantOperandVal(1);
1161 if (ScaleLog
<= 0 || ScaleLog
>= 4 ||
1162 Mask
!= (0xffu
<< ScaleLog
))
1165 MVT VT
= N
.getSimpleValueType();
1167 SDValue Eight
= DAG
.getConstant(8, DL
, MVT::i8
);
1168 SDValue NewMask
= DAG
.getConstant(0xff, DL
, VT
);
1169 SDValue Srl
= DAG
.getNode(ISD::SRL
, DL
, VT
, X
, Eight
);
1170 SDValue And
= DAG
.getNode(ISD::AND
, DL
, VT
, Srl
, NewMask
);
1171 SDValue ShlCount
= DAG
.getConstant(ScaleLog
, DL
, MVT::i8
);
1172 SDValue Shl
= DAG
.getNode(ISD::SHL
, DL
, VT
, And
, ShlCount
);
1174 // Insert the new nodes into the topological ordering. We must do this in
1175 // a valid topological ordering as nothing is going to go back and re-sort
1176 // these nodes. We continually insert before 'N' in sequence as this is
1177 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1178 // hierarchy left to express.
1179 insertDAGNode(DAG
, N
, Eight
);
1180 insertDAGNode(DAG
, N
, Srl
);
1181 insertDAGNode(DAG
, N
, NewMask
);
1182 insertDAGNode(DAG
, N
, And
);
1183 insertDAGNode(DAG
, N
, ShlCount
);
1184 insertDAGNode(DAG
, N
, Shl
);
1185 DAG
.ReplaceAllUsesWith(N
, Shl
);
1187 AM
.Scale
= (1 << ScaleLog
);
1191 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1192 // allows us to fold the shift into this addressing mode. Returns false if the
1193 // transform succeeded.
1194 static bool foldMaskedShiftToScaledMask(SelectionDAG
&DAG
, SDValue N
,
1196 SDValue Shift
, SDValue X
,
1197 X86ISelAddressMode
&AM
) {
1198 if (Shift
.getOpcode() != ISD::SHL
||
1199 !isa
<ConstantSDNode
>(Shift
.getOperand(1)))
1202 // Not likely to be profitable if either the AND or SHIFT node has more
1203 // than one use (unless all uses are for address computation). Besides,
1204 // isel mechanism requires their node ids to be reused.
1205 if (!N
.hasOneUse() || !Shift
.hasOneUse())
1208 // Verify that the shift amount is something we can fold.
1209 unsigned ShiftAmt
= Shift
.getConstantOperandVal(1);
1210 if (ShiftAmt
!= 1 && ShiftAmt
!= 2 && ShiftAmt
!= 3)
1213 MVT VT
= N
.getSimpleValueType();
1215 SDValue NewMask
= DAG
.getConstant(Mask
>> ShiftAmt
, DL
, VT
);
1216 SDValue NewAnd
= DAG
.getNode(ISD::AND
, DL
, VT
, X
, NewMask
);
1217 SDValue NewShift
= DAG
.getNode(ISD::SHL
, DL
, VT
, NewAnd
, Shift
.getOperand(1));
1219 // Insert the new nodes into the topological ordering. We must do this in
1220 // a valid topological ordering as nothing is going to go back and re-sort
1221 // these nodes. We continually insert before 'N' in sequence as this is
1222 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1223 // hierarchy left to express.
1224 insertDAGNode(DAG
, N
, NewMask
);
1225 insertDAGNode(DAG
, N
, NewAnd
);
1226 insertDAGNode(DAG
, N
, NewShift
);
1227 DAG
.ReplaceAllUsesWith(N
, NewShift
);
1229 AM
.Scale
= 1 << ShiftAmt
;
1230 AM
.IndexReg
= NewAnd
;
1234 // Implement some heroics to detect shifts of masked values where the mask can
1235 // be replaced by extending the shift and undoing that in the addressing mode
1236 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1237 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1238 // the addressing mode. This results in code such as:
1240 // int f(short *y, int *lookup_table) {
1242 // return *y + lookup_table[*y >> 11];
1246 // movzwl (%rdi), %eax
1249 // addl (%rsi,%rcx,4), %eax
1252 // movzwl (%rdi), %eax
1256 // addl (%rsi,%rcx), %eax
1258 // Note that this function assumes the mask is provided as a mask *after* the
1259 // value is shifted. The input chain may or may not match that, but computing
1260 // such a mask is trivial.
1261 static bool foldMaskAndShiftToScale(SelectionDAG
&DAG
, SDValue N
,
1263 SDValue Shift
, SDValue X
,
1264 X86ISelAddressMode
&AM
) {
1265 if (Shift
.getOpcode() != ISD::SRL
|| !Shift
.hasOneUse() ||
1266 !isa
<ConstantSDNode
>(Shift
.getOperand(1)))
1269 unsigned ShiftAmt
= Shift
.getConstantOperandVal(1);
1270 unsigned MaskLZ
= countLeadingZeros(Mask
);
1271 unsigned MaskTZ
= countTrailingZeros(Mask
);
1273 // The amount of shift we're trying to fit into the addressing mode is taken
1274 // from the trailing zeros of the mask.
1275 unsigned AMShiftAmt
= MaskTZ
;
1277 // There is nothing we can do here unless the mask is removing some bits.
1278 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1279 if (AMShiftAmt
<= 0 || AMShiftAmt
> 3) return true;
1281 // We also need to ensure that mask is a continuous run of bits.
1282 if (countTrailingOnes(Mask
>> MaskTZ
) + MaskTZ
+ MaskLZ
!= 64) return true;
1284 // Scale the leading zero count down based on the actual size of the value.
1285 // Also scale it down based on the size of the shift.
1286 unsigned ScaleDown
= (64 - X
.getSimpleValueType().getSizeInBits()) + ShiftAmt
;
1287 if (MaskLZ
< ScaleDown
)
1289 MaskLZ
-= ScaleDown
;
1291 // The final check is to ensure that any masked out high bits of X are
1292 // already known to be zero. Otherwise, the mask has a semantic impact
1293 // other than masking out a couple of low bits. Unfortunately, because of
1294 // the mask, zero extensions will be removed from operands in some cases.
1295 // This code works extra hard to look through extensions because we can
1296 // replace them with zero extensions cheaply if necessary.
1297 bool ReplacingAnyExtend
= false;
1298 if (X
.getOpcode() == ISD::ANY_EXTEND
) {
1299 unsigned ExtendBits
= X
.getSimpleValueType().getSizeInBits() -
1300 X
.getOperand(0).getSimpleValueType().getSizeInBits();
1301 // Assume that we'll replace the any-extend with a zero-extend, and
1302 // narrow the search to the extended value.
1303 X
= X
.getOperand(0);
1304 MaskLZ
= ExtendBits
> MaskLZ
? 0 : MaskLZ
- ExtendBits
;
1305 ReplacingAnyExtend
= true;
1307 APInt MaskedHighBits
=
1308 APInt::getHighBitsSet(X
.getSimpleValueType().getSizeInBits(), MaskLZ
);
1310 DAG
.computeKnownBits(X
, Known
);
1311 if (MaskedHighBits
!= Known
.Zero
) return true;
1313 // We've identified a pattern that can be transformed into a single shift
1314 // and an addressing mode. Make it so.
1315 MVT VT
= N
.getSimpleValueType();
1316 if (ReplacingAnyExtend
) {
1317 assert(X
.getValueType() != VT
);
1318 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1319 SDValue NewX
= DAG
.getNode(ISD::ZERO_EXTEND
, SDLoc(X
), VT
, X
);
1320 insertDAGNode(DAG
, N
, NewX
);
1324 SDValue NewSRLAmt
= DAG
.getConstant(ShiftAmt
+ AMShiftAmt
, DL
, MVT::i8
);
1325 SDValue NewSRL
= DAG
.getNode(ISD::SRL
, DL
, VT
, X
, NewSRLAmt
);
1326 SDValue NewSHLAmt
= DAG
.getConstant(AMShiftAmt
, DL
, MVT::i8
);
1327 SDValue NewSHL
= DAG
.getNode(ISD::SHL
, DL
, VT
, NewSRL
, NewSHLAmt
);
1329 // Insert the new nodes into the topological ordering. We must do this in
1330 // a valid topological ordering as nothing is going to go back and re-sort
1331 // these nodes. We continually insert before 'N' in sequence as this is
1332 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1333 // hierarchy left to express.
1334 insertDAGNode(DAG
, N
, NewSRLAmt
);
1335 insertDAGNode(DAG
, N
, NewSRL
);
1336 insertDAGNode(DAG
, N
, NewSHLAmt
);
1337 insertDAGNode(DAG
, N
, NewSHL
);
1338 DAG
.ReplaceAllUsesWith(N
, NewSHL
);
1340 AM
.Scale
= 1 << AMShiftAmt
;
1341 AM
.IndexReg
= NewSRL
;
1345 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N
, X86ISelAddressMode
&AM
,
1349 dbgs() << "MatchAddress: ";
1354 return matchAddressBase(N
, AM
);
1356 // If this is already a %rip relative address, we can only merge immediates
1357 // into it. Instead of handling this in every case, we handle it here.
1358 // RIP relative addressing: %rip + 32-bit displacement!
1359 if (AM
.isRIPRelative()) {
1360 // FIXME: JumpTable and ExternalSymbol address currently don't like
1361 // displacements. It isn't very important, but this should be fixed for
1363 if (!(AM
.ES
|| AM
.MCSym
) && AM
.JT
!= -1)
1366 if (ConstantSDNode
*Cst
= dyn_cast
<ConstantSDNode
>(N
))
1367 if (!foldOffsetIntoAddress(Cst
->getSExtValue(), AM
))
1372 switch (N
.getOpcode()) {
1374 case ISD::LOCAL_RECOVER
: {
1375 if (!AM
.hasSymbolicDisplacement() && AM
.Disp
== 0)
1376 if (const auto *ESNode
= dyn_cast
<MCSymbolSDNode
>(N
.getOperand(0))) {
1377 // Use the symbol and don't prefix it.
1378 AM
.MCSym
= ESNode
->getMCSymbol();
1383 case ISD::Constant
: {
1384 uint64_t Val
= cast
<ConstantSDNode
>(N
)->getSExtValue();
1385 if (!foldOffsetIntoAddress(Val
, AM
))
1390 case X86ISD::Wrapper
:
1391 case X86ISD::WrapperRIP
:
1392 if (!matchWrapper(N
, AM
))
1397 if (!matchLoadInAddress(cast
<LoadSDNode
>(N
), AM
))
1401 case ISD::FrameIndex
:
1402 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&&
1403 AM
.Base_Reg
.getNode() == nullptr &&
1404 (!Subtarget
->is64Bit() || isDispSafeForFrameIndex(AM
.Disp
))) {
1405 AM
.BaseType
= X86ISelAddressMode::FrameIndexBase
;
1406 AM
.Base_FrameIndex
= cast
<FrameIndexSDNode
>(N
)->getIndex();
1412 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1)
1415 if (ConstantSDNode
*CN
= dyn_cast
<ConstantSDNode
>(N
.getOperand(1))) {
1416 unsigned Val
= CN
->getZExtValue();
1417 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1418 // that the base operand remains free for further matching. If
1419 // the base doesn't end up getting used, a post-processing step
1420 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1421 if (Val
== 1 || Val
== 2 || Val
== 3) {
1422 AM
.Scale
= 1 << Val
;
1423 SDValue ShVal
= N
.getOperand(0);
1425 // Okay, we know that we have a scale by now. However, if the scaled
1426 // value is an add of something and a constant, we can fold the
1427 // constant into the disp field here.
1428 if (CurDAG
->isBaseWithConstantOffset(ShVal
)) {
1429 AM
.IndexReg
= ShVal
.getOperand(0);
1430 ConstantSDNode
*AddVal
= cast
<ConstantSDNode
>(ShVal
.getOperand(1));
1431 uint64_t Disp
= (uint64_t)AddVal
->getSExtValue() << Val
;
1432 if (!foldOffsetIntoAddress(Disp
, AM
))
1436 AM
.IndexReg
= ShVal
;
1443 // Scale must not be used already.
1444 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1) break;
1446 SDValue And
= N
.getOperand(0);
1447 if (And
.getOpcode() != ISD::AND
) break;
1448 SDValue X
= And
.getOperand(0);
1450 // We only handle up to 64-bit values here as those are what matter for
1451 // addressing mode optimizations.
1452 if (X
.getSimpleValueType().getSizeInBits() > 64) break;
1454 // The mask used for the transform is expected to be post-shift, but we
1455 // found the shift first so just apply the shift to the mask before passing
1457 if (!isa
<ConstantSDNode
>(N
.getOperand(1)) ||
1458 !isa
<ConstantSDNode
>(And
.getOperand(1)))
1460 uint64_t Mask
= And
.getConstantOperandVal(1) >> N
.getConstantOperandVal(1);
1462 // Try to fold the mask and shift into the scale, and return false if we
1464 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
, N
, X
, AM
))
1469 case ISD::SMUL_LOHI
:
1470 case ISD::UMUL_LOHI
:
1471 // A mul_lohi where we need the low part can be folded as a plain multiply.
1472 if (N
.getResNo() != 0) break;
1475 case X86ISD::MUL_IMM
:
1476 // X*[3,5,9] -> X+X*[2,4,8]
1477 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&&
1478 AM
.Base_Reg
.getNode() == nullptr &&
1479 AM
.IndexReg
.getNode() == nullptr) {
1480 if (ConstantSDNode
*CN
= dyn_cast
<ConstantSDNode
>(N
.getOperand(1)))
1481 if (CN
->getZExtValue() == 3 || CN
->getZExtValue() == 5 ||
1482 CN
->getZExtValue() == 9) {
1483 AM
.Scale
= unsigned(CN
->getZExtValue())-1;
1485 SDValue MulVal
= N
.getOperand(0);
1488 // Okay, we know that we have a scale by now. However, if the scaled
1489 // value is an add of something and a constant, we can fold the
1490 // constant into the disp field here.
1491 if (MulVal
.getNode()->getOpcode() == ISD::ADD
&& MulVal
.hasOneUse() &&
1492 isa
<ConstantSDNode
>(MulVal
.getOperand(1))) {
1493 Reg
= MulVal
.getOperand(0);
1494 ConstantSDNode
*AddVal
=
1495 cast
<ConstantSDNode
>(MulVal
.getOperand(1));
1496 uint64_t Disp
= AddVal
->getSExtValue() * CN
->getZExtValue();
1497 if (foldOffsetIntoAddress(Disp
, AM
))
1498 Reg
= N
.getOperand(0);
1500 Reg
= N
.getOperand(0);
1503 AM
.IndexReg
= AM
.Base_Reg
= Reg
;
1510 // Given A-B, if A can be completely folded into the address and
1511 // the index field with the index field unused, use -B as the index.
1512 // This is a win if a has multiple parts that can be folded into
1513 // the address. Also, this saves a mov if the base register has
1514 // other uses, since it avoids a two-address sub instruction, however
1515 // it costs an additional mov if the index register has other uses.
1517 // Add an artificial use to this node so that we can keep track of
1518 // it if it gets CSE'd with a different node.
1519 HandleSDNode
Handle(N
);
1521 // Test if the LHS of the sub can be folded.
1522 X86ISelAddressMode Backup
= AM
;
1523 if (matchAddressRecursively(N
.getOperand(0), AM
, Depth
+1)) {
1527 // Test if the index field is free for use.
1528 if (AM
.IndexReg
.getNode() || AM
.isRIPRelative()) {
1534 SDValue RHS
= Handle
.getValue().getOperand(1);
1535 // If the RHS involves a register with multiple uses, this
1536 // transformation incurs an extra mov, due to the neg instruction
1537 // clobbering its operand.
1538 if (!RHS
.getNode()->hasOneUse() ||
1539 RHS
.getNode()->getOpcode() == ISD::CopyFromReg
||
1540 RHS
.getNode()->getOpcode() == ISD::TRUNCATE
||
1541 RHS
.getNode()->getOpcode() == ISD::ANY_EXTEND
||
1542 (RHS
.getNode()->getOpcode() == ISD::ZERO_EXTEND
&&
1543 RHS
.getOperand(0).getValueType() == MVT::i32
))
1545 // If the base is a register with multiple uses, this
1546 // transformation may save a mov.
1547 // FIXME: Don't rely on DELETED_NODEs.
1548 if ((AM
.BaseType
== X86ISelAddressMode::RegBase
&& AM
.Base_Reg
.getNode() &&
1549 AM
.Base_Reg
->getOpcode() != ISD::DELETED_NODE
&&
1550 !AM
.Base_Reg
.getNode()->hasOneUse()) ||
1551 AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
1553 // If the folded LHS was interesting, this transformation saves
1554 // address arithmetic.
1555 if ((AM
.hasSymbolicDisplacement() && !Backup
.hasSymbolicDisplacement()) +
1556 ((AM
.Disp
!= 0) && (Backup
.Disp
== 0)) +
1557 (AM
.Segment
.getNode() && !Backup
.Segment
.getNode()) >= 2)
1559 // If it doesn't look like it may be an overall win, don't do it.
1565 // Ok, the transformation is legal and appears profitable. Go for it.
1566 SDValue Zero
= CurDAG
->getConstant(0, dl
, N
.getValueType());
1567 SDValue Neg
= CurDAG
->getNode(ISD::SUB
, dl
, N
.getValueType(), Zero
, RHS
);
1571 // Insert the new nodes into the topological ordering.
1572 insertDAGNode(*CurDAG
, Handle
.getValue(), Zero
);
1573 insertDAGNode(*CurDAG
, Handle
.getValue(), Neg
);
1578 if (!matchAdd(N
, AM
, Depth
))
1583 // We want to look through a transform in InstCombine and DAGCombiner that
1584 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
1585 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
1586 // An 'lea' can then be used to match the shift (multiply) and add:
1588 // lea (%rsi, %rdi, 8), %rax
1589 if (CurDAG
->haveNoCommonBitsSet(N
.getOperand(0), N
.getOperand(1)) &&
1590 !matchAdd(N
, AM
, Depth
))
1595 // Perform some heroic transforms on an and of a constant-count shift
1596 // with a constant to enable use of the scaled offset field.
1598 // Scale must not be used already.
1599 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1) break;
1601 SDValue Shift
= N
.getOperand(0);
1602 if (Shift
.getOpcode() != ISD::SRL
&& Shift
.getOpcode() != ISD::SHL
) break;
1603 SDValue X
= Shift
.getOperand(0);
1605 // We only handle up to 64-bit values here as those are what matter for
1606 // addressing mode optimizations.
1607 if (X
.getSimpleValueType().getSizeInBits() > 64) break;
1609 if (!isa
<ConstantSDNode
>(N
.getOperand(1)))
1611 uint64_t Mask
= N
.getConstantOperandVal(1);
1613 // Try to fold the mask and shift into an extract and scale.
1614 if (!foldMaskAndShiftToExtract(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1617 // Try to fold the mask and shift directly into the scale.
1618 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1621 // Try to swap the mask and shift to place shifts which can be done as
1622 // a scale on the outside of the mask.
1623 if (!foldMaskedShiftToScaledMask(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1629 return matchAddressBase(N
, AM
);
1632 /// Helper for MatchAddress. Add the specified node to the
1633 /// specified addressing mode without any further recursion.
1634 bool X86DAGToDAGISel::matchAddressBase(SDValue N
, X86ISelAddressMode
&AM
) {
1635 // Is the base register already occupied?
1636 if (AM
.BaseType
!= X86ISelAddressMode::RegBase
|| AM
.Base_Reg
.getNode()) {
1637 // If so, check to see if the scale index register is set.
1638 if (!AM
.IndexReg
.getNode()) {
1644 // Otherwise, we cannot select it.
1648 // Default, generate it as a register.
1649 AM
.BaseType
= X86ISelAddressMode::RegBase
;
1654 /// Helper for selectVectorAddr. Handles things that can be folded into a
1655 /// gather scatter address. The index register and scale should have already
1657 bool X86DAGToDAGISel::matchVectorAddress(SDValue N
, X86ISelAddressMode
&AM
) {
1658 // TODO: Support other operations.
1659 switch (N
.getOpcode()) {
1660 case ISD::Constant
: {
1661 uint64_t Val
= cast
<ConstantSDNode
>(N
)->getSExtValue();
1662 if (!foldOffsetIntoAddress(Val
, AM
))
1666 case X86ISD::Wrapper
:
1667 if (!matchWrapper(N
, AM
))
1672 return matchAddressBase(N
, AM
);
1675 bool X86DAGToDAGISel::selectVectorAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
1676 SDValue
&Scale
, SDValue
&Index
,
1677 SDValue
&Disp
, SDValue
&Segment
) {
1678 X86ISelAddressMode AM
;
1679 auto *Mgs
= cast
<X86MaskedGatherScatterSDNode
>(Parent
);
1680 AM
.IndexReg
= Mgs
->getIndex();
1681 AM
.Scale
= cast
<ConstantSDNode
>(Mgs
->getScale())->getZExtValue();
1683 unsigned AddrSpace
= cast
<MemSDNode
>(Parent
)->getPointerInfo().getAddrSpace();
1684 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
1685 if (AddrSpace
== 256)
1686 AM
.Segment
= CurDAG
->getRegister(X86::GS
, MVT::i16
);
1687 if (AddrSpace
== 257)
1688 AM
.Segment
= CurDAG
->getRegister(X86::FS
, MVT::i16
);
1689 if (AddrSpace
== 258)
1690 AM
.Segment
= CurDAG
->getRegister(X86::SS
, MVT::i16
);
1692 // Try to match into the base and displacement fields.
1693 if (matchVectorAddress(N
, AM
))
1696 MVT VT
= N
.getSimpleValueType();
1697 if (AM
.BaseType
== X86ISelAddressMode::RegBase
) {
1698 if (!AM
.Base_Reg
.getNode())
1699 AM
.Base_Reg
= CurDAG
->getRegister(0, VT
);
1702 getAddressOperands(AM
, SDLoc(N
), Base
, Scale
, Index
, Disp
, Segment
);
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
    if (AddrSpace == 256)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == 257)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == 258)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  if (matchAddress(N, AM))
    return false;

  MVT VT = N.getSimpleValueType();
  if (AM.BaseType == X86ISelAddressMode::RegBase) {
    if (!AM.Base_Reg.getNode())
      AM.Base_Reg = CurDAG->getRegister(0, VT);
  }

  if (!AM.IndexReg.getNode())
    AM.IndexReg = CurDAG->getRegister(0, VT);

  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
  return true;
}
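// Worked example (illustrative): for a load of *(int *)(p + 4 * i + 16),
// matchAddress typically fills AM with Base_Reg = p, IndexReg = i, Scale = 4
// and Disp = 16, which getAddressOperands hands back as the operands of
//   movl 16(%rdi,%rsi,4), %eax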
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
  while (User != Root) {
    if (!User->hasOneUse())
      return false;
    User = *User->use_begin();
  }

  return true;
}
/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
///
/// We also return:
///   PatternChainNode: this is the matched node that has a chain input and
///   chain output.
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
                                          SDValue N, SDValue &Base,
                                          SDValue &Scale, SDValue &Index,
                                          SDValue &Disp, SDValue &Segment,
                                          SDValue &PatternNodeWithChain) {
  if (!hasSingleUsesFromRoot(Root, Parent))
    return false;

  // We can allow a full vector load here since narrowing a load is ok.
  if (ISD::isNON_EXTLoad(N.getNode())) {
    PatternNodeWithChain = N;
    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // We can also match the special zero extended load opcode.
  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
    PatternNodeWithChain = N;
    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
      auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
      return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
  // once. Otherwise the load might get duplicated and the chain output of the
  // duplicate load will not be observed by all dependencies.
  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
    PatternNodeWithChain = N.getOperand(0);
    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // Also handle the case where we explicitly require zeros in the top
  // elements. This is a vector shuffle from the zero vector.
  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
      // Check to see if the top elements are all zeros (or bitcast of zeros).
      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      N.getOperand(0).getNode()->hasOneUse()) {
    PatternNodeWithChain = N.getOperand(0).getOperand(0);
    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
      // Okay, this is a zero extending load. Fold it.
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  return false;
}
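// Illustrative case (sketch): (scalar_to_vector (load %p)) feeding a scalar
// SSE op can fold the load into the instruction's memory operand, e.g.
//   addss (%rdi), %xmm0
// instead of a separate movss+addss pair, since only the low element of the
// result is actually used.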
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    uint64_t ImmVal = CN->getZExtValue();
    if (!isUInt<32>(ImmVal))
      return false;

    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
    return true;
  }

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'.
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  if (N->getOpcode() != ISD::TargetGlobalAddress)
    return TM.getCodeModel() == CodeModel::Small;

  Optional<ConstantRange> CR =
      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
  if (!CR)
    return TM.getCodeModel() == CodeModel::Small;

  return CR->getUnsignedMax().ult(1ull << 32);
}
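// Example of the intent (illustrative): a 64-bit constant that fits in an
// unsigned 32-bit immediate can be materialized as
//   movl $0x12345678, %eax      (implicitly zeroing the upper 32 bits)
// instead of the 10-byte movabsq form; the symbolic cases above rely on the
// small code model or an absolute symbol range to make the same guarantee.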
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
  if (RN && RN->getReg() == 0)
    Base = CurDAG->getRegister(0, MVT::i64);
  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    Base = SDValue(CurDAG->getMachineNode(
                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
                       CurDAG->getTargetConstant(0, DL, MVT::i64),
                       Base,
                       CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)),
                   0);
  }

  RN = dyn_cast<RegisterSDNode>(Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(0, MVT::i64);
  else {
    assert(Index.getValueType() == MVT::i32 &&
           "Expect to be extending 32-bit registers for use in LEA");
    Index = SDValue(CurDAG->getMachineNode(
                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
                        CurDAG->getTargetConstant(0, DL, MVT::i64),
                        Index,
                        CurDAG->getTargetConstant(X86::sub_32bit, DL,
                                                  MVT::i32)),
                    0);
  }

  return true;
}
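// Sketch of the effect (illustrative): LEA64_32r computes a 32-bit result
// from a 64-bit address, e.g.
//   leal 8(%rdi,%rsi,4), %eax
// so any 32-bit Base/Index produced by selectLEAAddr is widened here with
// SUBREG_TO_REG before being used as a 64-bit address register.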
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert(T == AM.Segment);
  AM.Segment = Copy;

  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase)
    if (AM.Base_Reg.getNode())
      Complexity = 1;
    else
      AM.Base_Reg = CurDAG->getRegister(0, VT);
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;
  else
    AM.IndexReg = CurDAG->getRegister(0, VT);

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment);
  return true;
}
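// Rough intuition for the Complexity heuristic (illustrative): an address
// like 8(%rdi,%rsi,4) scores for base, index and displacement and becomes
//   leaq 8(%rdi,%rsi,4), %rax
// whereas a bare (,%rdi,2) scores too low and is better served by
//   addq %rdi, %rdi   (or a simple shift).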
/// This is only run on TargetGlobalTLSAddress nodes.
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                        SDValue &Scale, SDValue &Index,
                                        SDValue &Disp, SDValue &Segment) {
  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);

  X86ISelAddressMode AM;
  AM.GV = GA->getGlobal();
  AM.Disp += GA->getOffset();
  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
  AM.SymbolFlags = GA->getTargetFlags();

  if (N.getValueType() == MVT::i32) {
    AM.Scale = 1;
    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
  } else {
    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
  }

  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
  return true;
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
                                   N.getValueType());
    return true;
  }

  // Keep track of the original value type and whether this value was
  // truncated. If we see a truncation from pointer type to VT that truncates
  // bits that are known to be zero, we can use a narrow reference.
  EVT VT = N.getValueType();
  bool WasTruncated = false;
  if (N.getOpcode() == ISD::TRUNCATE) {
    WasTruncated = true;
    N = N.getOperand(0);
  }

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  // We can only use non-GlobalValues as immediates if they were not truncated,
  // as we do not have any range information. If we have a GlobalValue and the
  // address was not truncated, we can select it as an operand directly.
  unsigned Opc = N.getOperand(0)->getOpcode();
  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
    Op = N.getOperand(0);
    // We can only select the operand directly if we didn't have to look past a
    // truncate.
    return !WasTruncated;
  }

  // Check that the global's range fits into VT.
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
    return false;

  // Okay, we can use a narrow reference.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
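// Example of the narrow-reference case (illustrative): if a global is known
// via !absolute_symbol metadata to lie below 2^16, a truncating use such as
//   (trunc (X86Wrapper tglobaladdr:@sym) to i16)
// can be selected as a 16-bit immediate reference to @sym directly.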
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
                                     SDValue &Base, SDValue &Scale,
                                     SDValue &Index, SDValue &Disp,
                                     SDValue &Segment) {
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  auto &DL = MF->getDataLayout();
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  return CR && CR->getSignedMin().sge(-1ull << Width) &&
         CR->getSignedMax().slt(1ull << Width);
}
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// or OF bits to be accurate.
static bool hasNoSignedComparisonUses(SDNode *N) {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = N->use_begin(),
         UE = N->use_end(); UI != UE; ++UI) {
    // Only examine CopyToReg uses.
    if (UI->getOpcode() != ISD::CopyToReg)
      return false;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
        X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the opcode of the user.
      switch (FlagUI->getMachineOpcode()) {
      // These comparisons don't treat the most significant bit specially.
      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
      case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1:
      case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1:
      case X86::CMOVA16rr: case X86::CMOVA16rm:
      case X86::CMOVA32rr: case X86::CMOVA32rm:
      case X86::CMOVA64rr: case X86::CMOVA64rm:
      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
      case X86::CMOVB16rr: case X86::CMOVB16rm:
      case X86::CMOVB32rr: case X86::CMOVB32rm:
      case X86::CMOVB64rr: case X86::CMOVB64rm:
      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
      case X86::CMOVE16rr: case X86::CMOVE16rm:
      case X86::CMOVE32rr: case X86::CMOVE32rm:
      case X86::CMOVE64rr: case X86::CMOVE64rm:
      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
      case X86::CMOVP16rr: case X86::CMOVP16rm:
      case X86::CMOVP32rr: case X86::CMOVP32rm:
      case X86::CMOVP64rr: case X86::CMOVP64rm:
        continue;
      // Anything else: assume conservatively.
      default: return false;
      }
    }
  }
  return true;
}
2157 /// Test whether the given node which sets flags has any uses which require the
2158 /// CF flag to be accurate.
2159 static bool hasNoCarryFlagUses(SDNode
*N
) {
2160 // Examine each user of the node.
2161 for (SDNode::use_iterator UI
= N
->use_begin(), UE
= N
->use_end(); UI
!= UE
;
2163 // Only check things that use the flags.
2164 if (UI
.getUse().getResNo() != 1)
2166 // Only examine CopyToReg uses.
2167 if (UI
->getOpcode() != ISD::CopyToReg
)
2169 // Only examine CopyToReg uses that copy to EFLAGS.
2170 if (cast
<RegisterSDNode
>(UI
->getOperand(1))->getReg() != X86::EFLAGS
)
2172 // Examine each user of the CopyToReg use.
2173 for (SDNode::use_iterator FlagUI
= UI
->use_begin(), FlagUE
= UI
->use_end();
2174 FlagUI
!= FlagUE
; ++FlagUI
) {
2175 // Only examine the Flag result.
2176 if (FlagUI
.getUse().getResNo() != 1)
2178 // Anything unusual: assume conservatively.
2179 if (!FlagUI
->isMachineOpcode())
2181 // Examine the opcode of the user.
2182 switch (FlagUI
->getMachineOpcode()) {
2183 // Comparisons which don't examine the CF flag.
2184 case X86::SETOr
: case X86::SETNOr
: case X86::SETEr
: case X86::SETNEr
:
2185 case X86::SETSr
: case X86::SETNSr
: case X86::SETPr
: case X86::SETNPr
:
2186 case X86::SETLr
: case X86::SETGEr
: case X86::SETLEr
: case X86::SETGr
:
2187 case X86::JO_1
: case X86::JNO_1
: case X86::JE_1
: case X86::JNE_1
:
2188 case X86::JS_1
: case X86::JNS_1
: case X86::JP_1
: case X86::JNP_1
:
2189 case X86::JL_1
: case X86::JGE_1
: case X86::JLE_1
: case X86::JG_1
:
2190 case X86::CMOVO16rr
: case X86::CMOVO32rr
: case X86::CMOVO64rr
:
2191 case X86::CMOVO16rm
: case X86::CMOVO32rm
: case X86::CMOVO64rm
:
2192 case X86::CMOVNO16rr
: case X86::CMOVNO32rr
: case X86::CMOVNO64rr
:
2193 case X86::CMOVNO16rm
: case X86::CMOVNO32rm
: case X86::CMOVNO64rm
:
2194 case X86::CMOVE16rr
: case X86::CMOVE32rr
: case X86::CMOVE64rr
:
2195 case X86::CMOVE16rm
: case X86::CMOVE32rm
: case X86::CMOVE64rm
:
2196 case X86::CMOVNE16rr
: case X86::CMOVNE32rr
: case X86::CMOVNE64rr
:
2197 case X86::CMOVNE16rm
: case X86::CMOVNE32rm
: case X86::CMOVNE64rm
:
2198 case X86::CMOVS16rr
: case X86::CMOVS32rr
: case X86::CMOVS64rr
:
2199 case X86::CMOVS16rm
: case X86::CMOVS32rm
: case X86::CMOVS64rm
:
2200 case X86::CMOVNS16rr
: case X86::CMOVNS32rr
: case X86::CMOVNS64rr
:
2201 case X86::CMOVNS16rm
: case X86::CMOVNS32rm
: case X86::CMOVNS64rm
:
2202 case X86::CMOVP16rr
: case X86::CMOVP32rr
: case X86::CMOVP64rr
:
2203 case X86::CMOVP16rm
: case X86::CMOVP32rm
: case X86::CMOVP64rm
:
2204 case X86::CMOVNP16rr
: case X86::CMOVNP32rr
: case X86::CMOVNP64rr
:
2205 case X86::CMOVNP16rm
: case X86::CMOVNP32rm
: case X86::CMOVNP64rm
:
2206 case X86::CMOVL16rr
: case X86::CMOVL32rr
: case X86::CMOVL64rr
:
2207 case X86::CMOVL16rm
: case X86::CMOVL32rm
: case X86::CMOVL64rm
:
2208 case X86::CMOVGE16rr
: case X86::CMOVGE32rr
: case X86::CMOVGE64rr
:
2209 case X86::CMOVGE16rm
: case X86::CMOVGE32rm
: case X86::CMOVGE64rm
:
2210 case X86::CMOVLE16rr
: case X86::CMOVLE32rr
: case X86::CMOVLE64rr
:
2211 case X86::CMOVLE16rm
: case X86::CMOVLE32rm
: case X86::CMOVLE64rm
:
2212 case X86::CMOVG16rr
: case X86::CMOVG32rr
: case X86::CMOVG64rr
:
2213 case X86::CMOVG16rm
: case X86::CMOVG32rm
: case X86::CMOVG64rm
:
2215 // Anything else: assume conservatively.
2224 /// Check whether or not the chain ending in StoreNode is suitable for doing
2225 /// the {load; op; store} to modify transformation.
2226 static bool isFusableLoadOpStorePattern(StoreSDNode
*StoreNode
,
2227 SDValue StoredVal
, SelectionDAG
*CurDAG
,
2229 LoadSDNode
*&LoadNode
,
2230 SDValue
&InputChain
) {
2231 // Is the stored value result 0 of the operation?
2232 if (StoredVal
.getResNo() != 0) return false;
2234 // Are there other uses of the operation other than the store?
2235 if (!StoredVal
.getNode()->hasNUsesOfValue(1, 0)) return false;
2237 // Is the store non-extending and non-indexed?
2238 if (!ISD::isNormalStore(StoreNode
) || StoreNode
->isNonTemporal())
2241 SDValue Load
= StoredVal
->getOperand(LoadOpNo
);
2242 // Is the stored value a non-extending and non-indexed load?
2243 if (!ISD::isNormalLoad(Load
.getNode())) return false;
2245 // Return LoadNode by reference.
2246 LoadNode
= cast
<LoadSDNode
>(Load
);
2248 // Is store the only read of the loaded value?
2249 if (!Load
.hasOneUse())
2252 // Is the address of the store the same as the load?
2253 if (LoadNode
->getBasePtr() != StoreNode
->getBasePtr() ||
2254 LoadNode
->getOffset() != StoreNode
->getOffset())
2257 bool FoundLoad
= false;
2258 SmallVector
<SDValue
, 4> ChainOps
;
2259 SmallVector
<const SDNode
*, 4> LoopWorklist
;
2260 SmallPtrSet
<const SDNode
*, 16> Visited
;
2261 const unsigned int Max
= 1024;
2263 // Visualization of Load-Op-Store fusion:
2264 // -------------------------
2266 // *-lines = Chain operand dependencies.
2267 // |-lines = Normal operand dependencies.
2268 // Dependencies flow down and right. n-suffix references multiple nodes.
2276 // * * \ | => A--LD_OP_ST
2284 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2288 // Ensure the transform is safe by checking for the dual
2289 // dependencies to make sure we do not induce a loop.
2291 // As LD is a predecessor to both OP and ST we can do this by checking:
2292 // a). if LD is a predecessor to a member of Xn or Yn.
2293 // b). if a Zn is a predecessor to ST.
2295 // However, (b) can only occur through being a chain predecessor to
2296 // ST, which is the same as Zn being a member or predecessor of Xn,
2297 // which is a subset of LD being a predecessor of Xn. So it's
2298 // subsumed by check (a).
2300 SDValue Chain
= StoreNode
->getChain();
2302 // Gather X elements in ChainOps.
2303 if (Chain
== Load
.getValue(1)) {
2305 ChainOps
.push_back(Load
.getOperand(0));
2306 } else if (Chain
.getOpcode() == ISD::TokenFactor
) {
2307 for (unsigned i
= 0, e
= Chain
.getNumOperands(); i
!= e
; ++i
) {
2308 SDValue Op
= Chain
.getOperand(i
);
2309 if (Op
== Load
.getValue(1)) {
2311 // Drop Load, but keep its chain. No cycle check necessary.
2312 ChainOps
.push_back(Load
.getOperand(0));
2315 LoopWorklist
.push_back(Op
.getNode());
2316 ChainOps
.push_back(Op
);
2323 // Worklist is currently Xn. Add Yn to worklist.
2324 for (SDValue Op
: StoredVal
->ops())
2325 if (Op
.getNode() != LoadNode
)
2326 LoopWorklist
.push_back(Op
.getNode());
2328 // Check (a) if Load is a predecessor to Xn + Yn
2329 if (SDNode::hasPredecessorHelper(Load
.getNode(), Visited
, LoopWorklist
, Max
,
2334 CurDAG
->getNode(ISD::TokenFactor
, SDLoc(Chain
), MVT::Other
, ChainOps
);
2338 // Change a chain of {load; op; store} of the same value into a simple op
2339 // through memory of that value, if the uses of the modified value and its
2340 // address are suitable.
2342 // The tablegen pattern memory operand pattern is currently not able to match
2343 // the case where the EFLAGS on the original operation are used.
2345 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2346 // be transferred from a node in the pattern to the result node, probably with
2347 // a new keyword. For example, we have this
2348 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2349 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2350 // (implicit EFLAGS)]>;
2351 // but maybe need something like this
2352 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2353 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2354 // (transferrable EFLAGS)]>;
2356 // Until then, we manually fold these and instruction select the operation
2358 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode
*Node
) {
2359 StoreSDNode
*StoreNode
= cast
<StoreSDNode
>(Node
);
2360 SDValue StoredVal
= StoreNode
->getOperand(1);
2361 unsigned Opc
= StoredVal
->getOpcode();
2363 // Before we try to select anything, make sure this is memory operand size
2364 // and opcode we can handle. Note that this must match the code below that
2365 // actually lowers the opcodes.
2366 EVT MemVT
= StoreNode
->getMemoryVT();
2367 if (MemVT
!= MVT::i64
&& MemVT
!= MVT::i32
&& MemVT
!= MVT::i16
&&
2371 bool IsCommutable
= false;
2385 IsCommutable
= true;
2389 unsigned LoadOpNo
= 0;
2390 LoadSDNode
*LoadNode
= nullptr;
2392 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
2393 LoadNode
, InputChain
)) {
2397 // This operation is commutable, try the other operand.
2399 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
2400 LoadNode
, InputChain
))
2404 SDValue Base
, Scale
, Index
, Disp
, Segment
;
2405 if (!selectAddr(LoadNode
, LoadNode
->getBasePtr(), Base
, Scale
, Index
, Disp
,
2409 auto SelectOpcode
= [&](unsigned Opc64
, unsigned Opc32
, unsigned Opc16
,
2411 switch (MemVT
.getSimpleVT().SimpleTy
) {
2421 llvm_unreachable("Invalid size!");
2425 MachineSDNode
*Result
;
2431 ? SelectOpcode(X86::INC64m
, X86::INC32m
, X86::INC16m
, X86::INC8m
)
2432 : SelectOpcode(X86::DEC64m
, X86::DEC32m
, X86::DEC16m
, X86::DEC8m
);
2433 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
, Segment
, InputChain
};
2435 CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
, Ops
);
2445 auto SelectRegOpcode
= [SelectOpcode
](unsigned Opc
) {
2448 return SelectOpcode(X86::ADD64mr
, X86::ADD32mr
, X86::ADD16mr
,
2451 return SelectOpcode(X86::ADC64mr
, X86::ADC32mr
, X86::ADC16mr
,
2454 return SelectOpcode(X86::SUB64mr
, X86::SUB32mr
, X86::SUB16mr
,
2457 return SelectOpcode(X86::SBB64mr
, X86::SBB32mr
, X86::SBB16mr
,
2460 return SelectOpcode(X86::AND64mr
, X86::AND32mr
, X86::AND16mr
,
2463 return SelectOpcode(X86::OR64mr
, X86::OR32mr
, X86::OR16mr
, X86::OR8mr
);
2465 return SelectOpcode(X86::XOR64mr
, X86::XOR32mr
, X86::XOR16mr
,
2468 llvm_unreachable("Invalid opcode!");
2471 auto SelectImm8Opcode
= [SelectOpcode
](unsigned Opc
) {
2474 return SelectOpcode(X86::ADD64mi8
, X86::ADD32mi8
, X86::ADD16mi8
, 0);
2476 return SelectOpcode(X86::ADC64mi8
, X86::ADC32mi8
, X86::ADC16mi8
, 0);
2478 return SelectOpcode(X86::SUB64mi8
, X86::SUB32mi8
, X86::SUB16mi8
, 0);
2480 return SelectOpcode(X86::SBB64mi8
, X86::SBB32mi8
, X86::SBB16mi8
, 0);
2482 return SelectOpcode(X86::AND64mi8
, X86::AND32mi8
, X86::AND16mi8
, 0);
2484 return SelectOpcode(X86::OR64mi8
, X86::OR32mi8
, X86::OR16mi8
, 0);
2486 return SelectOpcode(X86::XOR64mi8
, X86::XOR32mi8
, X86::XOR16mi8
, 0);
2488 llvm_unreachable("Invalid opcode!");
2491 auto SelectImmOpcode
= [SelectOpcode
](unsigned Opc
) {
2494 return SelectOpcode(X86::ADD64mi32
, X86::ADD32mi
, X86::ADD16mi
,
2497 return SelectOpcode(X86::ADC64mi32
, X86::ADC32mi
, X86::ADC16mi
,
2500 return SelectOpcode(X86::SUB64mi32
, X86::SUB32mi
, X86::SUB16mi
,
2503 return SelectOpcode(X86::SBB64mi32
, X86::SBB32mi
, X86::SBB16mi
,
2506 return SelectOpcode(X86::AND64mi32
, X86::AND32mi
, X86::AND16mi
,
2509 return SelectOpcode(X86::OR64mi32
, X86::OR32mi
, X86::OR16mi
,
2512 return SelectOpcode(X86::XOR64mi32
, X86::XOR32mi
, X86::XOR16mi
,
2515 llvm_unreachable("Invalid opcode!");
2519 unsigned NewOpc
= SelectRegOpcode(Opc
);
2520 SDValue Operand
= StoredVal
->getOperand(1-LoadOpNo
);
2522 // See if the operand is a constant that we can fold into an immediate
2524 if (auto *OperandC
= dyn_cast
<ConstantSDNode
>(Operand
)) {
2525 auto OperandV
= OperandC
->getAPIntValue();
2527 // Check if we can shrink the operand enough to fit in an immediate (or
2528 // fit into a smaller immediate) by negating it and switching the
2530 if ((Opc
== X86ISD::ADD
|| Opc
== X86ISD::SUB
) &&
2531 ((MemVT
!= MVT::i8
&& OperandV
.getMinSignedBits() > 8 &&
2532 (-OperandV
).getMinSignedBits() <= 8) ||
2533 (MemVT
== MVT::i64
&& OperandV
.getMinSignedBits() > 32 &&
2534 (-OperandV
).getMinSignedBits() <= 32)) &&
2535 hasNoCarryFlagUses(StoredVal
.getNode())) {
2536 OperandV
= -OperandV
;
2537 Opc
= Opc
== X86ISD::ADD
? X86ISD::SUB
: X86ISD::ADD
;
2540 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
2541 // the larger immediate operand.
2542 if (MemVT
!= MVT::i8
&& OperandV
.getMinSignedBits() <= 8) {
2543 Operand
= CurDAG
->getTargetConstant(OperandV
, SDLoc(Node
), MemVT
);
2544 NewOpc
= SelectImm8Opcode(Opc
);
2545 } else if (OperandV
.getActiveBits() <= MemVT
.getSizeInBits() &&
2546 (MemVT
!= MVT::i64
|| OperandV
.getMinSignedBits() <= 32)) {
2547 Operand
= CurDAG
->getTargetConstant(OperandV
, SDLoc(Node
), MemVT
);
2548 NewOpc
= SelectImmOpcode(Opc
);
2552 if (Opc
== X86ISD::ADC
|| Opc
== X86ISD::SBB
) {
2554 CurDAG
->getCopyToReg(InputChain
, SDLoc(Node
), X86::EFLAGS
,
2555 StoredVal
.getOperand(2), SDValue());
2557 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
2558 Segment
, Operand
, CopyTo
, CopyTo
.getValue(1)};
2559 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
2562 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
2563 Segment
, Operand
, InputChain
};
2564 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
2570 llvm_unreachable("Invalid opcode!");
2573 MachineMemOperand
*MemOps
[] = {StoreNode
->getMemOperand(),
2574 LoadNode
->getMemOperand()};
2575 CurDAG
->setNodeMemRefs(Result
, MemOps
);
2577 // Update Load Chain uses as well.
2578 ReplaceUses(SDValue(LoadNode
, 1), SDValue(Result
, 1));
2579 ReplaceUses(SDValue(StoreNode
, 0), SDValue(Result
, 1));
2580 ReplaceUses(SDValue(StoredVal
.getNode(), 1), SDValue(Result
, 0));
2581 CurDAG
->RemoveDeadNode(Node
);
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
    return false;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return false;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return false;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  // Shift amount and RHS of and must be constant.
  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return false;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return false;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = countPopulation(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return false;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return false;

  // Create a BEXTR node and run it through selection.
  SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
  SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
                                N0->getOperand(0), C);
  ReplaceNode(Node, New.getNode());
  SelectCode(New.getNode());
  return true;
}
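// Worked example (illustrative): for (and (srl %x, 4), 0xFFF) the shift is 4
// and the mask has 12 bits, so the control value is 4 | (12 << 8) = 0xC04 and
// the node is re-selected as a BEXTR of %x with control 0xC04, i.e. one
// instruction on BMI/TBM targets instead of a shift plus an and.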
2641 // Emit a PCMISTR(I/M) instruction.
2642 MachineSDNode
*X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc
, unsigned MOpc
,
2643 bool MayFoldLoad
, const SDLoc
&dl
,
2644 MVT VT
, SDNode
*Node
) {
2645 SDValue N0
= Node
->getOperand(0);
2646 SDValue N1
= Node
->getOperand(1);
2647 SDValue Imm
= Node
->getOperand(2);
2648 const ConstantInt
*Val
= cast
<ConstantSDNode
>(Imm
)->getConstantIntValue();
2649 Imm
= CurDAG
->getTargetConstant(*Val
, SDLoc(Node
), Imm
.getValueType());
2651 // If there is a load, it will be behind a bitcast. We don't need to check
2652 // alignment on this load.
2653 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
2654 if (MayFoldLoad
&& N1
->getOpcode() == ISD::BITCAST
&& N1
->hasOneUse() &&
2655 tryFoldVecLoad(Node
, N1
.getNode(), N1
.getOperand(0), Tmp0
, Tmp1
, Tmp2
,
2657 SDValue Load
= N1
.getOperand(0);
2658 SDValue Ops
[] = { N0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, Imm
,
2659 Load
.getOperand(0) };
2660 SDVTList VTs
= CurDAG
->getVTList(VT
, MVT::i32
, MVT::Other
);
2661 MachineSDNode
*CNode
= CurDAG
->getMachineNode(MOpc
, dl
, VTs
, Ops
);
2662 // Update the chain.
2663 ReplaceUses(Load
.getValue(1), SDValue(CNode
, 2));
2664 // Record the mem-refs
2665 CurDAG
->setNodeMemRefs(CNode
, {cast
<LoadSDNode
>(Load
)->getMemOperand()});
2669 SDValue Ops
[] = { N0
, N1
, Imm
};
2670 SDVTList VTs
= CurDAG
->getVTList(VT
, MVT::i32
);
2671 MachineSDNode
*CNode
= CurDAG
->getMachineNode(ROpc
, dl
, VTs
, Ops
);
2675 // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
2676 // to emit a second instruction after this one. This is needed since we have two
2677 // copyToReg nodes glued before this and we need to continue that glue through.
2678 MachineSDNode
*X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc
, unsigned MOpc
,
2679 bool MayFoldLoad
, const SDLoc
&dl
,
2680 MVT VT
, SDNode
*Node
,
2682 SDValue N0
= Node
->getOperand(0);
2683 SDValue N2
= Node
->getOperand(2);
2684 SDValue Imm
= Node
->getOperand(4);
2685 const ConstantInt
*Val
= cast
<ConstantSDNode
>(Imm
)->getConstantIntValue();
2686 Imm
= CurDAG
->getTargetConstant(*Val
, SDLoc(Node
), Imm
.getValueType());
2688 // If there is a load, it will be behind a bitcast. We don't need to check
2689 // alignment on this load.
2690 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
2691 if (MayFoldLoad
&& N2
->getOpcode() == ISD::BITCAST
&& N2
->hasOneUse() &&
2692 tryFoldVecLoad(Node
, N2
.getNode(), N2
.getOperand(0), Tmp0
, Tmp1
, Tmp2
,
2694 SDValue Load
= N2
.getOperand(0);
2695 SDValue Ops
[] = { N0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, Imm
,
2696 Load
.getOperand(0), InFlag
};
2697 SDVTList VTs
= CurDAG
->getVTList(VT
, MVT::i32
, MVT::Other
, MVT::Glue
);
2698 MachineSDNode
*CNode
= CurDAG
->getMachineNode(MOpc
, dl
, VTs
, Ops
);
2699 InFlag
= SDValue(CNode
, 3);
2700 // Update the chain.
2701 ReplaceUses(Load
.getValue(1), SDValue(CNode
, 2));
2702 // Record the mem-refs
2703 CurDAG
->setNodeMemRefs(CNode
, {cast
<LoadSDNode
>(Load
)->getMemOperand()});
2707 SDValue Ops
[] = { N0
, N2
, Imm
, InFlag
};
2708 SDVTList VTs
= CurDAG
->getVTList(VT
, MVT::i32
, MVT::Glue
);
2709 MachineSDNode
*CNode
= CurDAG
->getMachineNode(ROpc
, dl
, VTs
, Ops
);
2710 InFlag
= SDValue(CNode
, 2);
2714 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode
*N
) {
2715 EVT VT
= N
->getValueType(0);
2717 // Only handle scalar shifts.
2721 // Narrower shifts only mask to 5 bits in hardware.
2722 unsigned Size
= VT
== MVT::i64
? 64 : 32;
2724 SDValue OrigShiftAmt
= N
->getOperand(1);
2725 SDValue ShiftAmt
= OrigShiftAmt
;
2728 // Skip over a truncate of the shift amount.
2729 if (ShiftAmt
->getOpcode() == ISD::TRUNCATE
)
2730 ShiftAmt
= ShiftAmt
->getOperand(0);
2732 // Special case to avoid messing up a BZHI pattern.
2733 // Look for (srl (shl X, (size - y)), (size - y)
2734 if (Subtarget
->hasBMI2() && (VT
== MVT::i32
|| VT
== MVT::i64
) &&
2735 N
->getOpcode() == ISD::SRL
&& N
->getOperand(0).getOpcode() == ISD::SHL
&&
2736 // Shift amounts the same?
2737 N
->getOperand(1) == N
->getOperand(0).getOperand(1) &&
2738 // Shift amounts size - y?
2739 ShiftAmt
.getOpcode() == ISD::SUB
&&
2740 isa
<ConstantSDNode
>(ShiftAmt
.getOperand(0)) &&
2741 cast
<ConstantSDNode
>(ShiftAmt
.getOperand(0))->getZExtValue() == Size
)
2744 SDValue NewShiftAmt
;
2745 if (ShiftAmt
->getOpcode() == ISD::ADD
|| ShiftAmt
->getOpcode() == ISD::SUB
) {
2746 SDValue Add0
= ShiftAmt
->getOperand(0);
2747 SDValue Add1
= ShiftAmt
->getOperand(1);
2748 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
2749 // to avoid the ADD/SUB.
2750 if (isa
<ConstantSDNode
>(Add1
) &&
2751 cast
<ConstantSDNode
>(Add1
)->getZExtValue() % Size
== 0) {
2753 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
2754 // generate a NEG instead of a SUB of a constant.
2755 } else if (ShiftAmt
->getOpcode() == ISD::SUB
&&
2756 isa
<ConstantSDNode
>(Add0
) &&
2757 cast
<ConstantSDNode
>(Add0
)->getZExtValue() != 0 &&
2758 cast
<ConstantSDNode
>(Add0
)->getZExtValue() % Size
== 0) {
2759 // Insert a negate op.
2760 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
2761 // that uses it that's not a shift.
2762 EVT SubVT
= ShiftAmt
.getValueType();
2763 SDValue Zero
= CurDAG
->getConstant(0, DL
, SubVT
);
2764 SDValue Neg
= CurDAG
->getNode(ISD::SUB
, DL
, SubVT
, Zero
, Add1
);
2767 // Insert these operands into a valid topological order so they can
2768 // get selected independently.
2769 insertDAGNode(*CurDAG
, OrigShiftAmt
, Zero
);
2770 insertDAGNode(*CurDAG
, OrigShiftAmt
, Neg
);
2776 if (NewShiftAmt
.getValueType() != MVT::i8
) {
2777 // Need to truncate the shift amount.
2778 NewShiftAmt
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, MVT::i8
, NewShiftAmt
);
2779 // Add to a correct topological ordering.
2780 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
2783 // Insert a new mask to keep the shift amount legal. This should be removed
2784 // by isel patterns.
2785 NewShiftAmt
= CurDAG
->getNode(ISD::AND
, DL
, MVT::i8
, NewShiftAmt
,
2786 CurDAG
->getConstant(Size
- 1, DL
, MVT::i8
));
2787 // Place in a correct topological ordering.
2788 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
2790 SDNode
*UpdatedNode
= CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0),
2792 if (UpdatedNode
!= N
) {
2793 // If we found an existing node, we should replace ourselves with that node
2794 // and wait for it to be selected after its other users.
2795 ReplaceNode(N
, UpdatedNode
);
2799 // If the original shift amount is now dead, delete it so that we don't run
2801 if (OrigShiftAmt
.getNode()->use_empty())
2802 CurDAG
->RemoveDeadNode(OrigShiftAmt
.getNode());
2804 // Now that we've optimized the shift amount, defer to normal isel to get
2805 // load folding and legacy vs BMI2 selection without repeating it here.
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countLeadingZeros();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getMinSignedBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnesValue()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
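// Worked example (illustrative): for (and %x, 0x0FFFFFF0) where the top four
// bits of %x are known zero, setting the high bits gives NegMaskVal =
// 0xFFFFFFF0 (-16), which encodes as a sign-extended imm8, so the original
// 4-byte immediate is avoided without changing the result.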
2880 void X86DAGToDAGISel::Select(SDNode
*Node
) {
2881 MVT NVT
= Node
->getSimpleValueType(0);
2882 unsigned Opcode
= Node
->getOpcode();
2885 if (Node
->isMachineOpcode()) {
2886 LLVM_DEBUG(dbgs() << "== "; Node
->dump(CurDAG
); dbgs() << '\n');
2887 Node
->setNodeId(-1);
2888 return; // Already selected.
2894 if (Subtarget
->isTargetNaCl())
2895 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
2896 // leave the instruction alone.
2898 if (Subtarget
->isTarget64BitILP32()) {
2899 // Converts a 32-bit register to a 64-bit, zero-extended version of
2900 // it. This is needed because x86-64 can do many things, but jmp %r32
2901 // ain't one of them.
2902 const SDValue
&Target
= Node
->getOperand(1);
2903 assert(Target
.getSimpleValueType() == llvm::MVT::i32
);
2904 SDValue ZextTarget
= CurDAG
->getZExtOrTrunc(Target
, dl
, EVT(MVT::i64
));
2905 SDValue Brind
= CurDAG
->getNode(ISD::BRIND
, dl
, MVT::Other
,
2906 Node
->getOperand(0), ZextTarget
);
2907 ReplaceNode(Node
, Brind
.getNode());
2908 SelectCode(ZextTarget
.getNode());
2909 SelectCode(Brind
.getNode());
2914 case X86ISD::GlobalBaseReg
:
2915 ReplaceNode(Node
, getGlobalBaseReg());
2919 // Just drop all 128/256/512-bit bitcasts.
2920 if (NVT
.is512BitVector() || NVT
.is256BitVector() || NVT
.is128BitVector() ||
2922 ReplaceUses(SDValue(Node
, 0), Node
->getOperand(0));
2923 CurDAG
->RemoveDeadNode(Node
);
2928 case X86ISD::SELECT
:
2929 case X86ISD::SHRUNKBLEND
: {
2930 // SHRUNKBLEND selects like a regular VSELECT. Same with X86ISD::SELECT.
2931 SDValue VSelect
= CurDAG
->getNode(
2932 ISD::VSELECT
, SDLoc(Node
), Node
->getValueType(0), Node
->getOperand(0),
2933 Node
->getOperand(1), Node
->getOperand(2));
2934 ReplaceNode(Node
, VSelect
.getNode());
2935 SelectCode(VSelect
.getNode());
2936 // We already called ReplaceUses.
2943 if (tryShiftAmountMod(Node
))
2948 if (matchBEXTRFromAnd(Node
))
2950 if (AndImmShrink
&& shrinkAndImmediate(Node
))
2957 // For operations of the form (x << C1) op C2, check if we can use a smaller
2958 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
2959 SDValue N0
= Node
->getOperand(0);
2960 SDValue N1
= Node
->getOperand(1);
2962 if (N0
->getOpcode() != ISD::SHL
|| !N0
->hasOneUse())
2965 // i8 is unshrinkable, i16 should be promoted to i32.
2966 if (NVT
!= MVT::i32
&& NVT
!= MVT::i64
)
2969 ConstantSDNode
*Cst
= dyn_cast
<ConstantSDNode
>(N1
);
2970 ConstantSDNode
*ShlCst
= dyn_cast
<ConstantSDNode
>(N0
->getOperand(1));
2971 if (!Cst
|| !ShlCst
)
2974 int64_t Val
= Cst
->getSExtValue();
2975 uint64_t ShlVal
= ShlCst
->getZExtValue();
2977 // Make sure that we don't change the operation by removing bits.
2978 // This only matters for OR and XOR, AND is unaffected.
2979 uint64_t RemovedBitsMask
= (1ULL << ShlVal
) - 1;
2980 if (Opcode
!= ISD::AND
&& (Val
& RemovedBitsMask
) != 0)
2983 unsigned ShlOp
, AddOp
, Op
;
2986 // Check the minimum bitwidth for the new constant.
2987 // TODO: AND32ri is the same as AND64ri32 with zext imm.
2988 // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
2989 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
2990 if (!isInt
<8>(Val
) && isInt
<8>(Val
>> ShlVal
))
2992 else if (!isInt
<32>(Val
) && isInt
<32>(Val
>> ShlVal
))
2995 // Bail if there is no smaller encoding.
2999 switch (NVT
.SimpleTy
) {
3000 default: llvm_unreachable("Unsupported VT!");
3002 assert(CstVT
== MVT::i8
);
3003 ShlOp
= X86::SHL32ri
;
3004 AddOp
= X86::ADD32rr
;
3007 default: llvm_unreachable("Impossible opcode");
3008 case ISD::AND
: Op
= X86::AND32ri8
; break;
3009 case ISD::OR
: Op
= X86::OR32ri8
; break;
3010 case ISD::XOR
: Op
= X86::XOR32ri8
; break;
3014 assert(CstVT
== MVT::i8
|| CstVT
== MVT::i32
);
3015 ShlOp
= X86::SHL64ri
;
3016 AddOp
= X86::ADD64rr
;
3019 default: llvm_unreachable("Impossible opcode");
3020 case ISD::AND
: Op
= CstVT
==MVT::i8
? X86::AND64ri8
: X86::AND64ri32
; break;
3021 case ISD::OR
: Op
= CstVT
==MVT::i8
? X86::OR64ri8
: X86::OR64ri32
; break;
3022 case ISD::XOR
: Op
= CstVT
==MVT::i8
? X86::XOR64ri8
: X86::XOR64ri32
; break;
3027 // Emit the smaller op and the shift.
3028 SDValue NewCst
= CurDAG
->getTargetConstant(Val
>> ShlVal
, dl
, CstVT
);
3029 SDNode
*New
= CurDAG
->getMachineNode(Op
, dl
, NVT
, N0
->getOperand(0),NewCst
);
3031 CurDAG
->SelectNodeTo(Node
, AddOp
, NVT
, SDValue(New
, 0),
3034 CurDAG
->SelectNodeTo(Node
, ShlOp
, NVT
, SDValue(New
, 0),
3035 getI8Imm(ShlVal
, dl
));
3039 case X86ISD::SMUL8
: {
3040 SDValue N0
= Node
->getOperand(0);
3041 SDValue N1
= Node
->getOperand(1);
3043 unsigned Opc
= (Opcode
== X86ISD::SMUL8
? X86::IMUL8r
: X86::MUL8r
);
3045 SDValue InFlag
= CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
, X86::AL
,
3046 N0
, SDValue()).getValue(1);
3048 SDVTList VTs
= CurDAG
->getVTList(NVT
, MVT::i32
);
3049 SDValue Ops
[] = {N1
, InFlag
};
3050 SDNode
*CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
3052 ReplaceNode(Node
, CNode
);
3056 case X86ISD::UMUL
: {
3057 SDValue N0
= Node
->getOperand(0);
3058 SDValue N1
= Node
->getOperand(1);
3060 unsigned LoReg
, Opc
;
3061 switch (NVT
.SimpleTy
) {
3062 default: llvm_unreachable("Unsupported VT!");
3063 // MVT::i8 is handled by X86ISD::UMUL8.
3064 case MVT::i16
: LoReg
= X86::AX
; Opc
= X86::MUL16r
; break;
3065 case MVT::i32
: LoReg
= X86::EAX
; Opc
= X86::MUL32r
; break;
3066 case MVT::i64
: LoReg
= X86::RAX
; Opc
= X86::MUL64r
; break;
3069 SDValue InFlag
= CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
, LoReg
,
3070 N0
, SDValue()).getValue(1);
3072 SDVTList VTs
= CurDAG
->getVTList(NVT
, NVT
, MVT::i32
);
3073 SDValue Ops
[] = {N1
, InFlag
};
3074 SDNode
*CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
3076 ReplaceNode(Node
, CNode
);
3080 case ISD::SMUL_LOHI
:
3081 case ISD::UMUL_LOHI
: {
3082 SDValue N0
= Node
->getOperand(0);
3083 SDValue N1
= Node
->getOperand(1);
3086 bool isSigned
= Opcode
== ISD::SMUL_LOHI
;
3087 bool hasBMI2
= Subtarget
->hasBMI2();
3089 switch (NVT
.SimpleTy
) {
3090 default: llvm_unreachable("Unsupported VT!");
3091 case MVT::i32
: Opc
= hasBMI2
? X86::MULX32rr
: X86::MUL32r
;
3092 MOpc
= hasBMI2
? X86::MULX32rm
: X86::MUL32m
; break;
3093 case MVT::i64
: Opc
= hasBMI2
? X86::MULX64rr
: X86::MUL64r
;
3094 MOpc
= hasBMI2
? X86::MULX64rm
: X86::MUL64m
; break;
3097 switch (NVT
.SimpleTy
) {
3098 default: llvm_unreachable("Unsupported VT!");
3099 case MVT::i32
: Opc
= X86::IMUL32r
; MOpc
= X86::IMUL32m
; break;
3100 case MVT::i64
: Opc
= X86::IMUL64r
; MOpc
= X86::IMUL64m
; break;
3104 unsigned SrcReg
, LoReg
, HiReg
;
3106 default: llvm_unreachable("Unknown MUL opcode!");
3109 SrcReg
= LoReg
= X86::EAX
; HiReg
= X86::EDX
;
3113 SrcReg
= LoReg
= X86::RAX
; HiReg
= X86::RDX
;
3116 SrcReg
= X86::EDX
; LoReg
= HiReg
= 0;
3119 SrcReg
= X86::RDX
; LoReg
= HiReg
= 0;
3123 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
3124 bool foldedLoad
= tryFoldLoad(Node
, N1
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
);
3125 // Multiply is commmutative.
3127 foldedLoad
= tryFoldLoad(Node
, N0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
);
3132 SDValue InFlag
= CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
, SrcReg
,
3133 N0
, SDValue()).getValue(1);
3134 SDValue ResHi
, ResLo
;
3138 MachineSDNode
*CNode
= nullptr;
3139 SDValue Ops
[] = { Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, N1
.getOperand(0),
3141 if (MOpc
== X86::MULX32rm
|| MOpc
== X86::MULX64rm
) {
3142 SDVTList VTs
= CurDAG
->getVTList(NVT
, NVT
, MVT::Other
, MVT::Glue
);
3143 CNode
= CurDAG
->getMachineNode(MOpc
, dl
, VTs
, Ops
);
3144 ResHi
= SDValue(CNode
, 0);
3145 ResLo
= SDValue(CNode
, 1);
3146 Chain
= SDValue(CNode
, 2);
3147 InFlag
= SDValue(CNode
, 3);
3149 SDVTList VTs
= CurDAG
->getVTList(MVT::Other
, MVT::Glue
);
3150 CNode
= CurDAG
->getMachineNode(MOpc
, dl
, VTs
, Ops
);
3151 Chain
= SDValue(CNode
, 0);
3152 InFlag
= SDValue(CNode
, 1);
3155 // Update the chain.
3156 ReplaceUses(N1
.getValue(1), Chain
);
3157 // Record the mem-refs
3158 CurDAG
->setNodeMemRefs(CNode
, {cast
<LoadSDNode
>(N1
)->getMemOperand()});
3160 SDValue Ops
[] = { N1
, InFlag
};
3161 if (Opc
== X86::MULX32rr
|| Opc
== X86::MULX64rr
) {
3162 SDVTList VTs
= CurDAG
->getVTList(NVT
, NVT
, MVT::Glue
);
3163 SDNode
*CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
3164 ResHi
= SDValue(CNode
, 0);
3165 ResLo
= SDValue(CNode
, 1);
3166 InFlag
= SDValue(CNode
, 2);
3168 SDVTList VTs
= CurDAG
->getVTList(MVT::Glue
);
3169 SDNode
*CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
3170 InFlag
= SDValue(CNode
, 0);
3174 // Copy the low half of the result, if it is needed.
3175 if (!SDValue(Node
, 0).use_empty()) {
3176 if (!ResLo
.getNode()) {
3177 assert(LoReg
&& "Register for low half is not defined!");
3178 ResLo
= CurDAG
->getCopyFromReg(CurDAG
->getEntryNode(), dl
, LoReg
, NVT
,
3180 InFlag
= ResLo
.getValue(2);
3182 ReplaceUses(SDValue(Node
, 0), ResLo
);
3183 LLVM_DEBUG(dbgs() << "=> "; ResLo
.getNode()->dump(CurDAG
);
3186 // Copy the high half of the result, if it is needed.
3187 if (!SDValue(Node
, 1).use_empty()) {
3188 if (!ResHi
.getNode()) {
3189 assert(HiReg
&& "Register for high half is not defined!");
3190 ResHi
= CurDAG
->getCopyFromReg(CurDAG
->getEntryNode(), dl
, HiReg
, NVT
,
3192 InFlag
= ResHi
.getValue(2);
3194 ReplaceUses(SDValue(Node
, 1), ResHi
);
3195 LLVM_DEBUG(dbgs() << "=> "; ResHi
.getNode()->dump(CurDAG
);
3199 CurDAG
->RemoveDeadNode(Node
);
3205 case X86ISD::SDIVREM8_SEXT_HREG
:
3206 case X86ISD::UDIVREM8_ZEXT_HREG
: {
3207 SDValue N0
= Node
->getOperand(0);
3208 SDValue N1
= Node
->getOperand(1);
3211 bool isSigned
= (Opcode
== ISD::SDIVREM
||
3212 Opcode
== X86ISD::SDIVREM8_SEXT_HREG
);
3214 switch (NVT
.SimpleTy
) {
3215 default: llvm_unreachable("Unsupported VT!");
3216 case MVT::i8
: Opc
= X86::DIV8r
; MOpc
= X86::DIV8m
; break;
3217 case MVT::i16
: Opc
= X86::DIV16r
; MOpc
= X86::DIV16m
; break;
3218 case MVT::i32
: Opc
= X86::DIV32r
; MOpc
= X86::DIV32m
; break;
3219 case MVT::i64
: Opc
= X86::DIV64r
; MOpc
= X86::DIV64m
; break;
3222 switch (NVT
.SimpleTy
) {
3223 default: llvm_unreachable("Unsupported VT!");
3224 case MVT::i8
: Opc
= X86::IDIV8r
; MOpc
= X86::IDIV8m
; break;
3225 case MVT::i16
: Opc
= X86::IDIV16r
; MOpc
= X86::IDIV16m
; break;
3226 case MVT::i32
: Opc
= X86::IDIV32r
; MOpc
= X86::IDIV32m
; break;
3227 case MVT::i64
: Opc
= X86::IDIV64r
; MOpc
= X86::IDIV64m
; break;
3231 unsigned LoReg
, HiReg
, ClrReg
;
3232 unsigned SExtOpcode
;
3233 switch (NVT
.SimpleTy
) {
3234 default: llvm_unreachable("Unsupported VT!");
3236 LoReg
= X86::AL
; ClrReg
= HiReg
= X86::AH
;
3237 SExtOpcode
= X86::CBW
;
3240 LoReg
= X86::AX
; HiReg
= X86::DX
;
3242 SExtOpcode
= X86::CWD
;
3245 LoReg
= X86::EAX
; ClrReg
= HiReg
= X86::EDX
;
3246 SExtOpcode
= X86::CDQ
;
3249 LoReg
= X86::RAX
; ClrReg
= HiReg
= X86::RDX
;
3250 SExtOpcode
= X86::CQO
;
3254 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
3255 bool foldedLoad
= tryFoldLoad(Node
, N1
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
);
3256 bool signBitIsZero
= CurDAG
->SignBitIsZero(N0
);
3259 if (NVT
== MVT::i8
&& (!isSigned
|| signBitIsZero
)) {
3260 // Special case for div8, just use a move with zero extension to AX to
3261 // clear the upper 8 bits (AH).
3262 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, Move
, Chain
;
3263 if (tryFoldLoad(Node
, N0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
)) {
3264 SDValue Ops
[] = { Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, N0
.getOperand(0) };
3266 SDValue(CurDAG
->getMachineNode(X86::MOVZX32rm8
, dl
, MVT::i32
,
3267 MVT::Other
, Ops
), 0);
3268 Chain
= Move
.getValue(1);
3269 ReplaceUses(N0
.getValue(1), Chain
);
3272 SDValue(CurDAG
->getMachineNode(X86::MOVZX32rr8
, dl
, MVT::i32
, N0
),0);
3273 Chain
= CurDAG
->getEntryNode();
3275 Chain
= CurDAG
->getCopyToReg(Chain
, dl
, X86::EAX
, Move
, SDValue());
3276 InFlag
= Chain
.getValue(1);
3279 CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
,
3280 LoReg
, N0
, SDValue()).getValue(1);
3281 if (isSigned
&& !signBitIsZero
) {
3282 // Sign extend the low part into the high part.
3284 SDValue(CurDAG
->getMachineNode(SExtOpcode
, dl
, MVT::Glue
, InFlag
),0);
3286 // Zero out the high part, effectively zero extending the input.
3287 SDValue ClrNode
= SDValue(CurDAG
->getMachineNode(X86::MOV32r0
, dl
, NVT
), 0);
3288 switch (NVT
.SimpleTy
) {
3291 SDValue(CurDAG
->getMachineNode(
3292 TargetOpcode::EXTRACT_SUBREG
, dl
, MVT::i16
, ClrNode
,
3293 CurDAG
->getTargetConstant(X86::sub_16bit
, dl
,
3301 SDValue(CurDAG
->getMachineNode(
3302 TargetOpcode::SUBREG_TO_REG
, dl
, MVT::i64
,
3303 CurDAG
->getTargetConstant(0, dl
, MVT::i64
), ClrNode
,
3304 CurDAG
->getTargetConstant(X86::sub_32bit
, dl
,
3309 llvm_unreachable("Unexpected division source");
3312 InFlag
= CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
, ClrReg
,
3313 ClrNode
, InFlag
).getValue(1);
3318 SDValue Ops
[] = { Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, N1
.getOperand(0),
3320 MachineSDNode
*CNode
=
3321 CurDAG
->getMachineNode(MOpc
, dl
, MVT::Other
, MVT::Glue
, Ops
);
3322 InFlag
= SDValue(CNode
, 1);
3323 // Update the chain.
3324 ReplaceUses(N1
.getValue(1), SDValue(CNode
, 0));
3325 // Record the mem-refs
3326 CurDAG
->setNodeMemRefs(CNode
, {cast
<LoadSDNode
>(N1
)->getMemOperand()});
3329 SDValue(CurDAG
->getMachineNode(Opc
, dl
, MVT::Glue
, N1
, InFlag
), 0);
3332 // Prevent use of AH in a REX instruction by explicitly copying it to
3333 // an ABCD_L register.
3335 // The current assumption of the register allocator is that isel
3336 // won't generate explicit references to the GR8_ABCD_H registers. If
3337 // the allocator and/or the backend get enhanced to be more robust in
3338 // that regard, this can be, and should be, removed.
3339 if (HiReg
== X86::AH
&& !SDValue(Node
, 1).use_empty()) {
3340 SDValue AHCopy
= CurDAG
->getRegister(X86::AH
, MVT::i8
);
3341 unsigned AHExtOpcode
=
3342 isSigned
? X86::MOVSX32rr8_NOREX
: X86::MOVZX32rr8_NOREX
;
3344 SDNode
*RNode
= CurDAG
->getMachineNode(AHExtOpcode
, dl
, MVT::i32
,
3345 MVT::Glue
, AHCopy
, InFlag
);
3346 SDValue
Result(RNode
, 0);
3347 InFlag
= SDValue(RNode
, 1);
3349 if (Opcode
== X86ISD::UDIVREM8_ZEXT_HREG
||
3350 Opcode
== X86ISD::SDIVREM8_SEXT_HREG
) {
3351 assert(Node
->getValueType(1) == MVT::i32
&& "Unexpected result type!");
3354 CurDAG
->getTargetExtractSubreg(X86::sub_8bit
, dl
, MVT::i8
, Result
);
3356 ReplaceUses(SDValue(Node
, 1), Result
);
3357 LLVM_DEBUG(dbgs() << "=> "; Result
.getNode()->dump(CurDAG
);
3360 // Copy the division (low) result, if it is needed.
3361 if (!SDValue(Node
, 0).use_empty()) {
3362 SDValue Result
= CurDAG
->getCopyFromReg(CurDAG
->getEntryNode(), dl
,
3363 LoReg
, NVT
, InFlag
);
3364 InFlag
= Result
.getValue(2);
3365 ReplaceUses(SDValue(Node
, 0), Result
);
3366 LLVM_DEBUG(dbgs() << "=> "; Result
.getNode()->dump(CurDAG
);
3369 // Copy the remainder (high) result, if it is needed.
3370 if (!SDValue(Node
, 1).use_empty()) {
3371 SDValue Result
= CurDAG
->getCopyFromReg(CurDAG
->getEntryNode(), dl
,
3372 HiReg
, NVT
, InFlag
);
3373 InFlag
= Result
.getValue(2);
3374 ReplaceUses(SDValue(Node
, 1), Result
);
3375 LLVM_DEBUG(dbgs() << "=> "; Result
.getNode()->dump(CurDAG
);
3378 CurDAG
->RemoveDeadNode(Node
);
3383 SDValue N0
= Node
->getOperand(0);
3384 SDValue N1
= Node
->getOperand(1);
3386 if (N0
.getOpcode() == ISD::TRUNCATE
&& N0
.hasOneUse() &&
3387 hasNoSignedComparisonUses(Node
))
3388 N0
= N0
.getOperand(0);
3390 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
3391 // use a smaller encoding.
3392 // Look past the truncate if CMP is the only use of it.
3393 if (N0
.getOpcode() == ISD::AND
&&
3394 N0
.getNode()->hasOneUse() &&
3395 N0
.getValueType() != MVT::i8
&&
3396 X86::isZeroNode(N1
)) {
3397 ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(N0
.getOperand(1));
3399 uint64_t Mask
= C
->getZExtValue();
3405 if (isUInt
<8>(Mask
) &&
3406 (!(Mask
& 0x80) || hasNoSignedComparisonUses(Node
))) {
3407 // For example, convert "testl %eax, $8" to "testb %al, $8"
3409 SubRegOp
= X86::sub_8bit
;
3411 } else if (OptForMinSize
&& isUInt
<16>(Mask
) &&
3412 (!(Mask
& 0x8000) || hasNoSignedComparisonUses(Node
))) {
3413 // For example, "testl %eax, $32776" to "testw %ax, $32776".
3414 // NOTE: We only want to form TESTW instructions if optimizing for
3415 // min size. Otherwise we only save one byte and possibly get a length
3416 // changing prefix penalty in the decoders.
3418 SubRegOp
= X86::sub_16bit
;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        Op = X86::TEST32ri;
      } else {
        // No eligible transformation was found.
        break;
      }
      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Extract the subregister if necessary.
      if (N0.getValueType() != VT)
        Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

      // Emit the narrower TEST (testb/testw/testl).
      SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;
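    // (When both results are live, two PCMPISTR instructions are built from
    // the same operands; folding the load into only one of them would still
    // leave the other needing the loaded value in a register, so the load is
    // kept as a separate instruction in that case.)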
    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
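    // (PCMPESTRI/PCMPESTRM take the lengths of the two string operands
    // implicitly in EAX and EDX, so those length operands are copied into the
    // physical registers and glued into the compare below.)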
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  }

  SelectCode(Node);
}
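// Any node not handled by the custom cases above falls through to
// SelectCode(Node), the matcher that TableGen generates from the X86 .td
// patterns (X86GenDAGISel.inc).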
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_i:
    // FIXME: It seems strange that 'i' is needed here since it's supposed to
    // be an immediate and not a memory constraint.
    LLVM_FALLTHROUGH;
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}
/// This pass converts a legalized DAG into a X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}
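// For reference, the X86 pass pipeline registers this pass from
// X86PassConfig::addInstSelector() in X86TargetMachine.cpp, roughly:
//   addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
// (illustrative sketch of the call site, not code in this file).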