//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//

#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
  // This is really a union, discriminated by BaseType!
  const GlobalValue *GV;
  const BlockAddress *BlockAddr;
  unsigned Align;             // CP alignment.
  unsigned char SymbolFlags;  // X86II::MO_*

  X86ISelAddressMode()
      : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
        Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
        MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}

  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;

  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;

  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;

  void setBaseReg(SDValue Reg) {

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(DAG);
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n'
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(DAG);
    dbgs() << " Disp " << Disp << '\n'
    dbgs() << " JT" << JT << " Align" << Align << '\n';
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
class X86DAGToDAGISel final : public SelectionDAGISel {
  /// Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

  /// If true, selector should try to optimize for code size instead of
  /// If true, selector should try to optimize for minimum code size.

  /// Disable direct TLS access through segment registers.
  bool IndirectTlsSegRefs;

  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
      : SelectionDAGISel(tm, OptLevel), OptForSize(false),
        OptForMinSize(false) {}

  StringRef getPassName() const override {
    return "X86 DAG->DAG Instruction Selection";

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Reset the subtarget each time through.
    Subtarget = &MF.getSubtarget<X86Subtarget>();
    IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");
    SelectionDAGISel::runOnMachineFunction(MF);

  void EmitFunctionEntryCode() override;

  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

  void PreprocessISelDAG() override;
  void PostprocessISelDAG() override;

  // Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  void Select(SDNode *N) override;

  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                  SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                        SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
  bool selectLEAAddr(SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                         SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                           SDValue &Base, SDValue &Scale,
                           SDValue &Index, SDValue &Disp,
                           SDValue &NodeWithChain);
  bool selectRelocImm(SDValue N, SDValue &Op);

  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,

  // Convenience method where P is also root.
  bool tryFoldLoad(SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
    return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);

  /// Implement addressing mode selection for inline asm expressions.
  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                    unsigned ConstraintID,
                                    std::vector<SDValue> &OutOps) override;

  void emitSpecialCodeForMain();
  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                 SDValue &Base, SDValue &Scale,
                                 SDValue &Index, SDValue &Disp,
    Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
               ? CurDAG->getTargetFrameIndex(
                     TLI->getPointerTy(CurDAG->getDataLayout()))
    Scale = getI8Imm(AM.Scale, DL);

    // These are 32-bit even in 64-bit mode since RIP-relative offset
      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                           AM.Align, AM.Disp, AM.SymbolFlags);
      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
    } else if (AM.MCSym) {
      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
      assert(AM.SymbolFlags == 0 && "oo");
      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
    } else if (AM.JT != -1) {
      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
    } else if (AM.BlockAddr)
      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
      Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

    if (AM.Segment.getNode())
      Segment = AM.Segment;
      Segment = CurDAG->getRegister(0, MVT::i32);

  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size or not.
  // At a high level, we'd like to avoid such instructions when
  // we have similar constants used within the same basic block
  // that can be kept in a register.
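  //
  // Illustrative sketch (not from the original source): if the same 32-bit
  // constant is added to two different values in one block, e.g.
  //   movl $0x11223344, %ecx
  //   addl %ecx, %eax
  //   addl %ecx, %edx
  // the constant is materialized once in a register instead of a 4-byte
  // immediate being encoded into each add, which is smaller overall.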
  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
    uint32_t UseCount = 0;

    // Do not want to hoist if we're not optimizing for size.
    // TODO: We'd like to remove this restriction.
    // See the comment in X86InstrInfo.td for more info.

    // Walk all the users of the immediate.
    for (SDNode::use_iterator UI = N->use_begin(),
         UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {

      // This user is already selected. Count it as a legitimate use and
      if (User->isMachineOpcode()) {

      // We want to count stores of immediates as real uses.
      if (User->getOpcode() == ISD::STORE &&
          User->getOperand(1).getNode() == N) {

      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above).
      // Those instructions won't match in ISEL, for now, and would
      // be counted incorrectly.
      // This may change in the future as we add additional instruction
      if (User->getNumOperands() != 2)

      // Immediates that are used for offsets as part of stack
      // manipulation should be left alone. These are typically
      // used to indicate SP offsets for argument passing and
      // will get pulled into stores/pushes (implicitly).
      if (User->getOpcode() == X86ISD::ADD ||
          User->getOpcode() == ISD::ADD    ||
          User->getOpcode() == X86ISD::SUB ||
          User->getOpcode() == ISD::SUB) {

        // Find the other operand of the add/sub.
        SDValue OtherOp = User->getOperand(0);
        if (OtherOp.getNode() == N)
          OtherOp = User->getOperand(1);

        // Don't count if the other operand is SP.
        RegisterSDNode *RegNode;
        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
            (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
          if ((RegNode->getReg() == X86::ESP) ||
              (RegNode->getReg() == X86::RSP))

      // ... otherwise, count this and move on.

    // If we have more than 1 use, then recommend for hoisting.
    return (UseCount > 1);
  /// Return a target constant with the specified value of type i8.
  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);

  /// Return a target constant with the specified value, of type i32.
  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);

  /// Return a target constant with the specified value, of type i64.
  inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i64);

  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(1);
    MVT VecVT = N->getOperand(0).getSimpleValueType();
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);

  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);

  /// Return an SDNode that returns the value of the global base register.
  /// Output instructions required to initialize the global base register,
  SDNode *getGlobalBaseReg();

  /// Return a reference to the TargetMachine, casted to the target-specific
  const X86TargetMachine &getTargetMachine() const {
    return static_cast<const X86TargetMachine &>(TM);

  /// Return a reference to the TargetInstrInfo, casted to the target-specific
  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();

  /// Address-mode matching performs shift-of-and to and-of-shift
  /// reassociation in order to expose more scaled addressing
  bool ComplexPatternFuncMutatesDAG() const override {

  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

  /// Returns whether this is a relocatable immediate in the range
  /// [-2^Width .. 2^Width-1].
  template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
    if (auto *CN = dyn_cast<ConstantSDNode>(N))
      return isInt<Width>(CN->getSExtValue());
    return isSExtAbsoluteSymbolRef(Width, N);

  // Indicates we should prefer to use a non-temporal load for this load.
  bool useNonTemporalLoad(LoadSDNode *N) const {
    if (!N->isNonTemporal())
    unsigned StoreSize = N->getMemoryVT().getStoreSize();
    if (N->getAlignment() < StoreSize)
    default: llvm_unreachable("Unsupported store size");
      return Subtarget->hasSSE41();
      return Subtarget->hasAVX2();
      return Subtarget->hasAVX512();

  bool foldLoadStoreIntoMemOperand(SDNode *Node);
  MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
  bool matchBitExtract(SDNode *Node);
  bool shrinkAndImmediate(SDNode *N);
  bool isMaskZeroExtended(SDNode *N) const;
  bool tryShiftAmountMod(SDNode *N);

  MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node);
  MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node,

  bool tryOptimizeRem8Extend(SDNode *N);

  bool onlyUsesZeroFlag(SDValue Flags) const;
  bool hasNoSignFlagUses(SDValue Flags) const;
  bool hasNoCarryFlagUses(SDValue Flags) const;

// Returns true if this masked compare can be implemented legally with this
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_RND)

// Returns true if we can assume the writer of the mask has zero extended it
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (N.getOpcode() != ISD::LOAD)

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))

  // If N is a load, do additional profitability checks.
  switch (U->getOpcode()) {
    SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      //   movl 4(%esp), %eax
      //   addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))

      // If the other operand is a TLS address, we should fold it instead.
      //   leal i@NTPOFF(%eax), %eax
      //   movl $i@NTPOFF, %eax
      // if the block also has an access to a second TLS address this will save
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)

    // Don't fold load if this matches the BTS/BTR/BTC patterns.
    // BTS: (or X, (shl 1, n))
    // BTR: (and X, (rotl -2, n))
    // BTC: (xor X, (shl 1, n))
    if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
      if (U->getOperand(0).getOpcode() == ISD::SHL &&
          isOneConstant(U->getOperand(0).getOperand(0)))

      if (U->getOperand(1).getOpcode() == ISD::SHL &&
          isOneConstant(U->getOperand(1).getOperand(0)))

    if (U->getOpcode() == ISD::AND) {
      SDValue U0 = U->getOperand(0);
      SDValue U1 = U->getOperand(1);
      if (U0.getOpcode() == ISD::ROTL) {
        auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
        if (C && C->getSExtValue() == -2)

      if (U1.getOpcode() == ISD::ROTL) {
        auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
        if (C && C->getSExtValue() == -2)

    // Don't fold a load into a shift by immediate. The BMI2 instructions
    // support folding a load, but not an immediate. The legacy instructions
    // support folding an immediate, but can't fold a load. Folding an
    // immediate is preferable to folding a load.
    if (isa<ConstantSDNode>(U->getOperand(1)))

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
        Ops.push_back(Chain.getOperand(i));
    CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.push_back(NewChain);
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);

/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain; this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())

  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
    Chain = Chain.getOperand(0);

  if (!Chain.getNumOperands())
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
  if (Chain.getOperand(0).getNode() == Callee.getNode())
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
void X86DAGToDAGISel::PreprocessISelDAG() {
  // OptFor[Min]Size are used in pattern predicates that isel is matching.
  OptForSize = MF->getFunction().optForSize();
  OptForMinSize = MF->getFunction().optForMinSize();
  assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");

  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      CurDAG->DeleteNode(N);

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        !Subtarget->useRetpolineIndirectCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)

    MVT SrcVT = N->getOperand(0).getSimpleValueType();
    MVT DstVT = N->getSimpleValueType(0);

    // If any of the sources are vectors, no fp stack involved.
    if (SrcVT.isVector() || DstVT.isVector())

    // If the source and destination are SSE registers, then this is a legal
    // conversion that should not be lowered.
    const X86TargetLowering *X86Lowering =
        static_cast<const X86TargetLowering *>(TLI);
    bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
    bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
    if (SrcIsSSE && DstIsSSE)

    if (!SrcIsSSE && !DstIsSSE) {
      // If this is an FPStack extension, it is a noop.
      if (N->getOpcode() == ISD::FP_EXTEND)
      // If this is a value-preserving FPStack truncation, it is a noop.
      if (N->getConstantOperandVal(1))

    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
    // FPStack has extload and truncstore. SSE can fold direct loads into other
    // operations. Based on this, decide what we want to do.
    if (N->getOpcode() == ISD::FP_ROUND)
      MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      MemVT = SrcIsSSE ? SrcVT : DstVT;

    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);

    // FIXME: optimize the case where the src/dest is a load or store?
        CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
                              MemTmp, MachinePointerInfo(), MemVT);
    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                        MachinePointerInfo(), MemVT);

    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
    // extload we created. This will cause general havok on the dag because
    // anything below the conversion could be folded into other existing nodes.
    // To avoid invalidating 'I', back it up to the convert node.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);

    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    CurDAG->DeleteNode(N);
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
    ReplaceUses(N, Extend);
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())

    if (tryOptimizeRem8Extend(N)) {

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
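    //
    // Illustrative sketch (not from the original source), in AT&T syntax:
    //   andl  %esi, %edi
    //   testl %edi, %edi
    // can become
    //   testl %esi, %edi
    // because TEST sets the flags from the AND of its two operands, so the
    // separate AND is unnecessary when its result has no other users.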
    unsigned Opc = N->getMachineOpcode();
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode()) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
        ReplaceUses(N, Test);

      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
        case X86::AND8rm:  NewOpc = X86::TEST8mr;  break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;

        // Need to swap the memory and register operand.
        SDValue Ops[] = { And.getOperand(1),
                          And.getOperand(6)  /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        ReplaceUses(N, Test);

    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
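    //
    // Illustrative sketch (not from the original source), in AT&T syntax:
    //   kandw    %k1, %k2, %k0
    //   kortestw %k0, %k0
    // can become
    //   ktestw   %k1, %k2
    // when only ZF is consumed, since KTEST sets ZF iff the AND of its
    // operands is all zeros, which matches KORTEST of the AND result with
    // itself.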
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;

        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
        ReplaceUses(N, KTest);
    // Attempt to remove vector moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())

    // Make sure it's one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)

    // Producing instruction is another vector instruction. We can drop the
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));

    CurDAG->RemoveDeadNodes();
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);

void X86DAGToDAGISel::EmitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();

static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (AM.ES || AM.MCSym)

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))

bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
1132 /// mode. These wrap things that will resolve down into a symbol reference.
1133 /// If no match is possible, this returns true, otherwise it returns false.
1134 bool X86DAGToDAGISel::matchWrapper(SDValue N
, X86ISelAddressMode
&AM
) {
1135 // If the addressing mode already has a symbol as the displacement, we can
1136 // never match another symbol.
1137 if (AM
.hasSymbolicDisplacement())
1140 bool IsRIPRel
= N
.getOpcode() == X86ISD::WrapperRIP
;
1142 // We can't use an addressing mode in the 64-bit large code model. In the
1143 // medium code model, we use can use an mode when RIP wrappers are present.
1144 // That signifies access to globals that are known to be "near", such as the
1146 CodeModel::Model M
= TM
.getCodeModel();
1147 if (Subtarget
->is64Bit() &&
1148 (M
== CodeModel::Large
|| (M
== CodeModel::Medium
&& !IsRIPRel
)))
1151 // Base and index reg must be 0 in order to use %rip as base.
1152 if (IsRIPRel
&& AM
.hasBaseOrIndexReg())
1155 // Make a local copy in case we can't do this fold.
1156 X86ISelAddressMode Backup
= AM
;
1159 SDValue N0
= N
.getOperand(0);
1160 if (GlobalAddressSDNode
*G
= dyn_cast
<GlobalAddressSDNode
>(N0
)) {
1161 AM
.GV
= G
->getGlobal();
1162 AM
.SymbolFlags
= G
->getTargetFlags();
1163 Offset
= G
->getOffset();
1164 } else if (ConstantPoolSDNode
*CP
= dyn_cast
<ConstantPoolSDNode
>(N0
)) {
1165 AM
.CP
= CP
->getConstVal();
1166 AM
.Align
= CP
->getAlignment();
1167 AM
.SymbolFlags
= CP
->getTargetFlags();
1168 Offset
= CP
->getOffset();
1169 } else if (ExternalSymbolSDNode
*S
= dyn_cast
<ExternalSymbolSDNode
>(N0
)) {
1170 AM
.ES
= S
->getSymbol();
1171 AM
.SymbolFlags
= S
->getTargetFlags();
1172 } else if (auto *S
= dyn_cast
<MCSymbolSDNode
>(N0
)) {
1173 AM
.MCSym
= S
->getMCSymbol();
1174 } else if (JumpTableSDNode
*J
= dyn_cast
<JumpTableSDNode
>(N0
)) {
1175 AM
.JT
= J
->getIndex();
1176 AM
.SymbolFlags
= J
->getTargetFlags();
1177 } else if (BlockAddressSDNode
*BA
= dyn_cast
<BlockAddressSDNode
>(N0
)) {
1178 AM
.BlockAddr
= BA
->getBlockAddress();
1179 AM
.SymbolFlags
= BA
->getTargetFlags();
1180 Offset
= BA
->getOffset();
1182 llvm_unreachable("Unhandled symbol reference node.");
1184 if (foldOffsetIntoAddress(Offset
, AM
)) {
1190 AM
.setBaseReg(CurDAG
->getRegister(X86::RIP
, MVT::i64
));
1192 // Commit the changes now that we know this fold is safe.
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  if (TM.getCodeModel() == CodeModel::Small &&
      Subtarget->is64Bit() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr &&
      AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG &&
      AM.hasSymbolicDisplacement())
    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);

  N = Handle.getValue();
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());

// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
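//
// Illustrative instance (not from the original source), with C1 == 2:
//   (X >> 6) & 0x3fc   becomes   ((X >> 8) & 0xff) << 2
// so the final shl-by-2 can be expressed as a scale of 4 in the address.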
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))

  MVT VT = N.getSimpleValueType();
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);

  AM.Scale = (1 << ScaleLog);

// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
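//
// Illustrative instance (not from the original source), with C1 == 3 and
// C2 == 0xF8:
//   (X << 3) & 0xF8   becomes   (X & 0x1F) << 3
// and the resulting shl-by-3 can then be folded as a scale of 8.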
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        SDValue Shift, SDValue X,
                                        X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)

  MVT VT = N.getSimpleValueType();
  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     return *y + lookup_table[*y >> 11];
//
//   movzwl (%rdi), %eax
//   addl (%rsi,%rcx,4), %eax
//
//   movzwl (%rdi), %eax
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  unsigned MaskLZ = countLeadingZeros(Mask);
  unsigned MaskTZ = countTrailingZeros(Mask);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = MaskTZ;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  // We also need to ensure that mask is a continuous run of bits.
  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;

  APInt MaskedHighBits =
      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  KnownBits Known = DAG.computeKnownBits(X);
  if (MaskedHighBits != Known.Zero) return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);

  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewSRL;

// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
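//
// Illustrative instance (not from the original source), with SHIFT == 4,
// C1 == 2, MASK == 0xff:
//   (X >> 4) & 0x3fc   becomes   ((X >> 6) & 0xff) << 2
// where the srl+and maps onto BEXTR and the shl-by-2 becomes a scale of 4.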
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))

  // We need to ensure that mask is a continuous run of bits.
  if (!isShiftedMask_64(Mask)) return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = countTrailingZeros(Mask);

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  MVT VT = N.getSimpleValueType();
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewAnd;
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
    dbgs() << "MatchAddress: ";
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it. Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements. It isn't very important, but this should be fixed for
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)

    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))

  switch (N.getOpcode()) {
  case ISD::LOCAL_RECOVER: {
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();

  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))

    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();

    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)

    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        AM.Scale = 1 << Val;
        SDValue ShVal = N.getOperand(0);

        // Okay, we know that we have a scale by now. However, if the scaled
        // value is an add of something and a constant, we can fold the
        // constant into the disp field here.
        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
          AM.IndexReg = ShVal.getOperand(0);
          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
          if (!foldOffsetIntoAddress(Disp, AM))

          AM.IndexReg = ShVal;

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    if (X.getSimpleValueType().getSizeInBits() > 64) break;

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))

    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;

  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
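    // For instance (illustrative, not from the original source), x*5 can be
    // selected as x + x*4, i.e. an LEA with base x, index x and scale 4.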
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            ConstantSDNode *AddVal =
                cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
            Reg = N.getOperand(0);

          AM.IndexReg = AM.Base_Reg = Reg;

    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if A has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.
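    //
    // Illustrative sketch (not from the original source): for an address of
    // the form (GV + x) - y, the GV and x parts can land in the displacement
    // and base, while the index becomes (0 - y), matching the Zero/Neg nodes
    // built below.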
1707 // Add an artificial use to this node so that we can keep track of
1708 // it if it gets CSE'd with a different node.
1709 HandleSDNode
Handle(N
);
1711 // Test if the LHS of the sub can be folded.
1712 X86ISelAddressMode Backup
= AM
;
1713 if (matchAddressRecursively(N
.getOperand(0), AM
, Depth
+1)) {
1717 // Test if the index field is free for use.
1718 if (AM
.IndexReg
.getNode() || AM
.isRIPRelative()) {
1724 SDValue RHS
= Handle
.getValue().getOperand(1);
1725 // If the RHS involves a register with multiple uses, this
1726 // transformation incurs an extra mov, due to the neg instruction
1727 // clobbering its operand.
1728 if (!RHS
.getNode()->hasOneUse() ||
1729 RHS
.getNode()->getOpcode() == ISD::CopyFromReg
||
1730 RHS
.getNode()->getOpcode() == ISD::TRUNCATE
||
1731 RHS
.getNode()->getOpcode() == ISD::ANY_EXTEND
||
1732 (RHS
.getNode()->getOpcode() == ISD::ZERO_EXTEND
&&
1733 RHS
.getOperand(0).getValueType() == MVT::i32
))
1735 // If the base is a register with multiple uses, this
1736 // transformation may save a mov.
1737 // FIXME: Don't rely on DELETED_NODEs.
1738 if ((AM
.BaseType
== X86ISelAddressMode::RegBase
&& AM
.Base_Reg
.getNode() &&
1739 AM
.Base_Reg
->getOpcode() != ISD::DELETED_NODE
&&
1740 !AM
.Base_Reg
.getNode()->hasOneUse()) ||
1741 AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
1743 // If the folded LHS was interesting, this transformation saves
1744 // address arithmetic.
1745 if ((AM
.hasSymbolicDisplacement() && !Backup
.hasSymbolicDisplacement()) +
1746 ((AM
.Disp
!= 0) && (Backup
.Disp
== 0)) +
1747 (AM
.Segment
.getNode() && !Backup
.Segment
.getNode()) >= 2)
1749 // If it doesn't look like it may be an overall win, don't do it.
1755 // Ok, the transformation is legal and appears profitable. Go for it.
1756 SDValue Zero
= CurDAG
->getConstant(0, dl
, N
.getValueType());
1757 SDValue Neg
= CurDAG
->getNode(ISD::SUB
, dl
, N
.getValueType(), Zero
, RHS
);
1761 // Insert the new nodes into the topological ordering.
1762 insertDAGNode(*CurDAG
, Handle
.getValue(), Zero
);
1763 insertDAGNode(*CurDAG
, Handle
.getValue(), Neg
);
1768 if (!matchAdd(N
, AM
, Depth
))
1773 // We want to look through a transform in InstCombine and DAGCombiner that
1774 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
1775 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
1776 // An 'lea' can then be used to match the shift (multiply) and add:
1778 // lea (%rsi, %rdi, 8), %rax
1779 if (CurDAG
->haveNoCommonBitsSet(N
.getOperand(0), N
.getOperand(1)) &&
1780 !matchAdd(N
, AM
, Depth
))
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.
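    // Illustrative example: (and (shl X, 2), 0x3FC) is equivalent to
    // (shl (and X, 0xFF), 2), so the masked value can serve as the index
    // with Scale = 4.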
1788 // Scale must not be used already.
1789 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1) break;
1791 SDValue Shift
= N
.getOperand(0);
1792 if (Shift
.getOpcode() != ISD::SRL
&& Shift
.getOpcode() != ISD::SHL
) break;
1793 SDValue X
= Shift
.getOperand(0);
1795 // We only handle up to 64-bit values here as those are what matter for
1796 // addressing mode optimizations.
1797 if (X
.getSimpleValueType().getSizeInBits() > 64) break;
1799 if (!isa
<ConstantSDNode
>(N
.getOperand(1)))
1801 uint64_t Mask
= N
.getConstantOperandVal(1);
1803 // Try to fold the mask and shift into an extract and scale.
1804 if (!foldMaskAndShiftToExtract(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1807 // Try to fold the mask and shift directly into the scale.
1808 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1811 // Try to swap the mask and shift to place shifts which can be done as
1812 // a scale on the outside of the mask.
1813 if (!foldMaskedShiftToScaledMask(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
1816 // Try to fold the mask and shift into BEXTR and scale.
1817 if (!foldMaskedShiftToBEXTR(*CurDAG
, N
, Mask
, Shift
, X
, AM
, *Subtarget
))
1824 return matchAddressBase(N
, AM
);
/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
    // If so, check to see if the scale index register is set.
    if (!AM.IndexReg.getNode()) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base_Reg = N;
  return false;
}
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  }

  return matchAddressBase(N, AM);
}
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                       SDValue &Scale, SDValue &Index,
                                       SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
  AM.IndexReg = Mgs->getIndex();
  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();

  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
  if (AddrSpace == 256)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == 257)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == 258)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(N, AM))
    return false;

  MVT VT = N.getSimpleValueType();
  if (AM.BaseType == X86ISelAddressMode::RegBase) {
    if (!AM.Base_Reg.getNode())
      AM.Base_Reg = CurDAG->getRegister(0, VT);
  }

  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
  return true;
}
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;

  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
    if (AddrSpace == 256)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == 257)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == 258)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  if (matchAddress(N, AM))
    return false;

  MVT VT = N.getSimpleValueType();
  if (AM.BaseType == X86ISelAddressMode::RegBase) {
    if (!AM.Base_Reg.getNode())
      AM.Base_Reg = CurDAG->getRegister(0, VT);
  }

  if (!AM.IndexReg.getNode())
    AM.IndexReg = CurDAG->getRegister(0, VT);

  getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
  return true;
}
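// Illustrative note: the address-space mapping above means, for example, that
// a load from addrspace(257) in IR is selected as an %fs-relative access such
// as 'movl %fs:16, %eax'.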
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
  while (User != Root) {
    if (!User->hasOneUse())
      return false;
    User = *User->use_begin();
  }

  return true;
}
1961 /// Match a scalar SSE load. In particular, we want to match a load whose top
1962 /// elements are either undef or zeros. The load flavor is derived from the
1963 /// type of N, which is either v4f32 or v2f64.
1966 /// PatternChainNode: this is the matched node that has a chain input and
1968 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode
*Root
, SDNode
*Parent
,
1969 SDValue N
, SDValue
&Base
,
1970 SDValue
&Scale
, SDValue
&Index
,
1971 SDValue
&Disp
, SDValue
&Segment
,
1972 SDValue
&PatternNodeWithChain
) {
1973 if (!hasSingleUsesFromRoot(Root
, Parent
))
1976 // We can allow a full vector load here since narrowing a load is ok.
1977 if (ISD::isNON_EXTLoad(N
.getNode())) {
1978 PatternNodeWithChain
= N
;
1979 if (IsProfitableToFold(PatternNodeWithChain
, N
.getNode(), Root
) &&
1980 IsLegalToFold(PatternNodeWithChain
, Parent
, Root
, OptLevel
)) {
1981 LoadSDNode
*LD
= cast
<LoadSDNode
>(PatternNodeWithChain
);
1982 return selectAddr(LD
, LD
->getBasePtr(), Base
, Scale
, Index
, Disp
,
1987 // We can also match the special zero extended load opcode.
1988 if (N
.getOpcode() == X86ISD::VZEXT_LOAD
) {
1989 PatternNodeWithChain
= N
;
1990 if (IsProfitableToFold(PatternNodeWithChain
, N
.getNode(), Root
) &&
1991 IsLegalToFold(PatternNodeWithChain
, Parent
, Root
, OptLevel
)) {
1992 auto *MI
= cast
<MemIntrinsicSDNode
>(PatternNodeWithChain
);
1993 return selectAddr(MI
, MI
->getBasePtr(), Base
, Scale
, Index
, Disp
,
1998 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
1999 // once. Otherwise the load might get duplicated and the chain output of the
2000 // duplicate load will not be observed by all dependencies.
2001 if (N
.getOpcode() == ISD::SCALAR_TO_VECTOR
&& N
.getNode()->hasOneUse()) {
2002 PatternNodeWithChain
= N
.getOperand(0);
2003 if (ISD::isNON_EXTLoad(PatternNodeWithChain
.getNode()) &&
2004 IsProfitableToFold(PatternNodeWithChain
, N
.getNode(), Root
) &&
2005 IsLegalToFold(PatternNodeWithChain
, N
.getNode(), Root
, OptLevel
)) {
2006 LoadSDNode
*LD
= cast
<LoadSDNode
>(PatternNodeWithChain
);
2007 return selectAddr(LD
, LD
->getBasePtr(), Base
, Scale
, Index
, Disp
,
2012 // Also handle the case where we explicitly require zeros in the top
2013 // elements. This is a vector shuffle from the zero vector.
2014 if (N
.getOpcode() == X86ISD::VZEXT_MOVL
&& N
.getNode()->hasOneUse() &&
2015 // Check to see if the top elements are all zeros (or bitcast of zeros).
2016 N
.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR
&&
2017 N
.getOperand(0).getNode()->hasOneUse()) {
2018 PatternNodeWithChain
= N
.getOperand(0).getOperand(0);
2019 if (ISD::isNON_EXTLoad(PatternNodeWithChain
.getNode()) &&
2020 IsProfitableToFold(PatternNodeWithChain
, N
.getNode(), Root
) &&
2021 IsLegalToFold(PatternNodeWithChain
, N
.getNode(), Root
, OptLevel
)) {
2022 // Okay, this is a zero extending load. Fold it.
2023 LoadSDNode
*LD
= cast
<LoadSDNode
>(PatternNodeWithChain
);
2024 return selectAddr(LD
, LD
->getBasePtr(), Base
, Scale
, Index
, Disp
,
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    uint64_t ImmVal = CN->getZExtValue();
    if (!isUInt<32>(ImmVal))
      return false;

    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
    return true;
  }

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'.
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  if (N->getOpcode() != ISD::TargetGlobalAddress)
    return TM.getCodeModel() == CodeModel::Small;

  Optional<ConstantRange> CR =
      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
  if (!CR)
    return TM.getCodeModel() == CodeModel::Small;

  return CR->getUnsignedMax().ult(1ull << 32);
}
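// Illustrative note for the absolute-symbol case above: a symbol whose known
// ConstantRange fits in [0, 2^32) can be materialized with a 32-bit 'movl',
// relying on the implicit zero-extension of 32-bit operations on x86-64.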
2067 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N
, SDValue
&Base
,
2068 SDValue
&Scale
, SDValue
&Index
,
2069 SDValue
&Disp
, SDValue
&Segment
) {
2070 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2073 if (!selectLEAAddr(N
, Base
, Scale
, Index
, Disp
, Segment
))
2076 RegisterSDNode
*RN
= dyn_cast
<RegisterSDNode
>(Base
);
2077 if (RN
&& RN
->getReg() == 0)
2078 Base
= CurDAG
->getRegister(0, MVT::i64
);
2079 else if (Base
.getValueType() == MVT::i32
&& !dyn_cast
<FrameIndexSDNode
>(Base
)) {
2080 // Base could already be %rip, particularly in the x32 ABI.
2081 Base
= SDValue(CurDAG
->getMachineNode(
2082 TargetOpcode::SUBREG_TO_REG
, DL
, MVT::i64
,
2083 CurDAG
->getTargetConstant(0, DL
, MVT::i64
),
2085 CurDAG
->getTargetConstant(X86::sub_32bit
, DL
, MVT::i32
)),
2089 RN
= dyn_cast
<RegisterSDNode
>(Index
);
2090 if (RN
&& RN
->getReg() == 0)
2091 Index
= CurDAG
->getRegister(0, MVT::i64
);
2093 assert(Index
.getValueType() == MVT::i32
&&
2094 "Expect to be extending 32-bit registers for use in LEA");
2095 Index
= SDValue(CurDAG
->getMachineNode(
2096 TargetOpcode::SUBREG_TO_REG
, DL
, MVT::i64
,
2097 CurDAG
->getTargetConstant(0, DL
, MVT::i64
),
2099 CurDAG
->getTargetConstant(X86::sub_32bit
, DL
,
2107 /// Calls SelectAddr and determines if the maximal addressing
2108 /// mode it matches can be cost effectively emitted as an LEA instruction.
2109 bool X86DAGToDAGISel::selectLEAAddr(SDValue N
,
2110 SDValue
&Base
, SDValue
&Scale
,
2111 SDValue
&Index
, SDValue
&Disp
,
2113 X86ISelAddressMode AM
;
2115 // Save the DL and VT before calling matchAddress, it can invalidate N.
2117 MVT VT
= N
.getSimpleValueType();
2119 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2121 SDValue Copy
= AM
.Segment
;
2122 SDValue T
= CurDAG
->getRegister(0, MVT::i32
);
2124 if (matchAddress(N
, AM
))
2126 assert (T
== AM
.Segment
);
2129 unsigned Complexity
= 0;
2130 if (AM
.BaseType
== X86ISelAddressMode::RegBase
)
2131 if (AM
.Base_Reg
.getNode())
2134 AM
.Base_Reg
= CurDAG
->getRegister(0, VT
);
2135 else if (AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
2138 if (AM
.IndexReg
.getNode())
2141 AM
.IndexReg
= CurDAG
->getRegister(0, VT
);
2143 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2148 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2149 // to a LEA. This is determined with some experimentation but is by no means
2150 // optimal (especially for code size consideration). LEA is nice because of
2151 // its three-address nature. Tweak the cost function again when we can run
2152 // convertToThreeAddress() at register allocation time.
2153 if (AM
.hasSymbolicDisplacement()) {
2154 // For X86-64, always use LEA to materialize RIP-relative addresses.
2155 if (Subtarget
->is64Bit())
2161 if (AM
.Disp
&& (AM
.Base_Reg
.getNode() || AM
.IndexReg
.getNode()))
2164 // If it isn't worth using an LEA, reject it.
2165 if (Complexity
<= 2)
2168 getAddressOperands(AM
, DL
, Base
, Scale
, Index
, Disp
, Segment
);
2172 /// This is only run on TargetGlobalTLSAddress nodes.
2173 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N
, SDValue
&Base
,
2174 SDValue
&Scale
, SDValue
&Index
,
2175 SDValue
&Disp
, SDValue
&Segment
) {
2176 assert(N
.getOpcode() == ISD::TargetGlobalTLSAddress
);
2177 const GlobalAddressSDNode
*GA
= cast
<GlobalAddressSDNode
>(N
);
2179 X86ISelAddressMode AM
;
2180 AM
.GV
= GA
->getGlobal();
2181 AM
.Disp
+= GA
->getOffset();
2182 AM
.Base_Reg
= CurDAG
->getRegister(0, N
.getValueType());
2183 AM
.SymbolFlags
= GA
->getTargetFlags();
2185 if (N
.getValueType() == MVT::i32
) {
2187 AM
.IndexReg
= CurDAG
->getRegister(X86::EBX
, MVT::i32
);
2189 AM
.IndexReg
= CurDAG
->getRegister(0, MVT::i64
);
2192 getAddressOperands(AM
, SDLoc(N
), Base
, Scale
, Index
, Disp
, Segment
);
2196 bool X86DAGToDAGISel::selectRelocImm(SDValue N
, SDValue
&Op
) {
2197 if (auto *CN
= dyn_cast
<ConstantSDNode
>(N
)) {
2198 Op
= CurDAG
->getTargetConstant(CN
->getAPIntValue(), SDLoc(CN
),
2203 // Keep track of the original value type and whether this value was
2204 // truncated. If we see a truncation from pointer type to VT that truncates
2205 // bits that are known to be zero, we can use a narrow reference.
2206 EVT VT
= N
.getValueType();
2207 bool WasTruncated
= false;
2208 if (N
.getOpcode() == ISD::TRUNCATE
) {
2209 WasTruncated
= true;
2210 N
= N
.getOperand(0);
2213 if (N
.getOpcode() != X86ISD::Wrapper
)
2216 // We can only use non-GlobalValues as immediates if they were not truncated,
2217 // as we do not have any range information. If we have a GlobalValue and the
2218 // address was not truncated, we can select it as an operand directly.
2219 unsigned Opc
= N
.getOperand(0)->getOpcode();
2220 if (Opc
!= ISD::TargetGlobalAddress
|| !WasTruncated
) {
2221 Op
= N
.getOperand(0);
2222 // We can only select the operand directly if we didn't have to look past a
2224 return !WasTruncated
;
2227 // Check that the global's range fits into VT.
2228 auto *GA
= cast
<GlobalAddressSDNode
>(N
.getOperand(0));
2229 Optional
<ConstantRange
> CR
= GA
->getGlobal()->getAbsoluteSymbolRange();
2230 if (!CR
|| CR
->getUnsignedMax().uge(1ull << VT
.getSizeInBits()))
2233 // Okay, we can use a narrow reference.
2234 Op
= CurDAG
->getTargetGlobalAddress(GA
->getGlobal(), SDLoc(N
), VT
,
2235 GA
->getOffset(), GA
->getTargetFlags());
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  auto &DL = MF->getDataLayout();
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  return CR && CR->getSignedMin().sge(-1ull << Width) &&
         CR->getSignedMax().slt(1ull << Width);
}
static X86::CondCode getCondFromOpc(unsigned Opc) {
  X86::CondCode CC = X86::COND_INVALID;
  if (CC == X86::COND_INVALID)
    CC = X86::getCondFromBranchOpc(Opc);
  if (CC == X86::COND_INVALID)
    CC = X86::getCondFromSETOpc(Opc);
  if (CC == X86::COND_INVALID)
    CC = X86::getCondFromCMovOpc(Opc);

  return CC;
}
/// Test whether the given X86ISD::CMP node has any users that use a flag
/// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (UI->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode());

      switch (CC) {
      // Comparisons which only use the zero flag.
      case X86::COND_E: case X86::COND_NE:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}
2324 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2325 /// flag to be accurate.
2326 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags
) const {
2327 // Examine each user of the node.
2328 for (SDNode::use_iterator UI
= Flags
->use_begin(), UE
= Flags
->use_end();
2330 // Only check things that use the flags.
2331 if (UI
.getUse().getResNo() != Flags
.getResNo())
2333 // Only examine CopyToReg uses that copy to EFLAGS.
2334 if (UI
->getOpcode() != ISD::CopyToReg
||
2335 cast
<RegisterSDNode
>(UI
->getOperand(1))->getReg() != X86::EFLAGS
)
2337 // Examine each user of the CopyToReg use.
2338 for (SDNode::use_iterator FlagUI
= UI
->use_begin(),
2339 FlagUE
= UI
->use_end(); FlagUI
!= FlagUE
; ++FlagUI
) {
2340 // Only examine the Flag result.
2341 if (FlagUI
.getUse().getResNo() != 1) continue;
2342 // Anything unusual: assume conservatively.
2343 if (!FlagUI
->isMachineOpcode()) return false;
2344 // Examine the condition code of the user.
2345 X86::CondCode CC
= getCondFromOpc(FlagUI
->getMachineOpcode());
2348 // Comparisons which don't examine the SF flag.
2349 case X86::COND_A
: case X86::COND_AE
:
2350 case X86::COND_B
: case X86::COND_BE
:
2351 case X86::COND_E
: case X86::COND_NE
:
2352 case X86::COND_O
: case X86::COND_NO
:
2353 case X86::COND_P
: case X86::COND_NP
:
2355 // Anything else: assume conservatively.
static bool mayUseCarryFlag(X86::CondCode CC) {
  switch (CC) {
  // Comparisons which don't examine the CF flag.
  case X86::COND_O: case X86::COND_NO:
  case X86::COND_E: case X86::COND_NE:
  case X86::COND_S: case X86::COND_NS:
  case X86::COND_P: case X86::COND_NP:
  case X86::COND_L: case X86::COND_GE:
  case X86::COND_G: case X86::COND_LE:
    return false;
  // Anything else: assume conservatively.
  default:
    return true;
  }
}
2380 /// Test whether the given node which sets flags has any uses which require the
2381 /// CF flag to be accurate.
2382 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags
) const {
2383 // Examine each user of the node.
2384 for (SDNode::use_iterator UI
= Flags
->use_begin(), UE
= Flags
->use_end();
2386 // Only check things that use the flags.
2387 if (UI
.getUse().getResNo() != Flags
.getResNo())
2390 unsigned UIOpc
= UI
->getOpcode();
2392 if (UIOpc
== ISD::CopyToReg
) {
2393 // Only examine CopyToReg uses that copy to EFLAGS.
2394 if (cast
<RegisterSDNode
>(UI
->getOperand(1))->getReg() != X86::EFLAGS
)
2396 // Examine each user of the CopyToReg use.
2397 for (SDNode::use_iterator FlagUI
= UI
->use_begin(), FlagUE
= UI
->use_end();
2398 FlagUI
!= FlagUE
; ++FlagUI
) {
2399 // Only examine the Flag result.
2400 if (FlagUI
.getUse().getResNo() != 1)
2402 // Anything unusual: assume conservatively.
2403 if (!FlagUI
->isMachineOpcode())
2405 // Examine the condition code of the user.
2406 X86::CondCode CC
= getCondFromOpc(FlagUI
->getMachineOpcode());
2408 if (mayUseCarryFlag(CC
))
2412 // This CopyToReg is ok. Move on to the next user.
2416 // This might be an unselected node. So look for the pre-isel opcodes that
2421 // Something unusual. Be conservative.
2423 case X86ISD::SETCC
: CCOpNo
= 0; break;
2424 case X86ISD::SETCC_CARRY
: CCOpNo
= 0; break;
2425 case X86ISD::CMOV
: CCOpNo
= 2; break;
2426 case X86ISD::BRCOND
: CCOpNo
= 2; break;
2429 X86::CondCode CC
= (X86::CondCode
)UI
->getConstantOperandVal(CCOpNo
);
2430 if (mayUseCarryFlag(CC
))
// Check whether or not the chain ending in StoreNode is suitable for doing
// the {load; op; store} to modify transformation.
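// Illustrative example: (store (add (load [M]), 42), [M]), where the loaded
// value and the ADD have no other uses, can be rewritten to operate on memory
// directly, e.g. 'addl $42, (%rdi)'.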
2438 static bool isFusableLoadOpStorePattern(StoreSDNode
*StoreNode
,
2439 SDValue StoredVal
, SelectionDAG
*CurDAG
,
2441 LoadSDNode
*&LoadNode
,
2442 SDValue
&InputChain
) {
2443 // Is the stored value result 0 of the operation?
2444 if (StoredVal
.getResNo() != 0) return false;
2446 // Are there other uses of the operation other than the store?
2447 if (!StoredVal
.getNode()->hasNUsesOfValue(1, 0)) return false;
2449 // Is the store non-extending and non-indexed?
2450 if (!ISD::isNormalStore(StoreNode
) || StoreNode
->isNonTemporal())
2453 SDValue Load
= StoredVal
->getOperand(LoadOpNo
);
2454 // Is the stored value a non-extending and non-indexed load?
2455 if (!ISD::isNormalLoad(Load
.getNode())) return false;
2457 // Return LoadNode by reference.
2458 LoadNode
= cast
<LoadSDNode
>(Load
);
2460 // Is store the only read of the loaded value?
2461 if (!Load
.hasOneUse())
2464 // Is the address of the store the same as the load?
2465 if (LoadNode
->getBasePtr() != StoreNode
->getBasePtr() ||
2466 LoadNode
->getOffset() != StoreNode
->getOffset())
2469 bool FoundLoad
= false;
2470 SmallVector
<SDValue
, 4> ChainOps
;
2471 SmallVector
<const SDNode
*, 4> LoopWorklist
;
2472 SmallPtrSet
<const SDNode
*, 16> Visited
;
2473 const unsigned int Max
= 1024;
2475 // Visualization of Load-Op-Store fusion:
2476 // -------------------------
2478 // *-lines = Chain operand dependencies.
2479 // |-lines = Normal operand dependencies.
2480 // Dependencies flow down and right. n-suffix references multiple nodes.
2488 // * * \ | => A--LD_OP_ST
2496 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2500 // Ensure the transform is safe by checking for the dual
2501 // dependencies to make sure we do not induce a loop.
2503 // As LD is a predecessor to both OP and ST we can do this by checking:
2504 // a). if LD is a predecessor to a member of Xn or Yn.
2505 // b). if a Zn is a predecessor to ST.
2507 // However, (b) can only occur through being a chain predecessor to
2508 // ST, which is the same as Zn being a member or predecessor of Xn,
2509 // which is a subset of LD being a predecessor of Xn. So it's
2510 // subsumed by check (a).
2512 SDValue Chain
= StoreNode
->getChain();
2514 // Gather X elements in ChainOps.
2515 if (Chain
== Load
.getValue(1)) {
2517 ChainOps
.push_back(Load
.getOperand(0));
2518 } else if (Chain
.getOpcode() == ISD::TokenFactor
) {
2519 for (unsigned i
= 0, e
= Chain
.getNumOperands(); i
!= e
; ++i
) {
2520 SDValue Op
= Chain
.getOperand(i
);
2521 if (Op
== Load
.getValue(1)) {
2523 // Drop Load, but keep its chain. No cycle check necessary.
2524 ChainOps
.push_back(Load
.getOperand(0));
2527 LoopWorklist
.push_back(Op
.getNode());
2528 ChainOps
.push_back(Op
);
2535 // Worklist is currently Xn. Add Yn to worklist.
2536 for (SDValue Op
: StoredVal
->ops())
2537 if (Op
.getNode() != LoadNode
)
2538 LoopWorklist
.push_back(Op
.getNode());
2540 // Check (a) if Load is a predecessor to Xn + Yn
2541 if (SDNode::hasPredecessorHelper(Load
.getNode(), Visited
, LoopWorklist
, Max
,
2546 CurDAG
->getNode(ISD::TokenFactor
, SDLoc(Chain
), MVT::Other
, ChainOps
);
2550 // Change a chain of {load; op; store} of the same value into a simple op
2551 // through memory of that value, if the uses of the modified value and its
2552 // address are suitable.
2554 // The tablegen pattern memory operand pattern is currently not able to match
2555 // the case where the EFLAGS on the original operation are used.
2557 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2558 // be transferred from a node in the pattern to the result node, probably with
2559 // a new keyword. For example, we have this
2560 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2561 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2562 // (implicit EFLAGS)]>;
2563 // but maybe need something like this
2564 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2565 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2566 // (transferrable EFLAGS)]>;
2568 // Until then, we manually fold these and instruction select the operation
2570 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode
*Node
) {
2571 StoreSDNode
*StoreNode
= cast
<StoreSDNode
>(Node
);
2572 SDValue StoredVal
= StoreNode
->getOperand(1);
2573 unsigned Opc
= StoredVal
->getOpcode();
2575 // Before we try to select anything, make sure this is memory operand size
2576 // and opcode we can handle. Note that this must match the code below that
2577 // actually lowers the opcodes.
2578 EVT MemVT
= StoreNode
->getMemoryVT();
2579 if (MemVT
!= MVT::i64
&& MemVT
!= MVT::i32
&& MemVT
!= MVT::i16
&&
2583 bool IsCommutable
= false;
2595 IsCommutable
= true;
2599 unsigned LoadOpNo
= 0;
2600 LoadSDNode
*LoadNode
= nullptr;
2602 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
2603 LoadNode
, InputChain
)) {
2607 // This operation is commutable, try the other operand.
2609 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
2610 LoadNode
, InputChain
))
2614 SDValue Base
, Scale
, Index
, Disp
, Segment
;
2615 if (!selectAddr(LoadNode
, LoadNode
->getBasePtr(), Base
, Scale
, Index
, Disp
,
2619 auto SelectOpcode
= [&](unsigned Opc64
, unsigned Opc32
, unsigned Opc16
,
2621 switch (MemVT
.getSimpleVT().SimpleTy
) {
2631 llvm_unreachable("Invalid size!");
2635 MachineSDNode
*Result
;
2639 // Try to match inc/dec.
2640 if (!Subtarget
->slowIncDec() ||
2641 CurDAG
->getMachineFunction().getFunction().optForSize()) {
2642 bool IsOne
= isOneConstant(StoredVal
.getOperand(1));
2643 bool IsNegOne
= isAllOnesConstant(StoredVal
.getOperand(1));
    // ADD/SUB by 1/-1 where the carry flag isn't used can use INC/DEC.
2645 if ((IsOne
|| IsNegOne
) && hasNoCarryFlagUses(StoredVal
.getValue(1))) {
2647 ((Opc
== X86ISD::ADD
) == IsOne
)
2648 ? SelectOpcode(X86::INC64m
, X86::INC32m
, X86::INC16m
, X86::INC8m
)
2649 : SelectOpcode(X86::DEC64m
, X86::DEC32m
, X86::DEC16m
, X86::DEC8m
);
2650 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
, Segment
, InputChain
};
2651 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
,
2662 auto SelectRegOpcode
= [SelectOpcode
](unsigned Opc
) {
2665 return SelectOpcode(X86::ADD64mr
, X86::ADD32mr
, X86::ADD16mr
,
2668 return SelectOpcode(X86::ADC64mr
, X86::ADC32mr
, X86::ADC16mr
,
2671 return SelectOpcode(X86::SUB64mr
, X86::SUB32mr
, X86::SUB16mr
,
2674 return SelectOpcode(X86::SBB64mr
, X86::SBB32mr
, X86::SBB16mr
,
2677 return SelectOpcode(X86::AND64mr
, X86::AND32mr
, X86::AND16mr
,
2680 return SelectOpcode(X86::OR64mr
, X86::OR32mr
, X86::OR16mr
, X86::OR8mr
);
2682 return SelectOpcode(X86::XOR64mr
, X86::XOR32mr
, X86::XOR16mr
,
2685 llvm_unreachable("Invalid opcode!");
2688 auto SelectImm8Opcode
= [SelectOpcode
](unsigned Opc
) {
2691 return SelectOpcode(X86::ADD64mi8
, X86::ADD32mi8
, X86::ADD16mi8
, 0);
2693 return SelectOpcode(X86::ADC64mi8
, X86::ADC32mi8
, X86::ADC16mi8
, 0);
2695 return SelectOpcode(X86::SUB64mi8
, X86::SUB32mi8
, X86::SUB16mi8
, 0);
2697 return SelectOpcode(X86::SBB64mi8
, X86::SBB32mi8
, X86::SBB16mi8
, 0);
2699 return SelectOpcode(X86::AND64mi8
, X86::AND32mi8
, X86::AND16mi8
, 0);
2701 return SelectOpcode(X86::OR64mi8
, X86::OR32mi8
, X86::OR16mi8
, 0);
2703 return SelectOpcode(X86::XOR64mi8
, X86::XOR32mi8
, X86::XOR16mi8
, 0);
2705 llvm_unreachable("Invalid opcode!");
2708 auto SelectImmOpcode
= [SelectOpcode
](unsigned Opc
) {
2711 return SelectOpcode(X86::ADD64mi32
, X86::ADD32mi
, X86::ADD16mi
,
2714 return SelectOpcode(X86::ADC64mi32
, X86::ADC32mi
, X86::ADC16mi
,
2717 return SelectOpcode(X86::SUB64mi32
, X86::SUB32mi
, X86::SUB16mi
,
2720 return SelectOpcode(X86::SBB64mi32
, X86::SBB32mi
, X86::SBB16mi
,
2723 return SelectOpcode(X86::AND64mi32
, X86::AND32mi
, X86::AND16mi
,
2726 return SelectOpcode(X86::OR64mi32
, X86::OR32mi
, X86::OR16mi
,
2729 return SelectOpcode(X86::XOR64mi32
, X86::XOR32mi
, X86::XOR16mi
,
2732 llvm_unreachable("Invalid opcode!");
2736 unsigned NewOpc
= SelectRegOpcode(Opc
);
2737 SDValue Operand
= StoredVal
->getOperand(1-LoadOpNo
);
2739 // See if the operand is a constant that we can fold into an immediate
2741 if (auto *OperandC
= dyn_cast
<ConstantSDNode
>(Operand
)) {
2742 auto OperandV
= OperandC
->getAPIntValue();
2744 // Check if we can shrink the operand enough to fit in an immediate (or
2745 // fit into a smaller immediate) by negating it and switching the
2747 if ((Opc
== X86ISD::ADD
|| Opc
== X86ISD::SUB
) &&
2748 ((MemVT
!= MVT::i8
&& OperandV
.getMinSignedBits() > 8 &&
2749 (-OperandV
).getMinSignedBits() <= 8) ||
2750 (MemVT
== MVT::i64
&& OperandV
.getMinSignedBits() > 32 &&
2751 (-OperandV
).getMinSignedBits() <= 32)) &&
2752 hasNoCarryFlagUses(StoredVal
.getValue(1))) {
2753 OperandV
= -OperandV
;
2754 Opc
= Opc
== X86ISD::ADD
? X86ISD::SUB
: X86ISD::ADD
;
2757 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
2758 // the larger immediate operand.
2759 if (MemVT
!= MVT::i8
&& OperandV
.getMinSignedBits() <= 8) {
2760 Operand
= CurDAG
->getTargetConstant(OperandV
, SDLoc(Node
), MemVT
);
2761 NewOpc
= SelectImm8Opcode(Opc
);
2762 } else if (OperandV
.getActiveBits() <= MemVT
.getSizeInBits() &&
2763 (MemVT
!= MVT::i64
|| OperandV
.getMinSignedBits() <= 32)) {
2764 Operand
= CurDAG
->getTargetConstant(OperandV
, SDLoc(Node
), MemVT
);
2765 NewOpc
= SelectImmOpcode(Opc
);
2769 if (Opc
== X86ISD::ADC
|| Opc
== X86ISD::SBB
) {
2771 CurDAG
->getCopyToReg(InputChain
, SDLoc(Node
), X86::EFLAGS
,
2772 StoredVal
.getOperand(2), SDValue());
2774 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
2775 Segment
, Operand
, CopyTo
, CopyTo
.getValue(1)};
2776 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
2779 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
2780 Segment
, Operand
, InputChain
};
2781 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
2787 llvm_unreachable("Invalid opcode!");
2790 MachineMemOperand
*MemOps
[] = {StoreNode
->getMemOperand(),
2791 LoadNode
->getMemOperand()};
2792 CurDAG
->setNodeMemRefs(Result
, MemOps
);
2794 // Update Load Chain uses as well.
2795 ReplaceUses(SDValue(LoadNode
, 1), SDValue(Result
, 1));
2796 ReplaceUses(SDValue(StoreNode
, 0), SDValue(Result
, 1));
2797 ReplaceUses(SDValue(StoredVal
.getNode(), 1), SDValue(Result
, 0));
2798 CurDAG
->RemoveDeadNode(Node
);
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
//   a) x &  (1 << nbits) - 1
//   b) x & ~(-1 << nbits)
//   c) x &  (-1 >> (32 - y))
//   d) x << (32 - y) >> (32 - y)
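// Illustrative example: with nbits = 8, patterns (a) and (b) both reduce to
// x & 0xff, i.e. extracting the low 8 bits, which BZHI can compute directly
// from the bit count without materializing the mask.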
2808 bool X86DAGToDAGISel::matchBitExtract(SDNode
*Node
) {
2810 (Node
->getOpcode() == ISD::AND
|| Node
->getOpcode() == ISD::SRL
) &&
2811 "Should be either an and-mask, or right-shift after clearing high bits.");
2813 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
2814 if (!Subtarget
->hasBMI() && !Subtarget
->hasBMI2())
2817 MVT NVT
= Node
->getSimpleValueType(0);
2819 // Only supported for 32 and 64 bits.
2820 if (NVT
!= MVT::i32
&& NVT
!= MVT::i64
)
2823 unsigned Size
= NVT
.getSizeInBits();
  // If we have BMI2's BZHI, we are ok with multi-use patterns.
2828 // Else, if we only have BMI1's BEXTR, we require one-use.
2829 const bool CanHaveExtraUses
= Subtarget
->hasBMI2();
2830 auto checkUses
= [CanHaveExtraUses
](SDValue Op
, unsigned NUses
) {
2831 return CanHaveExtraUses
||
2832 Op
.getNode()->hasNUsesOfValue(NUses
, Op
.getResNo());
2834 auto checkOneUse
= [checkUses
](SDValue Op
) { return checkUses(Op
, 1); };
2835 auto checkTwoUse
= [checkUses
](SDValue Op
) { return checkUses(Op
, 2); };
2837 // a) x & ((1 << nbits) + (-1))
2838 auto matchPatternA
= [&checkOneUse
, &NBits
](SDValue Mask
) -> bool {
2839 // Match `add`. Must only have one use!
2840 if (Mask
->getOpcode() != ISD::ADD
|| !checkOneUse(Mask
))
2842 // We should be adding all-ones constant (i.e. subtracting one.)
2843 if (!isAllOnesConstant(Mask
->getOperand(1)))
2845 // Match `1 << nbits`. Must only have one use!
2846 SDValue M0
= Mask
->getOperand(0);
2847 if (M0
->getOpcode() != ISD::SHL
|| !checkOneUse(M0
))
2849 if (!isOneConstant(M0
->getOperand(0)))
2851 NBits
= M0
->getOperand(1);
2855 // b) x & ~(-1 << nbits)
2856 auto matchPatternB
= [&checkOneUse
, &NBits
](SDValue Mask
) -> bool {
2857 // Match `~()`. Must only have one use!
2858 if (!isBitwiseNot(Mask
) || !checkOneUse(Mask
))
2860 // Match `-1 << nbits`. Must only have one use!
2861 SDValue M0
= Mask
->getOperand(0);
2862 if (M0
->getOpcode() != ISD::SHL
|| !checkOneUse(M0
))
2864 if (!isAllOnesConstant(M0
->getOperand(0)))
2866 NBits
= M0
->getOperand(1);
2870 // Match potentially-truncated (bitwidth - y)
2871 auto matchShiftAmt
= [checkOneUse
, Size
, &NBits
](SDValue ShiftAmt
) {
2872 // Skip over a truncate of the shift amount.
2873 if (ShiftAmt
.getOpcode() == ISD::TRUNCATE
) {
2874 ShiftAmt
= ShiftAmt
.getOperand(0);
2875 // The trunc should have been the only user of the real shift amount.
2876 if (!checkOneUse(ShiftAmt
))
2879 // Match the shift amount as: (bitwidth - y). It should go away, too.
2880 if (ShiftAmt
.getOpcode() != ISD::SUB
)
2882 auto V0
= dyn_cast
<ConstantSDNode
>(ShiftAmt
.getOperand(0));
2883 if (!V0
|| V0
->getZExtValue() != Size
)
2885 NBits
= ShiftAmt
.getOperand(1);
2889 // c) x & (-1 >> (32 - y))
2890 auto matchPatternC
= [&checkOneUse
, matchShiftAmt
](SDValue Mask
) -> bool {
2891 // Match `l>>`. Must only have one use!
2892 if (Mask
.getOpcode() != ISD::SRL
|| !checkOneUse(Mask
))
2894 // We should be shifting all-ones constant.
2895 if (!isAllOnesConstant(Mask
.getOperand(0)))
2897 SDValue M1
= Mask
.getOperand(1);
2898 // The shift amount should not be used externally.
2899 if (!checkOneUse(M1
))
2901 return matchShiftAmt(M1
);
2906 // d) x << (32 - y) >> (32 - y)
2907 auto matchPatternD
= [&checkOneUse
, &checkTwoUse
, matchShiftAmt
,
2908 &X
](SDNode
*Node
) -> bool {
2909 if (Node
->getOpcode() != ISD::SRL
)
2911 SDValue N0
= Node
->getOperand(0);
2912 if (N0
->getOpcode() != ISD::SHL
|| !checkOneUse(N0
))
2914 SDValue N1
= Node
->getOperand(1);
2915 SDValue N01
= N0
->getOperand(1);
2916 // Both of the shifts must be by the exact same value.
2917 // There should not be any uses of the shift amount outside of the pattern.
2918 if (N1
!= N01
|| !checkTwoUse(N1
))
2920 if (!matchShiftAmt(N1
))
2922 X
= N0
->getOperand(0);
2926 auto matchLowBitMask
= [&matchPatternA
, &matchPatternB
,
2927 &matchPatternC
](SDValue Mask
) -> bool {
2928 // FIXME: pattern c.
2929 return matchPatternA(Mask
) || matchPatternB(Mask
) || matchPatternC(Mask
);
2932 if (Node
->getOpcode() == ISD::AND
) {
2933 X
= Node
->getOperand(0);
2934 SDValue Mask
= Node
->getOperand(1);
2936 if (matchLowBitMask(Mask
)) {
2940 if (!matchLowBitMask(Mask
))
2943 } else if (!matchPatternD(Node
))
  // If we do *NOT* have BMI2, let's find out if the 'X' is *logically*
  // shifted (potentially with a one-use trunc in between),
  // and if so look past the one-use truncation.
2952 if (!Subtarget
->hasBMI2() && X
.getOpcode() == ISD::TRUNCATE
&&
2953 X
.hasOneUse() && X
.getOperand(0).getOpcode() == ISD::SRL
) {
2954 assert(NVT
== MVT::i32
&& "Expected target valuetype to be i32");
2955 X
= X
.getOperand(0);
2956 XVT
= X
.getSimpleValueType();
2957 assert(XVT
== MVT::i64
&& "Expected truncation from i64");
2960 SDValue OrigNBits
= NBits
;
2961 // Truncate the shift amount.
2962 NBits
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, MVT::i8
, NBits
);
2963 insertDAGNode(*CurDAG
, OrigNBits
, NBits
);
2965 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
2966 // All the other bits are undefined, we do not care about them.
2967 SDValue ImplDef
= SDValue(
2968 CurDAG
->getMachineNode(TargetOpcode::IMPLICIT_DEF
, DL
, MVT::i32
), 0);
2969 insertDAGNode(*CurDAG
, OrigNBits
, ImplDef
);
2970 NBits
= CurDAG
->getTargetInsertSubreg(X86::sub_8bit
, DL
, MVT::i32
, ImplDef
,
2972 insertDAGNode(*CurDAG
, OrigNBits
, NBits
);
2974 if (Subtarget
->hasBMI2()) {
    // Great, just emit the BZHI.
2976 if (XVT
!= MVT::i32
) {
2977 // But have to place the bit count into the wide-enough register first.
2978 SDValue ImplDef
= SDValue(
2979 CurDAG
->getMachineNode(TargetOpcode::IMPLICIT_DEF
, DL
, XVT
), 0);
2980 insertDAGNode(*CurDAG
, OrigNBits
, ImplDef
);
2981 NBits
= CurDAG
->getTargetInsertSubreg(X86::sub_32bit
, DL
, XVT
, ImplDef
,
2983 insertDAGNode(*CurDAG
, OrigNBits
, NBits
);
2986 SDValue Extract
= CurDAG
->getNode(X86ISD::BZHI
, DL
, XVT
, X
, NBits
);
2987 ReplaceNode(Node
, Extract
.getNode());
2988 SelectCode(Extract
.getNode());
2992 // Else, emitting BEXTR requires one more step.
2993 // The 'control' of BEXTR has the pattern of:
2994 // [15...8 bit][ 7...0 bit] location
2995 // [ bit count][ shift] name
2996 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
2998 // Shift NBits left by 8 bits, thus producing 'control'.
2999 // This makes the low 8 bits to be zero.
3000 SDValue C8
= CurDAG
->getConstant(8, DL
, MVT::i8
);
3001 SDValue Control
= CurDAG
->getNode(ISD::SHL
, DL
, MVT::i32
, NBits
, C8
);
3002 insertDAGNode(*CurDAG
, OrigNBits
, Control
);
3004 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3005 if (X
.getOpcode() == ISD::SRL
) {
3006 SDValue ShiftAmt
= X
.getOperand(1);
3007 X
= X
.getOperand(0);
3009 assert(ShiftAmt
.getValueType() == MVT::i8
&&
3010 "Expected shift amount to be i8");
3012 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3013 // We could zext to i16 in some form, but we intentionally don't do that.
3014 SDValue OrigShiftAmt
= ShiftAmt
;
3015 ShiftAmt
= CurDAG
->getNode(ISD::ZERO_EXTEND
, DL
, MVT::i32
, ShiftAmt
);
3016 insertDAGNode(*CurDAG
, OrigShiftAmt
, ShiftAmt
);
3018 // And now 'or' these low 8 bits of shift amount into the 'control'.
3019 Control
= CurDAG
->getNode(ISD::OR
, DL
, MVT::i32
, Control
, ShiftAmt
);
3020 insertDAGNode(*CurDAG
, OrigNBits
, Control
);
3023 // But have to place the 'control' into the wide-enough register first.
3024 if (XVT
!= MVT::i32
) {
3026 SDValue(CurDAG
->getMachineNode(TargetOpcode::IMPLICIT_DEF
, DL
, XVT
), 0);
3027 insertDAGNode(*CurDAG
, OrigNBits
, ImplDef
);
3028 Control
= CurDAG
->getTargetInsertSubreg(X86::sub_32bit
, DL
, XVT
, ImplDef
,
3030 insertDAGNode(*CurDAG
, OrigNBits
, Control
);
3033 // And finally, form the BEXTR itself.
3034 SDValue Extract
= CurDAG
->getNode(X86ISD::BEXTR
, DL
, XVT
, X
, Control
);
3036 // The 'X' was originally truncated. Do that now.
3038 insertDAGNode(*CurDAG
, OrigNBits
, Extract
);
3039 Extract
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, NVT
, Extract
);
3042 ReplaceNode(Node
, Extract
.getNode());
3043 SelectCode(Extract
.getNode());
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  if (!Subtarget->hasTBM() &&
      !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = countPopulation(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;

  // BMI requires the immediate to be placed in a register.
  if (!Subtarget->hasTBM()) {
    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
  }

  return NewNode;
}
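// Worked example for the BEXTR control value above (illustrative):
// (x >> 4) & 0xfff gives Shift = 4 and MaskSize = 12, so the control is
// 4 | (12 << 8) = 0xc04, i.e. "start at bit 4 and extract 12 bits".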
// Emit a PCMPISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InFlag) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InFlag };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InFlag = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InFlag };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  InFlag = SDValue(CNode, 2);
  return CNode;
}
3197 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode
*N
) {
3198 EVT VT
= N
->getValueType(0);
3200 // Only handle scalar shifts.
3204 // Narrower shifts only mask to 5 bits in hardware.
3205 unsigned Size
= VT
== MVT::i64
? 64 : 32;
3207 SDValue OrigShiftAmt
= N
->getOperand(1);
3208 SDValue ShiftAmt
= OrigShiftAmt
;
3211 // Skip over a truncate of the shift amount.
3212 if (ShiftAmt
->getOpcode() == ISD::TRUNCATE
)
3213 ShiftAmt
= ShiftAmt
->getOperand(0);
3215 // This function is called after X86DAGToDAGISel::matchBitExtract(),
3216 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
3218 SDValue NewShiftAmt
;
3219 if (ShiftAmt
->getOpcode() == ISD::ADD
|| ShiftAmt
->getOpcode() == ISD::SUB
) {
3220 SDValue Add0
= ShiftAmt
->getOperand(0);
3221 SDValue Add1
= ShiftAmt
->getOperand(1);
    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB.
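    // Illustrative example: a 64-bit shift by (X + 64) is equivalent to a
    // shift by X, because the hardware masks the count to 6 bits; the ADD is
    // dropped and only the masked amount is kept.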
3224 if (isa
<ConstantSDNode
>(Add1
) &&
3225 cast
<ConstantSDNode
>(Add1
)->getZExtValue() % Size
== 0) {
3227 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3228 // generate a NEG instead of a SUB of a constant.
3229 } else if (ShiftAmt
->getOpcode() == ISD::SUB
&&
3230 isa
<ConstantSDNode
>(Add0
) &&
3231 cast
<ConstantSDNode
>(Add0
)->getZExtValue() != 0 &&
3232 cast
<ConstantSDNode
>(Add0
)->getZExtValue() % Size
== 0) {
3233 // Insert a negate op.
3234 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3235 // that uses it that's not a shift.
3236 EVT SubVT
= ShiftAmt
.getValueType();
3237 SDValue Zero
= CurDAG
->getConstant(0, DL
, SubVT
);
3238 SDValue Neg
= CurDAG
->getNode(ISD::SUB
, DL
, SubVT
, Zero
, Add1
);
3241 // Insert these operands into a valid topological order so they can
3242 // get selected independently.
3243 insertDAGNode(*CurDAG
, OrigShiftAmt
, Zero
);
3244 insertDAGNode(*CurDAG
, OrigShiftAmt
, Neg
);
3250 if (NewShiftAmt
.getValueType() != MVT::i8
) {
3251 // Need to truncate the shift amount.
3252 NewShiftAmt
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, MVT::i8
, NewShiftAmt
);
3253 // Add to a correct topological ordering.
3254 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
3257 // Insert a new mask to keep the shift amount legal. This should be removed
3258 // by isel patterns.
3259 NewShiftAmt
= CurDAG
->getNode(ISD::AND
, DL
, MVT::i8
, NewShiftAmt
,
3260 CurDAG
->getConstant(Size
- 1, DL
, MVT::i8
));
3261 // Place in a correct topological ordering.
3262 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
3264 SDNode
*UpdatedNode
= CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0),
3266 if (UpdatedNode
!= N
) {
3267 // If we found an existing node, we should replace ourselves with that node
3268 // and wait for it to be selected after its other users.
3269 ReplaceNode(N
, UpdatedNode
);
3273 // If the original shift amount is now dead, delete it so that we don't run
3275 if (OrigShiftAmt
.getNode()->use_empty())
3276 CurDAG
->RemoveDeadNode(OrigShiftAmt
.getNode());
3278 // Now that we've optimized the shift amount, defer to normal isel to get
3279 // load folding and legacy vs BMI2 selection without repeating it here.
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countLeadingZeros();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getMinSignedBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnesValue()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
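// Worked example (illustrative): for a 32-bit 'and' with mask 0x7ffffff0 whose
// other operand has its top bit known zero, the mask can be widened to
// 0xfffffff0 (-16), shrinking the encoding from a 4-byte to a 1-byte
// immediate.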
3354 void X86DAGToDAGISel::Select(SDNode
*Node
) {
3355 MVT NVT
= Node
->getSimpleValueType(0);
3356 unsigned Opcode
= Node
->getOpcode();
3359 if (Node
->isMachineOpcode()) {
3360 LLVM_DEBUG(dbgs() << "== "; Node
->dump(CurDAG
); dbgs() << '\n');
3361 Node
->setNodeId(-1);
3362 return; // Already selected.
3368 if (Subtarget
->isTargetNaCl())
3369 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
3370 // leave the instruction alone.
3372 if (Subtarget
->isTarget64BitILP32()) {
3373 // Converts a 32-bit register to a 64-bit, zero-extended version of
3374 // it. This is needed because x86-64 can do many things, but jmp %r32
3375 // ain't one of them.
3376 const SDValue
&Target
= Node
->getOperand(1);
3377 assert(Target
.getSimpleValueType() == llvm::MVT::i32
);
3378 SDValue ZextTarget
= CurDAG
->getZExtOrTrunc(Target
, dl
, EVT(MVT::i64
));
3379 SDValue Brind
= CurDAG
->getNode(ISD::BRIND
, dl
, MVT::Other
,
3380 Node
->getOperand(0), ZextTarget
);
3381 ReplaceNode(Node
, Brind
.getNode());
3382 SelectCode(ZextTarget
.getNode());
3383 SelectCode(Brind
.getNode());
3388 case X86ISD::GlobalBaseReg
:
3389 ReplaceNode(Node
, getGlobalBaseReg());
3393 // Just drop all 128/256/512-bit bitcasts.
3394 if (NVT
.is512BitVector() || NVT
.is256BitVector() || NVT
.is128BitVector() ||
3396 ReplaceUses(SDValue(Node
, 0), Node
->getOperand(0));
3397 CurDAG
->RemoveDeadNode(Node
);
3402 case ISD::VSELECT
: {
    // Replace VSELECT with non-mask conditions with BLENDV.
3404 if (Node
->getOperand(0).getValueType().getVectorElementType() == MVT::i1
)
3407 assert(Subtarget
->hasSSE41() && "Expected SSE4.1 support!");
3408 SDValue Blendv
= CurDAG
->getNode(
3409 X86ISD::BLENDV
, SDLoc(Node
), Node
->getValueType(0), Node
->getOperand(0),
3410 Node
->getOperand(1), Node
->getOperand(2));
3411 ReplaceNode(Node
, Blendv
.getNode());
3412 SelectCode(Blendv
.getNode());
3413 // We already called ReplaceUses.
  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case ISD::AND:
    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::OR:
  case ISD::XOR: {
    // For operations of the form (x << C1) op C2, check if we can use a smaller
    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
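    // For example, (x << 8) | 0x4400 becomes ((x | 0x44) << 8): the original
    // OR needs a 4-byte immediate, while 0x44 fits in a sign-extended 1-byte
    // immediate.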
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
      break;

    // i8 is unshrinkable, i16 should be promoted to i32.
    if (NVT != MVT::i32 && NVT != MVT::i64)
      break;

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (!Cst || !ShlCst)
      break;

    int64_t Val = Cst->getSExtValue();
    uint64_t ShlVal = ShlCst->getZExtValue();

    // Make sure that we don't change the operation by removing bits.
    // This only matters for OR and XOR, AND is unaffected.
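    // For example, (x << 8) | 0x44FF cannot be rewritten as ((x | C) << 8)
    // for any C, because the low byte 0xFF of the constant would be shifted
    // away.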
    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
      break;

    unsigned ShlOp, AddOp, Op;
    MVT CstVT = NVT;

    // Check the minimum bitwidth for the new constant.
    // TODO: AND32ri is the same as AND64ri32 with zext imm.
    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
      CstVT = MVT::i8;
    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
      CstVT = MVT::i32;

    // Bail if there is no smaller encoding.
    if (NVT == CstVT)
      break;

    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      assert(CstVT == MVT::i8);
      ShlOp = X86::SHL32ri;
      AddOp = X86::ADD32rr;

      switch (Opcode) {
      default: llvm_unreachable("Impossible opcode");
      case ISD::AND: Op = X86::AND32ri8; break;
      case ISD::OR:  Op =  X86::OR32ri8; break;
      case ISD::XOR: Op = X86::XOR32ri8; break;
      }
      break;
    case MVT::i64:
      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
      ShlOp = X86::SHL64ri;
      AddOp = X86::ADD64rr;

      switch (Opcode) {
      default: llvm_unreachable("Impossible opcode");
      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
      }
      break;
    }

    // Emit the smaller op and the shift.
    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT);
    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0), NewCst);
    if (ShlVal == 1)
      CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0),
                           SDValue(New, 0));
    else
      CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
                           getI8Imm(ShlVal, dl));
    return;
  }
  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
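      // (The extra NVT in the value lists below models that implicit
      // high-half result; X86ISD::UMUL itself only uses the low half and
      // EFLAGS.)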
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SMUL_LOHI;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
      }
    }

    unsigned SrcReg, LoReg, HiReg;
    switch (Opc) {
    default: llvm_unreachable("Unknown MUL opcode!");
    case X86::IMUL32r:
    case X86::MUL32r:
      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
      break;
    case X86::IMUL64r:
    case X86::MUL64r:
      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
                                          N0, SDValue()).getValue(1);
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
      Chain = SDValue(CNode, 0);
      InFlag = SDValue(CNode, 1);

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
      InFlag = SDValue(CNode, 0);
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      assert(LoReg && "Register for low half is not defined!");
      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                             NVT, InFlag);
      InFlag = ResLo.getValue(2);
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      assert(HiReg && "Register for high half is not defined!");
      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                             NVT, InFlag);
      InFlag = ResHi.getValue(2);
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = X86::CBW;
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InFlag;
    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
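      // For example, an unsigned i8 divide becomes "movzbl %src, %eax" (which
      // already zeroes AH) followed by "divb %divisor", instead of clearing
      // AH with a separate instruction.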
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
                                      MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
                                   SDValue());
      InFlag = Chain.getValue(1);
    } else {
      InFlag =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                             LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InFlag =
          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
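        // (For unsigned division, or when the sign bit is known zero, the
        // high register is cleared with a 32-bit zero rather than produced
        // by CBW/CWD/CDQ/CQO.)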
        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InFlag).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InFlag = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InFlag =
        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
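    // (AH, BH, CH and DH cannot be encoded in an instruction that carries a
    // REX prefix, which is why the value is moved out through a _NOREX
    // zero/sign extend below.)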
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C) break;
      uint64_t Mask = C->getZExtValue();

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
      // flag.
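      // For example, with Mask == 0xFFFFFFFF00000000 the 64-bit immediate
      // would have to be materialized in a register first; shifting the value
      // right by 32 and testing the result against itself sets ZF in exactly
      // the same cases.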
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.
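      // For example, shrinking "testl %eax, $0x80" to "testb %al, $0x80"
      // changes SF: the 32-bit test always computes SF=0, while the 8-bit
      // test copies bit 7 of %al into SF. Hence the extra conditions below.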
      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      // FIXME: We should be able to fold loads here.

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
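    // (If both the index and the mask are needed, the same load would have to
    // be folded into two instructions, duplicating the memory access, so it
    // is left unfolded in that case.)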
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  }

  SelectCode(Node);
}
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_i:
    // FIXME: It seems strange that 'i' is needed here since it's supposed to
    // be an immediate and not a memory constraint.
    LLVM_FALLTHROUGH;
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into a X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}