//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/SelectionDAGISel.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Support/MathExtras.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Target/TargetMachine.h"
35 #include "llvm/Target/TargetOptions.h"
39 #define DEBUG_TYPE "x86-isel"
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);
//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {
  enum {
    RegBase,
    FrameIndexBase
  } BaseType;

  // This is really a union, discriminated by BaseType!
  SDValue Base_Reg;
  int Base_FrameIndex;

  unsigned Scale;
  SDValue IndexReg;
  int32_t Disp;
  SDValue Segment;
  const GlobalValue *GV;
  const Constant *CP;
  const BlockAddress *BlockAddr;
  const char *ES;
  MCSymbol *MCSym;
  int JT;
  unsigned Align;             // CP alignment.
  unsigned char SymbolFlags;  // X86II::MO_*
  bool NegateIndex = false;

  X86ISelAddressMode()
      : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
        Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
        MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
  }

  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
  }

  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;
    return false;
  }

  void setBaseReg(SDValue Reg) {
    BaseType = RegBase;
    Base_Reg = Reg;
  }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(DAG);
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n';
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(DAG);
    dbgs() << " Disp " << Disp << '\n';
    dbgs() << " JT" << JT << " Align" << Align << '\n';
  }
#endif
};
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
class X86DAGToDAGISel final : public SelectionDAGISel {
  /// Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

  /// If true, selector should try to optimize for code size instead of
  /// performance.
  bool OptForSize;

  /// If true, selector should try to optimize for minimum code size.
  bool OptForMinSize;

  /// Disable direct TLS access through segment registers.
  bool IndirectTlsSegRefs;

public:
  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
        OptForMinSize(false), IndirectTlsSegRefs(false) {}
  StringRef getPassName() const override {
    return "X86 DAG->DAG Instruction Selection";
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    // Reset the subtarget each time through.
    Subtarget = &MF.getSubtarget<X86Subtarget>();
    IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

    // OptFor[Min]Size are used in pattern predicates that isel is matching.
    OptForSize = MF.getFunction().hasOptSize();
    OptForMinSize = MF.getFunction().hasMinSize();
    assert((!OptForMinSize || OptForSize) &&
           "OptForMinSize implies OptForSize");

    SelectionDAGISel::runOnMachineFunction(MF);
    return true;
  }

  void EmitFunctionEntryCode() override;

  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

  void PreprocessISelDAG() override;
  void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

private:
  void Select(SDNode *N) override;
  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                               unsigned Depth);
  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                  SDValue &Scale, SDValue &Index, SDValue &Disp,
                  SDValue &Segment);
  bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                        SDValue &Scale, SDValue &Index, SDValue &Disp,
                        SDValue &Segment);
  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
  bool selectLEAAddr(SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                         SDValue &Scale, SDValue &Index, SDValue &Disp,
                         SDValue &Segment);
  bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                           SDValue &Base, SDValue &Scale,
                           SDValue &Index, SDValue &Disp,
                           SDValue &Segment,
                           SDValue &NodeWithChain);
  bool selectRelocImm(SDValue N, SDValue &Op);

  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
                   SDValue &Segment);

  // Convenience method where P is also root.
  bool tryFoldLoad(SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
                   SDValue &Segment) {
    return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
  }

  bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                        SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp,
                        SDValue &Segment);
  /// Implement addressing mode selection for inline asm expressions.
  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                    unsigned ConstraintID,
                                    std::vector<SDValue> &OutOps) override;

  void emitSpecialCodeForMain();
  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                 MVT VT, SDValue &Base, SDValue &Scale,
                                 SDValue &Index, SDValue &Disp,
                                 SDValue &Segment) {
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      Base = CurDAG->getTargetFrameIndex(
          AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
    else if (AM.Base_Reg.getNode())
      Base = AM.Base_Reg;
    else
      Base = CurDAG->getRegister(0, VT);

    Scale = getI8Imm(AM.Scale, DL);

    // Negate the index if needed.
    if (AM.NegateIndex) {
      unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
      SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                   AM.IndexReg),
                            0);
      AM.IndexReg = Neg;
    }

    if (AM.IndexReg.getNode())
      Index = AM.IndexReg;
    else
      Index = CurDAG->getRegister(0, VT);

    // These are 32-bit even in 64-bit mode since RIP-relative offset
    // is 32-bit.
    if (AM.GV)
      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp,
                                            AM.SymbolFlags);
    else if (AM.CP)
      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                           AM.Align, AM.Disp, AM.SymbolFlags);
    else if (AM.ES) {
      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
    } else if (AM.MCSym) {
      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
      assert(AM.SymbolFlags == 0 && "oo");
      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
    } else if (AM.JT != -1) {
      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
    } else if (AM.BlockAddr)
      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                           AM.SymbolFlags);
    else
      Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

    if (AM.Segment.getNode())
      Segment = AM.Segment;
    else
      Segment = CurDAG->getRegister(0, MVT::i16);
  }
  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size or not.
  // At a high level, we'd like to avoid such instructions when
  // we have similar constants used within the same basic block
  // that can be kept in a register.
  //
  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
    uint32_t UseCount = 0;

    // Do not want to hoist if we're not optimizing for size.
    // TODO: We'd like to remove this restriction.
    // See the comment in X86InstrInfo.td for more info.
    if (!OptForSize)
      return false;

    // Walk all the users of the immediate.
    for (SDNode::use_iterator UI = N->use_begin(),
         UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
      SDNode *User = *UI;

      // This user is already selected. Count it as a legitimate use and
      // move on.
      if (User->isMachineOpcode()) {
        UseCount++;
        continue;
      }

      // We want to count stores of immediates as real uses.
      if (User->getOpcode() == ISD::STORE &&
          User->getOperand(1).getNode() == N) {
        UseCount++;
        continue;
      }

      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above).
      // Those instructions won't match in ISEL, for now, and would
      // be counted incorrectly.
      // This may change in the future as we add additional instruction
      // types.
      if (User->getNumOperands() != 2)
        continue;

      // If this can match to INC/DEC, don't count it as a use.
      if (User->getOpcode() == ISD::ADD &&
          (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
        continue;

      // Immediates that are used for offsets as part of stack
      // manipulation should be left alone. These are typically
      // used to indicate SP offsets for argument passing and
      // will get pulled into stores/pushes (implicitly).
      if (User->getOpcode() == X86ISD::ADD ||
          User->getOpcode() == ISD::ADD    ||
          User->getOpcode() == X86ISD::SUB ||
          User->getOpcode() == ISD::SUB) {

        // Find the other operand of the add/sub.
        SDValue OtherOp = User->getOperand(0);
        if (OtherOp.getNode() == N)
          OtherOp = User->getOperand(1);

        // Don't count if the other operand is SP.
        RegisterSDNode *RegNode;
        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
            (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
          if ((RegNode->getReg() == X86::ESP) ||
              (RegNode->getReg() == X86::RSP))
            continue;
      }

      // ... otherwise, count this and move on.
      UseCount++;
    }

    // If we have more than 1 use, then recommend for hoisting.
    return (UseCount > 1);
  }
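
  // Illustrative example for the heuristic above (not from the original
  // source): when optimizing for size, two uses of the same 32-bit immediate
  // such as
  //   addl $0x12345678, %ecx
  //   addl $0x12345678, %edx
  // are cheaper when rewritten as
  //   movl $0x12345678, %eax
  //   addl %eax, %ecx
  //   addl %eax, %edx
  // because the immediate is encoded only once, so this hook reports the
  // immediate as worth hoisting into a register.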
  /// Return a target constant with the specified value of type i8.
  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
  }

  /// Return a target constant with the specified value, of type i32.
  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
  }

  /// Return a target constant with the specified value, of type i64.
  inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
  }
  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(1);
    MVT VecVT = N->getOperand(0).getSimpleValueType();
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
  }

  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                    const SDLoc &DL) {
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
  }
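
  // Worked example of the immediate computation above (an illustration added
  // here, not part of the original file): extracting or inserting the 128-bit
  // half of a 256-bit v8i32 at element index 4 maps to subvector slot
  // (4 * 32) / 128.
  static_assert((4 * 32) / 128 == 1,
                "element 4 of v8i32 selects the upper 128-bit half");
  static_assert((0 * 32) / 128 == 0,
                "element 0 of v8i32 selects the lower 128-bit half");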
  // Helper to detect unneeded and instructions on shift amounts. Called
  // from PatFrags in tablegen.
  bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
    assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
    const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

    if (Val.countTrailingOnes() >= Width)
      return true;

    APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
    return Mask.countTrailingOnes() >= Width;
  }
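
  // Illustrative example (not from the original source): hardware shifts of
  // 32-bit values only consume the low 5 bits of the amount, so the PatFrags
  // pass Width == 5 for them. An (and X, 31) then has 5 trailing ones and the
  // mask is unneeded, while (and X, 15) only becomes removable if
  // computeKnownBits proves bit 4 of X is already zero.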
  /// Return an SDNode that returns the value of the global base register.
  /// Output instructions required to initialize the global base register,
  /// if necessary.
  SDNode *getGlobalBaseReg();

  /// Return a reference to the TargetMachine, casted to the target-specific
  /// type.
  const X86TargetMachine &getTargetMachine() const {
    return static_cast<const X86TargetMachine &>(TM);
  }

  /// Return a reference to the TargetInstrInfo, casted to the target-specific
  /// type.
  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();
  }

  /// Address-mode matching performs shift-of-and to and-of-shift
  /// reassociation in order to expose more scaled addressing
  /// opportunities.
  bool ComplexPatternFuncMutatesDAG() const override {
    return true;
  }

  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

  /// Returns whether this is a relocatable immediate in the range
  /// [-2^Width .. 2^Width-1].
  template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
    if (auto *CN = dyn_cast<ConstantSDNode>(N))
      return isInt<Width>(CN->getSExtValue());
    return isSExtAbsoluteSymbolRef(Width, N);
  }
  // Indicates we should prefer to use a non-temporal load for this load.
  bool useNonTemporalLoad(LoadSDNode *N) const {
    if (!N->isNonTemporal())
      return false;

    unsigned StoreSize = N->getMemoryVT().getStoreSize();

    if (N->getAlignment() < StoreSize)
      return false;

    switch (StoreSize) {
    default: llvm_unreachable("Unsupported store size");
    case 16:
      return Subtarget->hasSSE41();
    case 32:
      return Subtarget->hasAVX2();
    case 64:
      return Subtarget->hasAVX512();
    }
  }
  bool foldLoadStoreIntoMemOperand(SDNode *Node);
  MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
  bool matchBitExtract(SDNode *Node);
  bool shrinkAndImmediate(SDNode *N);
  bool isMaskZeroExtended(SDNode *N) const;
  bool tryShiftAmountMod(SDNode *N);
  bool combineIncDecVector(SDNode *Node);
  bool tryShrinkShlLogicImm(SDNode *N);
  bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
  bool tryMatchBitSelect(SDNode *N);

  MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node);
  MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node,
                              SDValue &InFlag);

  bool tryOptimizeRem8Extend(SDNode *N);

  bool onlyUsesZeroFlag(SDValue Flags) const;
  bool hasNoSignFlagUses(SDValue Flags) const;
  bool hasNoCarryFlagUses(SDValue Flags) const;
};
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // the correct size.
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}
// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this is really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // ADD/SUB can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      //   leal i@NTPOFF(%eax), %eax
      // instead of
      //   movl $i@NTPOFF, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}
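
// Illustrative example for the BTS/BTR/BTC check above (an addition for
// exposition, not from the original source): for
//   (or (load %p), (shl 1, %n))
// folding the load would commit the OR to an or-from-memory form, whereas
// keeping the load separate lets isel match the pattern to a bit-test-and-set
// of the loaded value instead.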
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));

    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}
void X86DAGToDAGISel::PreprocessISelDAG() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }

    switch (N->getOpcode()) {
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
      case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector() ||
          N->getOperand(0).getScalarValueSizeInBits() == 1)
        break;

      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                            ? ISD::ZERO_EXTEND
                            : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FTRUNC:
    case ISD::FNEARBYINT:
    case ISD::FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::FRINT:      Imm = 0x4; break;
      }
      SDLoc dl(N);
      SDValue Res = CurDAG->getNode(
          X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
          CurDAG->getTargetConstant(Imm, dl, MVT::i8));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useRetpolineIndirectCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load  = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND: {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
                                            N->getOperand(0), MemTmp,
                                            MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }

    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND: {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::STRICT_FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl,
                                            N->getOperand(1), MemTmp,
                                            MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }

    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }

  // The load+call transform above can leave some dead nodes in the graph. Make
  // sure we remove them. It's possible some of the other transforms do too, so
  // just remove dead nodes unconditionally.
  CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    // to 64.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}
1155 // Skip peepholes at -O0.
1156 if (TM
.getOptLevel() == CodeGenOpt::None
)
1159 SelectionDAG::allnodes_iterator Position
= CurDAG
->allnodes_end();
1161 bool MadeChange
= false;
1162 while (Position
!= CurDAG
->allnodes_begin()) {
1163 SDNode
*N
= &*--Position
;
1164 // Skip dead nodes and any non-machine opcodes.
1165 if (N
->use_empty() || !N
->isMachineOpcode())
1168 if (tryOptimizeRem8Extend(N
)) {
1173 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1174 // the same. Rewrite to remove the AND.
1175 unsigned Opc
= N
->getMachineOpcode();
1176 if ((Opc
== X86::TEST8rr
|| Opc
== X86::TEST16rr
||
1177 Opc
== X86::TEST32rr
|| Opc
== X86::TEST64rr
) &&
1178 N
->getOperand(0) == N
->getOperand(1) &&
1179 N
->isOnlyUserOf(N
->getOperand(0).getNode()) &&
1180 N
->getOperand(0).isMachineOpcode()) {
1181 SDValue And
= N
->getOperand(0);
1182 unsigned N0Opc
= And
.getMachineOpcode();
1183 if (N0Opc
== X86::AND8rr
|| N0Opc
== X86::AND16rr
||
1184 N0Opc
== X86::AND32rr
|| N0Opc
== X86::AND64rr
) {
1185 MachineSDNode
*Test
= CurDAG
->getMachineNode(Opc
, SDLoc(N
),
1189 ReplaceUses(N
, Test
);
1193 if (N0Opc
== X86::AND8rm
|| N0Opc
== X86::AND16rm
||
1194 N0Opc
== X86::AND32rm
|| N0Opc
== X86::AND64rm
) {
1197 case X86::AND8rm
: NewOpc
= X86::TEST8mr
; break;
1198 case X86::AND16rm
: NewOpc
= X86::TEST16mr
; break;
1199 case X86::AND32rm
: NewOpc
= X86::TEST32mr
; break;
1200 case X86::AND64rm
: NewOpc
= X86::TEST64mr
; break;
1203 // Need to swap the memory and register operand.
1204 SDValue Ops
[] = { And
.getOperand(1),
1210 And
.getOperand(6) /* Chain */ };
1211 MachineSDNode
*Test
= CurDAG
->getMachineNode(NewOpc
, SDLoc(N
),
1212 MVT::i32
, MVT::Other
, Ops
);
1213 ReplaceUses(N
, Test
);
1219 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1220 // used. We're doing this late so we can prefer to fold the AND into masked
1221 // comparisons. Doing that can be better for the live range of the mask
1223 if ((Opc
== X86::KORTESTBrr
|| Opc
== X86::KORTESTWrr
||
1224 Opc
== X86::KORTESTDrr
|| Opc
== X86::KORTESTQrr
) &&
1225 N
->getOperand(0) == N
->getOperand(1) &&
1226 N
->isOnlyUserOf(N
->getOperand(0).getNode()) &&
1227 N
->getOperand(0).isMachineOpcode() &&
1228 onlyUsesZeroFlag(SDValue(N
, 0))) {
1229 SDValue And
= N
->getOperand(0);
1230 unsigned N0Opc
= And
.getMachineOpcode();
1231 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1232 // KAND instructions and KTEST use the same ISA feature.
1233 if (N0Opc
== X86::KANDBrr
||
1234 (N0Opc
== X86::KANDWrr
&& Subtarget
->hasDQI()) ||
1235 N0Opc
== X86::KANDDrr
|| N0Opc
== X86::KANDQrr
) {
1238 default: llvm_unreachable("Unexpected opcode!");
1239 case X86::KORTESTBrr
: NewOpc
= X86::KTESTBrr
; break;
1240 case X86::KORTESTWrr
: NewOpc
= X86::KTESTWrr
; break;
1241 case X86::KORTESTDrr
: NewOpc
= X86::KTESTDrr
; break;
1242 case X86::KORTESTQrr
: NewOpc
= X86::KTESTQrr
; break;
1244 MachineSDNode
*KTest
= CurDAG
->getMachineNode(NewOpc
, SDLoc(N
),
1248 ReplaceUses(N
, KTest
);
1254 // Attempt to remove vectors moves that were inserted to zero upper bits.
1255 if (Opc
!= TargetOpcode::SUBREG_TO_REG
)
1258 unsigned SubRegIdx
= N
->getConstantOperandVal(2);
1259 if (SubRegIdx
!= X86::sub_xmm
&& SubRegIdx
!= X86::sub_ymm
)
1262 SDValue Move
= N
->getOperand(1);
1263 if (!Move
.isMachineOpcode())
1266 // Make sure its one of the move opcodes we recognize.
1267 switch (Move
.getMachineOpcode()) {
1270 case X86::VMOVAPDrr
: case X86::VMOVUPDrr
:
1271 case X86::VMOVAPSrr
: case X86::VMOVUPSrr
:
1272 case X86::VMOVDQArr
: case X86::VMOVDQUrr
:
1273 case X86::VMOVAPDYrr
: case X86::VMOVUPDYrr
:
1274 case X86::VMOVAPSYrr
: case X86::VMOVUPSYrr
:
1275 case X86::VMOVDQAYrr
: case X86::VMOVDQUYrr
:
1276 case X86::VMOVAPDZ128rr
: case X86::VMOVUPDZ128rr
:
1277 case X86::VMOVAPSZ128rr
: case X86::VMOVUPSZ128rr
:
1278 case X86::VMOVDQA32Z128rr
: case X86::VMOVDQU32Z128rr
:
1279 case X86::VMOVDQA64Z128rr
: case X86::VMOVDQU64Z128rr
:
1280 case X86::VMOVAPDZ256rr
: case X86::VMOVUPDZ256rr
:
1281 case X86::VMOVAPSZ256rr
: case X86::VMOVUPSZ256rr
:
1282 case X86::VMOVDQA32Z256rr
: case X86::VMOVDQU32Z256rr
:
1283 case X86::VMOVDQA64Z256rr
: case X86::VMOVDQU64Z256rr
:
1287 SDValue In
= Move
.getOperand(0);
1288 if (!In
.isMachineOpcode() ||
1289 In
.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END
)
1292 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1293 // the SHA instructions which use a legacy encoding.
1294 uint64_t TSFlags
= getInstrInfo()->get(In
.getMachineOpcode()).TSFlags
;
1295 if ((TSFlags
& X86II::EncodingMask
) != X86II::VEX
&&
1296 (TSFlags
& X86II::EncodingMask
) != X86II::EVEX
&&
1297 (TSFlags
& X86II::EncodingMask
) != X86II::XOP
)
1300 // Producing instruction is another vector instruction. We can drop the
1302 CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0), In
, N
->getOperand(2));
1307 CurDAG
->RemoveDeadNodes();
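
// Illustrative example of the peepholes above (not from the original source):
// a selected sequence such as
//   %a = AND32rr %x, %y
//   TEST32rr %a, %a
// where %a has no other users collapses to
//   TEST32rr %x, %y
// and, analogously, a KORTEST of a KAND becomes a single KTEST when only the
// zero flag is consumed.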
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
                   std::move(Args));
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);
  }
}
void X86DAGToDAGISel::EmitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();
}

static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
}
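
// Illustrative bound check (added for exposition, not part of the original
// file; it assumes llvm::isInt is usable in a constant expression here):
static_assert(isInt<31>(0x3FFFFFFF) && !isInt<31>(0x40000000),
              "a 31-bit signed displacement tops out at 2^30 - 1");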
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.
  if (Offset == 0)
    return false;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (AM.ES || AM.MCSym)
    return true;

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
      case 256:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case 257:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
      }

  return true;
}
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Align = CP->getAlignment();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  switch (TM.getCodeModel()) {
  default: break;
  case CodeModel::Small:
  case CodeModel::Kernel:
    if (Subtarget->is64Bit() &&
        AM.Scale == 1 &&
        AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr &&
        AM.SymbolFlags == X86II::MO_NO_FLAG &&
        AM.hasSymbolicDisplacement())
      AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
    break;
  }

  return false;
}
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    return false;
  }
  N = Handle.getValue();
  return true;
}
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// function is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());
  AM.IndexReg = And;
  AM.Scale = (1 << ScaleLog);
  return false;
}
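
// Worked instance of the transform above (an illustration, not from the
// original source): with C1 == 2 the incoming pattern is
//   (and (srl X, 6), 0x3FC)            ; 0x3FC == 0xff << 2
// and it is rewritten to
//   (shl (and (srl X, 8), 0xff), 2)
// so the (srl X, 8)/0xff pair can become an h-register extract and the shl
// folds into the addressing mode as Scale == 4.
static_assert((0xffu << 2) == 0x3FCu, "mask shape expected by the transform");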
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  SDValue X = Shift.getOperand(0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
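
// Worked instance of the transform above (an illustration, not from the
// original source): with C1 == 2 and C2 == 0x3FC,
//   (and (shl X, 2), 0x3FC)
// becomes
//   (shl (and X, 0xff), 2)
// so the new AND is the index register and the shift is absorbed as
// Scale == 4.
static_assert((0x3FC >> 2) == 0xFF, "new mask after shifting C2 right by C1");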
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   ...
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   ...
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  unsigned MaskLZ = countLeadingZeros(Mask);
  unsigned MaskTZ = countTrailingZeros(Mask);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = MaskTZ;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  // We also need to ensure that mask is a continuous run of bits.
  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  KnownBits Known = DAG.computeKnownBits(X);
  if (MaskedHighBits != Known.Zero) return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewSRL;
  return false;
}
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
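//
// For example (an illustrative sketch): with SHIFT = 2 and MASK << C1 =
// 0xff << 2 = 0x3fc, the node (and (srl x, 2), 0x3fc) is rebuilt as
//   (shl (and (srl x, 4), 0xff), 2)
// The inner (and (srl x, 4), 0xff) can later be matched as a BEXTR, and the
// shl is absorbed into the addressing mode as Scale = 4.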
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  if (!isShiftedMask_64(Mask)) return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the trailing zeros of the mask.
  unsigned AMShiftAmt = countTrailingZeros(Mask);

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth > 5)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it. Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements. It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        AM.Scale = 1 << Val;
        SDValue ShVal = N.getOperand(0);

        // Okay, we know that we have a scale by now. However, if the scaled
        // value is an add of something and a constant, we can fold the
        // constant into the disp field here.
        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
          AM.IndexReg = ShVal.getOperand(0);
          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
          if (!foldOffsetIntoAddress(Disp, AM))
            return false;
        }

        AM.IndexReg = ShVal;
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it down.
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    LLVM_FALLTHROUGH;
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
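    //
    // For example (illustrative): a use of x*5 in an address can be matched
    // with Base_Reg = IndexReg = x and Scale = 4, i.e. lea (%rdi,%rdi,4).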
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            ConstantSDNode *AddVal =
                cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
          } else {
            Reg = N.getOperand(0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if A has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.
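    //
    // For example (illustrative): for "GV + x - y", the LHS "GV + x" can be
    // folded into the displacement and base, and y is then used as the index
    // with NegateIndex set, so the NEG of y is only emitted when the address
    // operands are finally materialized.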
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::OR:
    // We want to look through a transform in InstCombine and DAGCombiner that
    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
    // An 'lea' can then be used to match the shift (multiply) and add:
    //   lea (%rsi, %rdi, 8), %rax
    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
        !matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(N.getOperand(1)))
      break;

    if (N.getOperand(0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(0);
      SDValue X = Shift.getOperand(0);

      uint64_t Mask = N.getConstantOperandVal(1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
      return false;

    break;
  }

  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;
    if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
      break;

    // Give up if the shift is not a valid scale factor [1,2,3].
    SDValue Shl = N.getOperand(0);
    auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
    if (!ShAmtC || ShAmtC->getZExtValue() > 3)
      break;

    // The narrow shift must only shift out zero bits (it must be 'nuw').
    // That makes it safe to widen to the destination type.
    APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
                                            ShAmtC->getZExtValue());
    if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
      break;

    // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
    MVT VT = N.getSimpleValueType();
    SDLoc DL(N);
    SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
    SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));

    // Convert the shift to scale factor.
    AM.Scale = 1 << ShAmtC->getZExtValue();
    AM.IndexReg = Zext;

    insertDAGNode(*CurDAG, N, Zext);
    insertDAGNode(*CurDAG, N, NewShl);
    CurDAG->ReplaceAllUsesWith(N, NewShl);
    CurDAG->RemoveDeadNode(N.getNode());
    return false;
  }
  }

  return matchAddressBase(N, AM);
}
/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
    // If so, check to see if the scale index register is set.
    if (!AM.IndexReg.getNode()) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base_Reg = N;
  return false;
}

/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  }

  return matchAddressBase(N, AM);
}
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                       SDValue &Scale, SDValue &Index,
                                       SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
  AM.IndexReg = Mgs->getIndex();
  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();

  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
  if (AddrSpace == 256)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == 257)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == 258)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}

/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
///
/// Parent is the parent node of the addr operand that is being matched. It
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                                 SDValue &Scale, SDValue &Index,
                                 SDValue &Disp, SDValue &Segment) {
  X86ISelAddressMode AM;
  if (Parent &&
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
      // that are not a MemSDNode, and thus don't have proper addrspace info.
      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
      Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
    unsigned AddrSpace =
        cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
    if (AddrSpace == 256)
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
    if (AddrSpace == 257)
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
    if (AddrSpace == 258)
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
  }

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  if (matchAddress(N, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
  while (User != Root) {
    if (!User->hasOneUse())
      return false;
    User = *User->use_begin();
  }

  return true;
}

/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
///
/// We also return:
///   PatternChainNode: this is the matched node that has a chain input and
///   output.
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
                                          SDValue N, SDValue &Base,
                                          SDValue &Scale, SDValue &Index,
                                          SDValue &Disp, SDValue &Segment,
                                          SDValue &PatternNodeWithChain) {
  if (!hasSingleUsesFromRoot(Root, Parent))
    return false;

  // We can allow a full vector load here since narrowing a load is ok unless
  // it's volatile or atomic.
  if (ISD::isNON_EXTLoad(N.getNode())) {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    if (LD->isSimple() &&
        IsProfitableToFold(N, LD, Root) &&
        IsLegalToFold(N, Parent, Root, OptLevel)) {
      PatternNodeWithChain = N;
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // We can also match the special zero extended load opcode.
  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
    PatternNodeWithChain = N;
    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
      auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
      return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
  // once. Otherwise the load might get duplicated and the chain output of the
  // duplicate load will not be observed by all dependencies.
  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
    PatternNodeWithChain = N.getOperand(0);
    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
                        Segment);
    }
  }

  return false;
}
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    uint64_t ImmVal = CN->getZExtValue();
    if (!isUInt<32>(ImmVal))
      return false;

    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
    return true;
  }

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'.
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  Imm = N;
  if (N->getOpcode() != ISD::TargetGlobalAddress)
    return TM.getCodeModel() == CodeModel::Small;

  Optional<ConstantRange> CR =
      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
  if (!CR)
    return TM.getCodeModel() == CodeModel::Small;

  return CR->getUnsignedMax().ult(1ull << 32);
}
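// For example (an illustrative case): a GlobalValue declared with
// !absolute_symbol metadata covering [0, 0x100000000) has an absolute symbol
// range whose unsigned maximum fits in 32 bits, so its address can be
// materialized with a 32-bit 'movl' even outside the small code model.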
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
  if (RN && RN->getReg() == 0)
    Base = CurDAG->getRegister(0, MVT::i64);
  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
                                         Base);
  }

  RN = dyn_cast<RegisterSDNode>(Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(0, MVT::i64);
  else {
    assert(Index.getValueType() == MVT::i32 &&
           "Expect to be extending 32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
                                          Index);
  }

  return true;
}
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert(T == AM.Segment);
  AM.Segment = Copy;

  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: This could be an 'or' rather than 'and' to make the transform more
    //       likely to happen. We might want to factor in whether there's a
    //       load folding opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
      Complexity++;
  }

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
/// This is only run on TargetGlobalTLSAddress nodes.
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                        SDValue &Scale, SDValue &Index,
                                        SDValue &Disp, SDValue &Segment) {
  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);

  X86ISelAddressMode AM;
  AM.GV = GA->getGlobal();
  AM.Disp += GA->getOffset();
  AM.SymbolFlags = GA->getTargetFlags();

  MVT VT = N.getSimpleValueType();
  if (VT == MVT::i32) {
    AM.Scale = 1;
    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
  }

  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
  return true;
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
                                   N.getValueType());
    return true;
  }

  // Keep track of the original value type and whether this value was
  // truncated. If we see a truncation from pointer type to VT that truncates
  // bits that are known to be zero, we can use a narrow reference.
  EVT VT = N.getValueType();
  bool WasTruncated = false;
  if (N.getOpcode() == ISD::TRUNCATE) {
    WasTruncated = true;
    N = N.getOperand(0);
  }

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  // We can only use non-GlobalValues as immediates if they were not truncated,
  // as we do not have any range information. If we have a GlobalValue and the
  // address was not truncated, we can select it as an operand directly.
  unsigned Opc = N.getOperand(0)->getOpcode();
  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
    Op = N.getOperand(0);
    // We can only select the operand directly if we didn't have to look past a
    // truncate.
    return !WasTruncated;
  }

  // Check that the global's range fits into VT.
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
    return false;

  // Okay, we can use a narrow reference.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}

bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  auto &DL = MF->getDataLayout();
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}

bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  return CR && CR->getSignedMin().sge(-1ull << Width) &&
         CR->getSignedMax().slt(1ull << Width);
}
static X86::CondCode getCondFromNode(SDNode *N) {
  assert(N->isMachineOpcode() && "Unexpected node");
  X86::CondCode CC = X86::COND_INVALID;
  unsigned Opc = N->getMachineOpcode();
  if (Opc == X86::JCC_1)
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
  else if (Opc == X86::SETCCr)
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
  else if (Opc == X86::SETCCm)
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
           Opc == X86::CMOV64rr)
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
           Opc == X86::CMOV64rm)
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));

  return CC;
}
/// Test whether the given X86ISD::CMP node has any users that use a flag
/// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (UI->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(*FlagUI);

      switch (CC) {
      // Comparisons which only use the zero flag.
      case X86::COND_E: case X86::COND_NE:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}

/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (UI->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(*FlagUI);

      switch (CC) {
      // Comparisons which don't examine the SF flag.
      case X86::COND_A: case X86::COND_AE:
      case X86::COND_B: case X86::COND_BE:
      case X86::COND_E: case X86::COND_NE:
      case X86::COND_O: case X86::COND_NO:
      case X86::COND_P: case X86::COND_NP:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}

static bool mayUseCarryFlag(X86::CondCode CC) {
  switch (CC) {
  // Comparisons which don't examine the CF flag.
  case X86::COND_O: case X86::COND_NO:
  case X86::COND_E: case X86::COND_NE:
  case X86::COND_S: case X86::COND_NS:
  case X86::COND_P: case X86::COND_NP:
  case X86::COND_L: case X86::COND_GE:
  case X86::COND_G: case X86::COND_LE:
    return false;
  // Anything else: assume conservatively.
  default:
    return true;
  }
}
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;

    unsigned UIOpc = UI->getOpcode();

    if (UIOpc == ISD::CopyToReg) {
      // Only examine CopyToReg uses that copy to EFLAGS.
      if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
        return false;
      // Examine each user of the CopyToReg use.
      for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
           FlagUI != FlagUE; ++FlagUI) {
        // Only examine the Flag result.
        if (FlagUI.getUse().getResNo() != 1)
          continue;
        // Anything unusual: assume conservatively.
        if (!FlagUI->isMachineOpcode())
          return false;
        // Examine the condition code of the user.
        X86::CondCode CC = getCondFromNode(*FlagUI);

        if (mayUseCarryFlag(CC))
          return false;
      }

      // This CopyToReg is ok. Move on to the next user.
      continue;
    }

    // This might be an unselected node. So look for the pre-isel opcodes that
    // use flags.
    unsigned CCOpNo;
    switch (UIOpc) {
    default:
      // Something unusual. Be conservative.
      return false;
    case X86ISD::SETCC:       CCOpNo = 0; break;
    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
    case X86ISD::CMOV:        CCOpNo = 2; break;
    case X86ISD::BRCOND:      CCOpNo = 2; break;
    }

    X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
    if (mayUseCarryFlag(CC))
      return false;
  }
  return true;
}
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //    *-lines = Chain operand dependencies.
  //    |-lines = Normal operand dependencies.
  //    Dependencies flow down and right. n-suffix references multiple nodes.
  //
  //  (ASCII diagram of the chain graphs before and after the fusion elided;
  //   the merged node is shown as A--LD_OP_ST.)
  //
  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //
  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.
  //
  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.
  //
  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).
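  //
  // For example (illustrative): if OP's other operand (a member of Yn) is
  // itself a value whose chain passes through LD, then LD is a predecessor of
  // that Yn member and check (a) rejects the fusion, since merging would make
  // the new load-op-store node depend on itself through that chain.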
  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(1)) {
    FoundLoad = true;
    ChainOps.push_back(Load.getOperand(0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Load.getOperand(0));
        continue;
      }
      LoopWorklist.push_back(Op.getNode());
      ChainOps.push_back(Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn.
  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
                                   true))
    return false;

  InputChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  return true;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen memory operand pattern is currently not able to match the case
// where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//     [(store (add (loadi64 addr:$dst), -1), addr:$dst),
//      (implicit EFLAGS)]>;
// but maybe need something like this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//     [(store (add (loadi64 addr:$dst), -1), addr:$dst),
//      (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
  SDValue StoredVal = StoreNode->getOperand(1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    IsNegate = isNullConstant(StoredVal.getOperand(0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };

  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                      MVT::Other, Ops);
      break;
    }
    LLVM_FALLTHROUGH;
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || OptForSize) {
      bool IsOne = isOneConstant(StoredVal.getOperand(1));
      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
        unsigned NewOpc =
            ((Opc == X86ISD::ADD) == IsOne)
                ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
                : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                        MVT::Other, Ops);
        break;
      }
    }
    LLVM_FALLTHROUGH;
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };

    unsigned NewOpc = SelectRegOpcode(Opc);
    SDValue Operand = StoredVal->getOperand(1-LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation from ADD to SUB or vice versa.
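      //
      // For example (an illustrative case): an "add $128" does not fit an
      // 8-bit immediate, but negating the constant and flipping the opcode
      // gives "sub $-128", which does, so the smaller mi8 encoding can be
      // used (provided the carry flag of the original node is unused).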
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
            isInt<32>(-OperandV))) &&
          hasNoCarryFlagUses(StoredVal.getValue(1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      // First try to fit this into an Imm8 operand. If it doesn't fit, then try
      // the larger immediate operand.
      if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImm8Opcode(Opc);
      } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      SDValue CopyTo =
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
                               StoredVal.getOperand(2), SDValue());

      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }

  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(Result, MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
  CurDAG->RemoveDeadNode(Node);

  return true;
}
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
//   a) x &  (1 << nbits) - 1
//   b) x & ~(-1 << nbits)
//   c) x &  (-1 >> (32 - y))
//   d) x << (32 - y) >> (32 - y)
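//
// For example (an illustrative instance of pattern a): for nbits = 5 the mask
// (1 << 5) - 1 = 0x1f keeps the low five bits of x, so the 'and' can be
// selected as BZHI with the bit count in a register or, with a shift of 0 in
// its control word, as BEXTR.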
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  SDValue NBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
  const bool CanHaveExtraUses = Subtarget->hasBMI2();
  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
    return CanHaveExtraUses ||
           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  };
  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };

  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
      assert(V.getSimpleValueType() == MVT::i32 &&
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
             "Expected i64 -> i32 truncation");
      V = V.getOperand(0);
    }
    return V;
  };

  // a) x & ((1 << nbits) + (-1))
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
                        &NBits](SDValue Mask) -> bool {
    // Match `add`. Must only have one use!
    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
      return false;
    // We should be adding all-ones constant (i.e. subtracting one.)
    if (!isAllOnesConstant(Mask->getOperand(1)))
      return false;
    // Match `1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    if (!isOneConstant(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    return true;
  };

  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
    V = peekThroughOneUseTruncation(V);
    return CurDAG->MaskedValueIsAllOnes(
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
                                NVT.getSizeInBits()));
  };

  // b) x & ~(-1 << nbits)
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
                        &NBits](SDValue Mask) -> bool {
    // Match `~()`. Must only have one use!
    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(Mask->getOperand(1)))
      return false;
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    return true;
  };

  // Match potentially-truncated (bitwidth - y)
  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
                                             unsigned Bitwidth) {
    // Skip over a truncate of the shift amount.
    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
      ShiftAmt = ShiftAmt.getOperand(0);
      // The trunc should have been the only user of the real shift amount.
      if (!checkOneUse(ShiftAmt))
        return false;
    }
    // Match the shift amount as: (bitwidth - y). It should go away, too.
    if (ShiftAmt.getOpcode() != ISD::SUB)
      return false;
    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
    if (!V0 || V0->getZExtValue() != Bitwidth)
      return false;
    NBits = ShiftAmt.getOperand(1);
    return true;
  };

  // c) x & (-1 >> (32 - y))
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
                        matchShiftAmt](SDValue Mask) -> bool {
    // The mask itself may be truncated.
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    return matchShiftAmt(M1, Bitwidth);
  };

  SDValue X;

  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    // There should not be any uses of the shift amount outside of the pattern.
    if (N1 != N01 || !checkTwoUse(N1))
      return false;
    if (!matchShiftAmt(N1, Bitwidth))
      return false;
    X = N0->getOperand(0);
    return true;
  };

  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (!matchPatternD(Node))
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(
      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
                             NBits, SRIdxVal), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc inbetween),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();

  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11

  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits to be zero.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated. Do that now.
  if (XVT != NVT) {
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  }

  ReplaceNode(Node, Extract.getNode());
  SelectCode(Extract.getNode());

  return true;
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = countPopulation(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
3519 Control
= CurDAG
->getTargetConstant(Shift
+ MaskSize
, dl
, NVT
);
3520 ROpc
= NVT
== MVT::i64
? X86::BZHI64rr
: X86::BZHI32rr
;
3521 MOpc
= NVT
== MVT::i64
? X86::BZHI64rm
: X86::BZHI32rm
;
3522 unsigned NewOpc
= NVT
== MVT::i64
? X86::MOV32ri64
: X86::MOV32ri
;
3523 Control
= SDValue(CurDAG
->getMachineNode(NewOpc
, dl
, NVT
, Control
), 0);
3525 // The 'control' of BEXTR has the pattern of:
3526 // [15...8 bit][ 7...0 bit] location
3527 // [ bit count][ shift] name
3528 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3529 Control
= CurDAG
->getTargetConstant(Shift
| (MaskSize
<< 8), dl
, NVT
);
3530 if (Subtarget
->hasTBM()) {
3531 ROpc
= NVT
== MVT::i64
? X86::BEXTRI64ri
: X86::BEXTRI32ri
;
3532 MOpc
= NVT
== MVT::i64
? X86::BEXTRI64mi
: X86::BEXTRI32mi
;
3534 assert(Subtarget
->hasBMI() && "We must have BMI1's BEXTR then.");
3535 // BMI requires the immediate to placed in a register.
3536 ROpc
= NVT
== MVT::i64
? X86::BEXTR64rr
: X86::BEXTR32rr
;
3537 MOpc
= NVT
== MVT::i64
? X86::BEXTR64rm
: X86::BEXTR32rm
;
3538 unsigned NewOpc
= NVT
== MVT::i64
? X86::MOV32ri64
: X86::MOV32ri
;
3539 Control
= SDValue(CurDAG
->getMachineNode(NewOpc
, dl
, NVT
, Control
), 0);
3543 MachineSDNode
*NewNode
;
3544 SDValue Input
= N0
->getOperand(0);
3545 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
3546 if (tryFoldLoad(Node
, N0
.getNode(), Input
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
)) {
3548 Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, Control
, Input
.getOperand(0)};
3549 SDVTList VTs
= CurDAG
->getVTList(NVT
, MVT::i32
, MVT::Other
);
3550 NewNode
= CurDAG
->getMachineNode(MOpc
, dl
, VTs
, Ops
);
3551 // Update the chain.
3552 ReplaceUses(Input
.getValue(1), SDValue(NewNode
, 2));
3553 // Record the mem-refs
3554 CurDAG
->setNodeMemRefs(NewNode
, {cast
<LoadSDNode
>(Input
)->getMemOperand()});
3556 NewNode
= CurDAG
->getMachineNode(ROpc
, dl
, NVT
, MVT::i32
, Input
, Control
);
3560 // We still need to apply the shift.
3561 SDValue ShAmt
= CurDAG
->getTargetConstant(Shift
, dl
, NVT
);
3562 unsigned NewOpc
= NVT
== MVT::i64
? X86::SHR64ri
: X86::SHR32ri
;
3564 CurDAG
->getMachineNode(NewOpc
, dl
, NVT
, SDValue(NewNode
, 0), ShAmt
);
// Emit a PCMPISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}

// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
// to emit a second instruction after this one. This is needed since we have two
// copyToReg nodes glued before this and we need to continue that glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InFlag) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InFlag };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InFlag = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InFlag };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  InFlag = SDValue(CNode, 2);
  return CNode;
}

bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(0);
  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
    SDValue Add0 = ShiftAmt->getOperand(0);
    SDValue Add1 = ShiftAmt->getOperand(1);
    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB.
    if (isa<ConstantSDNode>(Add1) &&
        cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
      NewShiftAmt = Add0;
      // If we are shifting by N-X where N == 0 mod Size, then just shift by -X
      // to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB &&
               isa<ConstantSDNode>(Add0) &&
               cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
               cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      EVT SubVT = ShiftAmt.getValueType();
      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
                                                   NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(N, UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
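// Worked example (illustrative): for a 64-bit "x << (64 - n)" the hardware
// masks the count to 6 bits, so (64 - n) & 63 == (-n) & 63 and the SUB from
// the constant can be replaced by a NEG of n; likewise "x >> (n + 32)" on
// i32 can drop the ADD entirely because (n + 32) & 31 == n & 31.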
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Do this check as late as possible to avoid the MaskedValueIsZero call
  // unless it is really needed.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
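// Worked example (illustrative): on i64, (x << 8) & 0xFF00000000 needs a
// 64-bit immediate, but rewriting it as ((x & 0xFF000000) << 8) lets the AND
// use a 32-bit immediate encoding; the final SHL is unchanged.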
/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
  assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  EVT VT = Node->getValueType(0);
  assert(VT.isVector() && "Should only be called for vectors.");

  SDValue X = Node->getOperand(0);
  SDValue OneVec = Node->getOperand(1);

  APInt SplatVal;
  if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
    return false;

  SDLoc DL(Node);
  SDValue OneConstant, AllOnesVec;

  APInt Ones = APInt::getAllOnesValue(32);
  assert(VT.getSizeInBits() % 32 == 0 &&
         "Expected bit count to be a multiple of 32");
  OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
  insertDAGNode(*CurDAG, X, OneConstant);

  unsigned NumElts = VT.getSizeInBits() / 32;
  assert(NumElts > 0 && "Expected to get non-empty vector.");
  AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
                                           DL, OneConstant);
  insertDAGNode(*CurDAG, X, AllOnesVec);

  AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
  insertDAGNode(*CurDAG, X, AllOnesVec);

  unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);

  ReplaceNode(Node, NewNode.getNode());
  SelectCode(NewNode.getNode());
  return true;
}
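// Worked example (illustrative): "add <4 x i32> X, <1, 1, 1, 1>" becomes
// "sub X, <-1, -1, -1, -1>"; the all-ones operand can be materialized with a
// PCMPEQD-style idiom instead of loading a splat-1 constant from memory.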
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countLeadingZeros();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getMinSignedBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnesValue()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
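// Worked example (illustrative): if the top 4 bits of the i32 operand are
// known zero, "and eax, 0x0FFFFFF0" can instead use the mask 0xFFFFFFF0,
// i.e. -16, which fits a sign-extended imm8 and saves three immediate bytes.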
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
  if (Masked) {
    if (FoldedLoad) {
      switch (TestVT.SimpleTy) {
      default: llvm_unreachable("Unexpected VT!");
      case MVT::v16i8:
        return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
      case MVT::v8i16:
        return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
      case MVT::v4i32:
        return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
      case MVT::v2i64:
        return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
      case MVT::v32i8:
        return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
      case MVT::v16i16:
        return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
      case MVT::v8i32:
        return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
      case MVT::v4i64:
        return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
      case MVT::v64i8:
        return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
      case MVT::v32i16:
        return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
      case MVT::v16i32:
        return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
      case MVT::v8i64:
        return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
      }
    }

    if (FoldedBCast) {
      switch (TestVT.SimpleTy) {
      default: llvm_unreachable("Unexpected VT!");
      case MVT::v4i32:
        return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
      case MVT::v2i64:
        return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
      case MVT::v8i32:
        return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
      case MVT::v4i64:
        return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
      case MVT::v16i32:
        return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
      case MVT::v8i64:
        return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
      }
    }

    switch (TestVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::v16i8:
      return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
    case MVT::v8i16:
      return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
    case MVT::v4i32:
      return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
    case MVT::v2i64:
      return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
    case MVT::v32i8:
      return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
    case MVT::v16i16:
      return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
    case MVT::v8i32:
      return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
    case MVT::v4i64:
      return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
    case MVT::v64i8:
      return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
    case MVT::v32i16:
      return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
    case MVT::v16i32:
      return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
    case MVT::v8i64:
      return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
    }
  }

  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::v16i8:
      return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
    case MVT::v8i16:
      return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
    case MVT::v4i32:
      return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
    case MVT::v2i64:
      return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
    case MVT::v32i8:
      return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
    case MVT::v16i16:
      return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
    case MVT::v8i32:
      return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
    case MVT::v4i64:
      return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
    case MVT::v64i8:
      return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
    case MVT::v32i16:
      return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
    case MVT::v16i32:
      return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
    case MVT::v8i64:
      return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
    }
  }

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::v4i32:
      return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
    case MVT::v2i64:
      return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
    case MVT::v8i32:
      return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
    case MVT::v4i64:
      return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
    case MVT::v16i32:
      return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
    case MVT::v8i64:
      return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
    }
  }

  switch (TestVT.SimpleTy) {
  default: llvm_unreachable("Unexpected VT!");
  case MVT::v16i8:
    return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
  case MVT::v8i16:
    return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
  case MVT::v4i32:
    return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
  case MVT::v2i64:
    return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
  case MVT::v32i8:
    return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
  case MVT::v16i16:
    return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
  case MVT::v8i32:
    return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
  case MVT::v4i64:
    return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
  case MVT::v64i8:
    return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
  case MVT::v32i16:
    return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
  case MVT::v16i32:
    return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
  case MVT::v8i64:
    return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
  }
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(0);
  SDValue SetccOp1 = Setcc.getOperand(1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
    std::swap(SetccOp0, SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(0);

    // Look for single use AND.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(0);
      Src1 = N0Temp.getOperand(1);
    }
  }

  // Without VLX we need to widen the load.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  // Try to fold loads unless we need to widen.
  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
  if (!Widen && CanFoldLoads) {
    Load = Src1;
    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
                             Tmp4);
    if (!FoldedLoad) {
      // AND is commutative.
      Load = Src0;
      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
                               Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(Src0, Src1);
    }
  }

  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
    // Look through single use bitcasts.
    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
      Parent = Src.getNode();
      Src = Src.getOperand(0);
    }

    if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
      if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
        return Src;
    }

    return SDValue();
  };

  // If we didn't fold a load, try to match broadcast. No widening limitation
  // for this. But only 32 and 64 bit types are supported.
  bool FoldedBCast = false;
  if (!FoldedLoad && CanFoldLoads &&
      (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
    SDNode *ParentNode = N0.getNode();
    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
      FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
                                     Tmp1, Tmp2, Tmp3, Tmp4);
    }

    // Try the other operand.
    if (!FoldedBCast) {
      SDNode *ParentNode = N0.getNode();
      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
        FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
                                       Tmp1, Tmp2, Tmp3, Tmp4);
        if (FoldedBCast)
          std::swap(Src0, Src1);
      }
    }
  }

  auto getMaskRC = [](MVT MaskVT) {
    switch (MaskVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::v2i1:  return X86::VK2RegClassID;
    case MVT::v4i1:  return X86::VK4RegClassID;
    case MVT::v8i1:  return X86::VK8RegClassID;
    case MVT::v16i1: return X86::VK16RegClassID;
    case MVT::v32i1: return X86::VK32RegClassID;
    case MVT::v64i1: return X86::VK64RegClassID;
    }
  };

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
                                                     CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);

    assert(!FoldedLoad && "Shouldn't have folded the load");
    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = getMaskRC(MaskVT);
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              dl, MaskVT, InMask, RC), 0);
    }
  }

  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad || FoldedBCast) {
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Load.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Load.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
    else
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = getMaskRC(ResVT);
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                   dl, ResVT, SDValue(CNode, 0), RC);
  }

  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");

  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512())
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND ||
      N1.getOpcode() != X86ISD::ANDNP ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return false;

  // ANDN is not commutable, use it to pick down A and C.
  SDValue A = N1.getOperand(0);
  SDValue C = N1.getOperand(1);

  // AND is commutable, if one operand matches A, the other operand is B.
  // Otherwise this isn't a match.
  SDValue B;
  if (N0.getOperand(0) == A)
    B = N0.getOperand(1);
  else if (N0.getOperand(1) == A)
    B = N0.getOperand(0);
  else
    return false;

  SDLoc dl(N);
  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  ReplaceNode(N, Ternlog.getNode());
  SelectCode(Ternlog.getNode());
  return true;
}
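// Worked example (illustrative): with operands ordered (A, B, C), immediate
// 0xCA encodes the truth table of (A & B) | (~A & C) -- for each index
// (A<<2)|(B<<1)|C, bit idx of 0xCA is 1 exactly when "A ? B : C" is 1.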
void X86DAGToDAGISel::Select(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  unsigned Opcode = Node->getOpcode();
  SDLoc dl(Node);

  if (Node->isMachineOpcode()) {
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
    Node->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opcode) {
  default: break;
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse3_monitor:
    case Intrinsic::x86_monitorx:
    case Intrinsic::x86_clzero: {
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

      unsigned Opc = 0;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_sse3_monitor:
        if (!Subtarget->hasSSE3())
          break;
        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
        break;
      case Intrinsic::x86_monitorx:
        if (!Subtarget->hasMWAITX())
          break;
        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
        break;
      case Intrinsic::x86_clzero:
        if (!Subtarget->hasCLZERO())
          break;
        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
        break;
      }

      if (Opc) {
        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
                                             Node->getOperand(2), SDValue());
        SDValue InFlag = Chain.getValue(1);

        if (IntNo == Intrinsic::x86_sse3_monitor ||
            IntNo == Intrinsic::x86_monitorx) {
          // Copy the other two operands to ECX and EDX.
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
                                       InFlag);
          InFlag = Chain.getValue(1);
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
                                       InFlag);
          InFlag = Chain.getValue(1);
        }

        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
                                                      { Chain, InFlag});
        ReplaceNode(Node, CNode);
        return;
      }

      break;
    }
    }

    break;
  }
  case ISD::BRIND: {
    if (Subtarget->isTargetNaCl())
      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
      // leave the instruction alone.
      break;
    if (Subtarget->isTarget64BitILP32()) {
      // Converts a 32-bit register to a 64-bit, zero-extended version of
      // it. This is needed because x86-64 can do many things, but jmp %r32
      // ain't one of them.
      const SDValue &Target = Node->getOperand(1);
      assert(Target.getSimpleValueType() == llvm::MVT::i32);
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
      SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
                                      Node->getOperand(0), ZextTarget);
      ReplaceNode(Node, Brind.getNode());
      SelectCode(ZextTarget.getNode());
      SelectCode(Brind.getNode());
      return;
    }
    break;
  }
  case X86ISD::GlobalBaseReg:
    ReplaceNode(Node, getGlobalBaseReg());
    return;

  case ISD::BITCAST:
    // Just drop all 128/256/512-bit bitcasts.
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::VSELECT: {
    // Replace VSELECT with non-mask conditions with BLENDV.
    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
      break;

    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
    SDValue Blendv = CurDAG->getNode(
        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
        Node->getOperand(1), Node->getOperand(2));
    ReplaceNode(Node, Blendv.getNode());
    SelectCode(Blendv.getNode());
    // We already called ReplaceUses.
    return;
  }

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;

    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::ADD:
  case ISD::SUB: {
    if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
        combineIncDecVector(Node))
      return;

    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the add/sub/and/or/xor with immediate patterns in the tablegen files to
    // check immediate use count without making the patterns unavailable to the
    // fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
      case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
      case ISD::OR:  ROpc = X86::OR8rr;  MOpc = X86::OR8rm;  break;
      case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
      case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
      case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
      case ISD::OR:  ROpc = X86::OR32rr;  MOpc = X86::OR32rm;  break;
      case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
      case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
      case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
      case ISD::OR:  ROpc = X86::OR64rr;  MOpc = X86::OR64rm;  break;
      case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
      }
      break;
    }

    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.

    // If this is not a subtract, we can still try to fold a load.
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }

  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SMUL_LOHI;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
      }
    }

    unsigned SrcReg, LoReg, HiReg;
    switch (Opc) {
    default: llvm_unreachable("Unknown MUL opcode!");
    case X86::IMUL32r:
    case X86::MUL32r:
      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
      break;
    case X86::IMUL64r:
    case X86::MUL64r:
      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
                                          N0, SDValue()).getValue(1);
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
      Chain = SDValue(CNode, 0);
      InFlag = SDValue(CNode, 1);

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InFlag };
      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
      InFlag = SDValue(CNode, 0);
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      assert(LoReg && "Register for low half is not defined!");
      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                             NVT, InFlag);
      InFlag = ResLo.getValue(2);
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      assert(HiReg && "Register for high half is not defined!");
      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                             NVT, InFlag);
      InFlag = ResHi.getValue(2);
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }
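  // For reference (illustrative): a 64-bit UMUL_LOHI lowers to "mulq r",
  // which leaves the low half of the 128-bit product in RAX and the high
  // half in RDX; the CopyFromReg nodes above only materialize the halves
  // that are actually used.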
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InFlag;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InFlag = Chain.getValue(1);
    } else {
      InFlag =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                             LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InFlag =
          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InFlag).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InFlag = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InFlag =
        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }
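  // For reference (illustrative): an i8 udivrem first widens the dividend
  // into AX, then "div r8" leaves the quotient in AL and the remainder in
  // AH; the AH-to-GR8 copy above uses the _NOREX forms so no REX-encoded
  // instruction ever reads AH.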
  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C)
        break;
      uint64_t Mask = C->getZExtValue();

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

  case ISD::FCEIL:
  case ISD::FFLOOR:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FRINT: {
    // Replace fp rounding nodes with their X86-specific equivalent so we
    // don't need 2 sets of patterns.
    // FIXME: This can only happen when the nodes started as STRICT_* and have
    // been mutated into their non-STRICT equivalents. Eventually this
    // mutation will be removed and we should switch the STRICT_ nodes to a
    // strict version of RNDSCALE in PreProcessISelDAG.
    unsigned Imm;
    switch (Node->getOpcode()) {
    default: llvm_unreachable("Unexpected opcode!");
    case ISD::FCEIL:      Imm = 0xA; break;
    case ISD::FFLOOR:     Imm = 0x9; break;
    case ISD::FTRUNC:     Imm = 0xB; break;
    case ISD::FNEARBYINT: Imm = 0xC; break;
    case ISD::FRINT:      Imm = 0x4; break;
    }
    SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
                                  Node->getOperand(0),
                                  CurDAG->getTargetConstant(Imm, dl, MVT::i8));
    ReplaceNode(Node, Res.getNode());
    SelectCode(Res.getNode());
    return;
  }
  }

  SelectCode(Node);
}
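// For reference (illustrative): the rounding immediates used above combine a
// rounding-mode field with the precision-exception-suppress bit, e.g.
// 0xB = truncate toward zero with the inexact exception suppressed, while
// 0x4 = use the current MXCSR rounding mode (so FRINT may raise inexact).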
bool X86DAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
                             std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::Constraint_i:
    // FIXME: It seems strange that 'i' is needed here since it's supposed to
    //        be an immediate and not a memory constraint.
    LLVM_FALLTHROUGH;
  case InlineAsm::Constraint_o: // offsetable        ??
  case InlineAsm::Constraint_v: // not offsetable    ??
  case InlineAsm::Constraint_m: // memory
  case InlineAsm::Constraint_X:
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}