//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//
#include "X86ISelDAGToDAG.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "x86-isel"
#define PASS_NAME "X86 DAG->DAG Instruction Selection"
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),

static cl::opt<bool> EnablePromoteAnyextLoad(
    "x86-promote-anyext-load", cl::init(true),
    cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);

extern cl::opt<bool> IndirectBranchTracking;
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

/// This corresponds to X86AddressMode, but uses SDValue's instead of register
/// numbers for the leaves of the matched tree.
struct X86ISelAddressMode {

  // This is really a union, discriminated by BaseType!
  int Base_FrameIndex = 0;

  const GlobalValue *GV = nullptr;
  const Constant *CP = nullptr;
  const BlockAddress *BlockAddr = nullptr;
  const char *ES = nullptr;
  MCSymbol *MCSym = nullptr;
  Align Alignment;            // CP alignment.
  unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
  bool NegateIndex = false;

  X86ISelAddressMode() = default;
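  // Taken together, the fields of this struct describe one x86 memory
  // reference of the general form
  //   [Base_Reg/Base_FrameIndex + Scale * IndexReg + Disp] (+ optional Segment)
  // and X86DAGToDAGISel's match*/select* routines fill them in incrementally
  // while walking the address computation in the DAG.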
  bool hasSymbolicDisplacement() const {
    return GV != nullptr || CP != nullptr || ES != nullptr ||
           MCSym != nullptr || JT != -1 || BlockAddr != nullptr;

  bool hasBaseOrIndexReg() const {
    return BaseType == FrameIndexBase ||
           IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;

  /// Return true if this addressing mode is already RIP-relative.
  bool isRIPRelative() const {
    if (BaseType != RegBase) return false;
    if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
      return RegNode->getReg() == X86::RIP;

  void setBaseReg(SDValue Reg) {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump(SelectionDAG *DAG = nullptr) {
    dbgs() << "X86ISelAddressMode " << this << '\n';
    dbgs() << "Base_Reg ";
    if (Base_Reg.getNode())
      Base_Reg.getNode()->dump(DAG);
    if (BaseType == FrameIndexBase)
      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
    dbgs() << " Scale " << Scale << '\n'
    if (IndexReg.getNode())
      IndexReg.getNode()->dump(DAG);
    dbgs() << " Disp " << Disp << '\n'
    dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
class X86DAGToDAGISel final : public SelectionDAGISel {
  /// Keep a pointer to the X86Subtarget around so that we can
  /// make the right decision when generating code for different targets.
  const X86Subtarget *Subtarget;

  /// If true, selector should try to optimize for minimum code size.

  /// Disable direct TLS access through segment registers.
  bool IndirectTlsSegRefs;

  X86DAGToDAGISel() = delete;

  explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
      : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
        OptForMinSize(false), IndirectTlsSegRefs(false) {}
  bool runOnMachineFunction(MachineFunction &MF) override {
    // Reset the subtarget each time through.
    Subtarget = &MF.getSubtarget<X86Subtarget>();
    IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

    // OptFor[Min]Size are used in pattern predicates that isel is matching.
    OptForMinSize = MF.getFunction().hasMinSize();
    assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
           "OptForMinSize implies OptForSize");
    return SelectionDAGISel::runOnMachineFunction(MF);

  void emitFunctionEntryCode() override;

  bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

  void PreprocessISelDAG() override;
  void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
  void Select(SDNode *N) override;

  bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
  bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                          bool AllowSegmentRegForX32 = false);
  bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
  bool matchAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
  bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
  SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
  bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
  bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
  bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
  bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                  SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
                        SDValue ScaleOp, SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp, SDValue &Segment);
  bool selectMOV64Imm32(SDValue N, SDValue &Imm);
  bool selectLEAAddr(SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                         SDValue &Scale, SDValue &Index, SDValue &Disp,
  bool selectRelocImm(SDValue N, SDValue &Op);

  bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,

  // Convenience method where P is also root.
  bool tryFoldLoad(SDNode *P, SDValue N,
                   SDValue &Base, SDValue &Scale,
                   SDValue &Index, SDValue &Disp,
    return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);

  bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                        SDValue &Base, SDValue &Scale,
                        SDValue &Index, SDValue &Disp,

  bool isProfitableToFormMaskedOp(SDNode *N) const;
  /// Implement addressing mode selection for inline asm expressions.
  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                    InlineAsm::ConstraintCode ConstraintID,
                                    std::vector<SDValue> &OutOps) override;

  void emitSpecialCodeForMain();
  inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                 MVT VT, SDValue &Base, SDValue &Scale,
                                 SDValue &Index, SDValue &Disp,
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      Base = CurDAG->getTargetFrameIndex(
          AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
    else if (AM.Base_Reg.getNode())
      Base = CurDAG->getRegister(0, VT);

    Scale = getI8Imm(AM.Scale, DL);

#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
    // Negate the index if needed.
    if (AM.NegateIndex) {
      unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
                                       : GET_ND_IF_ENABLED(X86::NEG32r);
      SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,

    if (AM.IndexReg.getNode())
      Index = CurDAG->getRegister(0, VT);

    // These are 32-bit even in 64-bit mode since RIP-relative offset
      Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
      Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
                                           AM.Disp, AM.SymbolFlags);
      assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
      Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
    } else if (AM.MCSym) {
      assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
      assert(AM.SymbolFlags == 0 && "oo");
      Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
    } else if (AM.JT != -1) {
      assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
      Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
    } else if (AM.BlockAddr)
      Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
      Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);

    if (AM.Segment.getNode())
      Segment = AM.Segment;
      Segment = CurDAG->getRegister(0, MVT::i16);
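    // At this point Base, Scale, Index, Disp and Segment together form the
    // standard five-operand x86 memory reference that the selected machine
    // instruction will consume.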
  // Utility function to determine whether it is AMX SDNode right after
  // lowering but before ISEL.
  bool isAMXSDNode(SDNode *N) const {
    // Check if N is AMX SDNode:
    // 1. check specific opcode since these carry MVT::Untyped instead of
    // 2. check result type;
    // 3. check operand type;
    switch (N->getOpcode()) {
    case X86::PT2RPNTLVWZ0V:
    case X86::PT2RPNTLVWZ0T1V:
    case X86::PT2RPNTLVWZ1V:
    case X86::PT2RPNTLVWZ1T1V:
    case X86::PT2RPNTLVWZ0RSV:
    case X86::PT2RPNTLVWZ0RST1V:
    case X86::PT2RPNTLVWZ1RSV:
    case X86::PT2RPNTLVWZ1RST1V:
    for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
      if (N->getValueType(Idx) == MVT::x86amx)
    for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
      SDValue Op = N->getOperand(Idx);
      if (Op.getValueType() == MVT::x86amx)
  // Utility function to determine whether we should avoid selecting
  // immediate forms of instructions for better code size or not.
  // At a high level, we'd like to avoid such instructions when
  // we have similar constants used within the same basic block
  // that can be kept in a register.
  bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
    uint32_t UseCount = 0;

    // Do not want to hoist if we're not optimizing for size.
    // TODO: We'd like to remove this restriction.
    // See the comment in X86InstrInfo.td for more info.
    if (!CurDAG->shouldOptForSize())

    // Walk all the users of the immediate.
    for (const SDNode *User : N->users()) {
      // This user is already selected. Count it as a legitimate use and
      if (User->isMachineOpcode()) {

      // We want to count stores of immediates as real uses.
      if (User->getOpcode() == ISD::STORE &&
          User->getOperand(1).getNode() == N) {

      // We don't currently match users that have > 2 operands (except
      // for stores, which are handled above)
      // Those instruction won't match in ISEL, for now, and would
      // be counted incorrectly.
      // This may change in the future as we add additional instruction
      if (User->getNumOperands() != 2)

      // If this is a sign-extended 8-bit integer immediate used in an ALU
      // instruction, there is probably an opcode encoding to save space.
      auto *C = dyn_cast<ConstantSDNode>(N);
      if (C && isInt<8>(C->getSExtValue()))

      // Immediates that are used for offsets as part of stack
      // manipulation should be left alone. These are typically
      // used to indicate SP offsets for argument passing and
      // will get pulled into stores/pushes (implicitly).
      if (User->getOpcode() == X86ISD::ADD ||
          User->getOpcode() == ISD::ADD ||
          User->getOpcode() == X86ISD::SUB ||
          User->getOpcode() == ISD::SUB) {

        // Find the other operand of the add/sub.
        SDValue OtherOp = User->getOperand(0);
        if (OtherOp.getNode() == N)
          OtherOp = User->getOperand(1);

        // Don't count if the other operand is SP.
        RegisterSDNode *RegNode;
        if (OtherOp->getOpcode() == ISD::CopyFromReg &&
            (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
          if ((RegNode->getReg() == X86::ESP) ||
              (RegNode->getReg() == X86::RSP))

      // ... otherwise, count this and move on.

    // If we have more than 1 use, then recommend for hoisting.
    return (UseCount > 1);
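  // For example, at -Oz a 32-bit constant stored to several different
  // addresses in one block counts one use per store here, so isel prefers to
  // materialize it in a register once rather than re-encode a 4-byte
  // immediate in every store.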
  /// Return a target constant with the specified value of type i8.
  inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i8);

  /// Return a target constant with the specified value, of type i32.
  inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i32);

  /// Return a target constant with the specified value, of type i64.
  inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
    return CurDAG->getTargetConstant(Imm, DL, MVT::i64);

  SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(1);
    MVT VecVT = N->getOperand(0).getSimpleValueType();
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
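  // E.g. extracting the subvector that starts at element 4 of a v8i64 source
  // with VecWidth == 128 yields (4 * 64) / 128 == 2, i.e. the third 128-bit
  // lane of the source.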
  SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
    assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);

  SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
    assert(VecWidth == 128 && "Unexpected vector width");
    uint64_t Index = N->getConstantOperandVal(2);
    MVT VecVT = N->getSimpleValueType(0);
    uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
    assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
    // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
    // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
    return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
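  // This immediate is for the commuted VPERM2X128 form of a 128-bit insert:
  // each nibble of the immediate picks which 128-bit lane of the two sources
  // feeds the corresponding lane of the result, per the mapping above.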
  SDValue getSBBZero(SDNode *N) {
    MVT VT = N->getSimpleValueType(0);

    SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
    if (VT == MVT::i64) {
          CurDAG->getMachineNode(
              TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
              CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
              CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),

    // Copy flags to the EFLAGS register and glue it to next node.
    unsigned Opcode = N->getOpcode();
    assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
           "Unexpected opcode for SBB materialization");
    unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                             N->getOperand(FlagOpIndex), SDValue());

    // Create a 64-bit instruction if the result is 64-bits otherwise use the
    unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
    MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
    VTs = CurDAG->getVTList(SBBVT, MVT::i32);
        CurDAG->getMachineNode(Opc, dl, VTs,
                               {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),

  // Helper to detect unneeded and instructions on shift amounts. Called
  // from PatFrags in tablegen.
  bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
    assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
    const APInt &Val = N->getConstantOperandAPInt(1);

    if (Val.countr_one() >= Width)

    APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
    return Mask.countr_one() >= Width;
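  // E.g. for a 32-bit shift Width is 5, so (and %amt, 31) is redundant: the
  // hardware already uses only the low 5 bits of the shift amount, and the
  // AND can be dropped when matching the shift.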
  /// Return an SDNode that returns the value of the global base register.
  /// Output instructions required to initialize the global base register,
  SDNode *getGlobalBaseReg();

  /// Return a reference to the TargetMachine, casted to the target-specific
  const X86TargetMachine &getTargetMachine() const {
    return static_cast<const X86TargetMachine &>(TM);

  /// Return a reference to the TargetInstrInfo, casted to the target-specific
  const X86InstrInfo *getInstrInfo() const {
    return Subtarget->getInstrInfo();

  /// Return a condition code of the given SDNode
  X86::CondCode getCondFromNode(SDNode *N) const;

  /// Address-mode matching performs shift-of-and to and-of-shift
  /// reassociation in order to expose more scaled addressing
  bool ComplexPatternFuncMutatesDAG() const override {

  bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

  // Indicates we should prefer to use a non-temporal load for this load.
  bool useNonTemporalLoad(LoadSDNode *N) const {
    if (!N->isNonTemporal())

    unsigned StoreSize = N->getMemoryVT().getStoreSize();

    if (N->getAlign().value() < StoreSize)

    default: llvm_unreachable("Unsupported store size");
      return Subtarget->hasSSE41();
      return Subtarget->hasAVX2();
      return Subtarget->hasAVX512();

  bool foldLoadStoreIntoMemOperand(SDNode *Node);
  MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
  bool matchBitExtract(SDNode *Node);
  bool shrinkAndImmediate(SDNode *N);
  bool isMaskZeroExtended(SDNode *N) const;
  bool tryShiftAmountMod(SDNode *N);
  bool tryShrinkShlLogicImm(SDNode *N);
  bool tryVPTERNLOG(SDNode *N);
  bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
                      SDNode *ParentC, SDValue A, SDValue B, SDValue C,
  bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
  bool tryMatchBitSelect(SDNode *N);

  MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node);
  MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                              const SDLoc &dl, MVT VT, SDNode *Node,

  bool tryOptimizeRem8Extend(SDNode *N);

  bool onlyUsesZeroFlag(SDValue Flags) const;
  bool hasNoSignFlagUses(SDValue Flags) const;
  bool hasNoCarryFlagUses(SDValue Flags) const;
class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
  explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
                                 CodeGenOptLevel OptLevel)
      : SelectionDAGISelLegacy(
            ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}

char X86DAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
// Returns true if this masked compare can be implemented legally with this
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
      Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    EVT OpVT = N->getOperand(0).getValueType();
    // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
    if (Opcode == X86ISD::STRICT_CMPM)
      OpVT = N->getOperand(1).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
// Returns true if we can assume the writer of the mask has zero extended it
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOptLevel::None)

  if (N.getOpcode() != ISD::LOAD)

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))

  // If N is a load, do additional profitability checks.
    switch (U->getOpcode()) {
    case ISD::UADDO_CARRY:
      SDValue Op1 = U->getOperand(1);
      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      //   movl 4(%esp), %eax
      //   addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))

        // If this really a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))

        // ADD/SUB with can negate the immediate and use the opposite operation
        // to fit 128 into a sign extended 8 bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))

        if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8) &&
            hasNoCarryFlagUses(SDValue(U, 1)))

      // If the other operand is a TLS address, we should fold it instead.
      //   leal i@NTPOFF(%eax), %eax
      //   movl $i@NTPOFF, %eax
      // if the block also has an access to a second TLS address this will save
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))

      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)

      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))

  // Prevent folding a load if this can implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
// Indicates it is profitable to form an AVX512 masked operation. Returning
// false will favor a masked register-register masked move or vblendm and the
// operation will be selected separately.
bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
      "Unexpected opcode!");

  // If the operation has additional users, the operation will be duplicated.
  // Check the use count to prevent that.
  // FIXME: Are there cheap opcodes we might want to duplicate?
  return N->getOperand(1).hasOneUse();
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
        Ops.push_back(Chain.getOperand(i));
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.push_back(NewChain);
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
  auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
    Chain = Chain.getOperand(0);

  if (!Chain.getNumOperands())
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
  if (Chain.getOperand(0).getNode() == Callee.getNode())
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
static bool isEndbrImm64(uint64_t Imm) {
  // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
  // e.g.: 0xF3660F1EFA, 0xF3670F1EFA
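  // I.e. the low 24 bits must spell the 0F 1E FA tail of ENDBR64, and any
  // higher bytes below the leading 0xF3 may only be legacy prefixes from the
  // OptionalPrefixBytes list below.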
  if ((Imm & 0x00FFFFFF) != 0x0F1EFA)

  uint8_t OptionalPrefixBytes[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
                                   0x65, 0x66, 0x67, 0xf0, 0xf2};
  int i = 24; // 24bit 0x0F1EFA has matched
    uint8_t Byte = (Imm >> i) & 0xFF;
    if (!llvm::is_contained(OptionalPrefixBytes, Byte))
static bool needBWI(MVT VT) {
  return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
void X86DAGToDAGISel::PreprocessISelDAG() {
  bool MadeChange = false;
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
                                       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
    // This is for CET enhancement.
    // ENDBR32 and ENDBR64 have specific opcodes:
    // ENDBR32: F3 0F 1E FB
    // ENDBR64: F3 0F 1E FA
    // And we don't want attackers to find unintended ENDBR32/64
    // opcode matches in the binary.
    // Here's an example:
    // If the compiler had to generate asm for the following code:
    // it could, for example, generate:
    //   mov 0xF30F1EFA, dword ptr[a]
    // In such a case, the binary would include a gadget that starts
    // with a fake ENDBR64 opcode. Therefore, we split such generation
    // into multiple operations, so that it does not show up in the binary.
    if (N->getOpcode() == ISD::Constant) {
      MVT VT = N->getSimpleValueType(0);
      int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
      int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
      if (Imm == EndbrImm || isEndbrImm64(Imm)) {
        // Check that the cf-protection-branch is enabled.
        Metadata *CFProtectionBranch =
            MF->getFunction().getParent()->getModuleFlag(
                "cf-protection-branch");
        if (CFProtectionBranch || IndirectBranchTracking) {
          SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
          Complement = CurDAG->getNOT(dl, Complement, VT);
          CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
    // Convert vector increment or decrement to sub/add with an all-ones
    // add X, <1, 1...> --> sub X, <-1, -1...>
    // sub X, <1, 1...> --> add X, <-1, -1...>
    // The all-ones vector constant can be materialized using a pcmpeq
    // instruction that is commonly recognized as an idiom (has no register
    // dependency), so that's better/smaller than loading a splat 1 constant.
    // But don't do this if it would inhibit a potentially profitable load
    // folding opportunity for the other operand. That only occurs with the
    // (1) The other operand (op0) is load foldable.
    // (2) The op is an add (otherwise, we are *creating* an add and can still
    //     load fold the other op).
    // (3) The target has AVX (otherwise, we have a destructive add and can't
    //     load fold the other op without killing the constant op).
    // (4) The constant 1 vector has multiple uses (so it is profitable to load
    //     into a register anyway).
    auto mayPreventLoadFold = [&]() {
      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
             !N->getOperand(1).hasOneUse();
    if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
      if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
        MVT VT = N->getSimpleValueType(0);
        unsigned NumElts = VT.getSizeInBits() / 32;
            CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
        AllOnes = CurDAG->getBitcast(VT, AllOnes);

        unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
            CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
    switch (N->getOpcode()) {
    case X86ISD::VBROADCAST: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        SDValue NarrowBCast =
            CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));
        CurDAG->ReplaceAllUsesWith(N, Res.getNode());
    case X86ISD::VBROADCAST_LOAD: {
      MVT VT = N->getSimpleValueType(0);
      // Emulate v32i16/v64i8 broadcast without BWI.
      if (!Subtarget->hasBWI() && needBWI(VT)) {
        MVT NarrowVT = VT.getHalfNumVectorElementsVT();
        auto *MemNode = cast<MemSDNode>(N);
        SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
        SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
        SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
            X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
            MemNode->getMemOperand());
            CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
                            NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
        unsigned Index = NarrowVT.getVectorMinNumElements();
        Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
                              CurDAG->getIntPtrConstant(Index, dl));

        SDValue To[] = {Res, NarrowBCast.getValue(1)};
        CurDAG->ReplaceAllUsesWith(N, To);
      // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
      // load, then just extract the lower subvector and avoid the second load.
      auto *Ld = cast<LoadSDNode>(N);
      MVT VT = N->getSimpleValueType(0);
      if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
          !(VT.is128BitVector() || VT.is256BitVector()))

      SDNode *MaxLd = nullptr;
      SDValue Ptr = Ld->getBasePtr();
      SDValue Chain = Ld->getChain();
      for (SDNode *User : Ptr->users()) {
        auto *UserLd = dyn_cast<LoadSDNode>(User);
        MVT UserVT = User->getSimpleValueType(0);
        if (User != N && UserLd && ISD::isNormalLoad(User) &&
            UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
            !User->hasAnyUseOfValue(1) &&
            (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
            UserVT.getSizeInBits() > VT.getSizeInBits() &&
            (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
        unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
        MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
        SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
                                          CurDAG->getIntPtrConstant(0, dl));
        SDValue Res = CurDAG->getBitcast(VT, Extract);
        SDValue To[] = {Res, SDValue(MaxLd, 1)};
        CurDAG->ReplaceAllUsesWith(N, To);
    case ISD::VSELECT: {
      // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
      EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
      if (EleVT == MVT::i1)

      assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
      assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
             "We can't replace VSELECT with BLENDV in vXi16!");
      if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
                                     EleVT.getSizeInBits()) {
        R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1), N->getOperand(2),
                            CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
        R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), N->getOperand(1),

      CurDAG->ReplaceAllUsesWith(N, R.getNode());
    case ISD::STRICT_FP_ROUND:
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::STRICT_FP_TO_SINT:
    case ISD::STRICT_FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())

      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_ROUND:          NewOpc = X86ISD::VFPROUND;        break;
      case ISD::STRICT_FP_ROUND:   NewOpc = X86ISD::STRICT_VFPROUND; break;
      case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
      case ISD::FP_TO_SINT:        NewOpc = X86ISD::CVTTP2SI;        break;
      case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
      case ISD::FP_TO_UINT:        NewOpc = X86ISD::CVTTP2UI;        break;
      if (N->isStrictFPOpcode())
          CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
                          {N->getOperand(0), N->getOperand(1)});
          CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())

      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector())

      if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
        assert(N->getOpcode() == ISD::ANY_EXTEND &&
               "Unexpected opcode for mask vector!");
        NewOpc = ISD::SIGN_EXTEND;
        NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                     : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
    case ISD::STRICT_FCEIL:
    case ISD::STRICT_FFLOOR:
    case ISD::STRICT_FTRUNC:
    case ISD::FROUNDEVEN:
    case ISD::STRICT_FROUNDEVEN:
    case ISD::FNEARBYINT:
    case ISD::STRICT_FNEARBYINT:
    case ISD::STRICT_FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::STRICT_FCEIL:
      case ISD::FCEIL:      Imm = 0xA; break;
      case ISD::STRICT_FFLOOR:
      case ISD::FFLOOR:     Imm = 0x9; break;
      case ISD::STRICT_FTRUNC:
      case ISD::FTRUNC:     Imm = 0xB; break;
      case ISD::STRICT_FROUNDEVEN:
      case ISD::FROUNDEVEN: Imm = 0x8; break;
      case ISD::STRICT_FNEARBYINT:
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::STRICT_FRINT:
      case ISD::FRINT:      Imm = 0x4; break;
      bool IsStrict = N->isStrictFPOpcode();
        Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
                              {N->getValueType(0), MVT::Other},
                              {N->getOperand(0), N->getOperand(1),
                               CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
        Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
                              CurDAG->getTargetConstant(Imm, dl, MVT::i32));
      CurDAG->ReplaceAllUsesWith(N, Res.getNode());
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine.
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)

      MVT VecVT = VT == MVT::f64   ? MVT::v2f64
                  : VT == MVT::f32 ? MVT::v4f32
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND:  Opc = ISD::AND;      break;
        case X86ISD::FOR:   Opc = ISD::OR;       break;
        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
    if (OptLevel != CodeGenOptLevel::None &&
        // Only do this when the target can fold the load into the call or
        !Subtarget->useIndirectThunkCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///     [CALLSEQ_START]          |
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    case ISD::FP_EXTEND:
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(
          CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
                                          MemTmp, MPI, MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
    // The sequence of events for lowering STRICT_FP versions of these nodes requires
    // dealing with the chain differently, as there is already a preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND:
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
      MachinePointerInfo MPI =
          MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store, Result;
        SDVTList VTs = CurDAG->getVTList(MVT::Other);
        SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
        Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
                                            MPI, /*Align*/ std::nullopt,
                                            MachineMemOperand::MOStore);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Store->getFlags();
          Flags.setNoFPExcept(true);
          Store->setFlags(Flags);
        assert(SrcVT == MemVT && "Unexpected VT!");
        Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,

        SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
        SDValue Ops[] = {Store, MemTmp};
        Result = CurDAG->getMemIntrinsicNode(
            X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
            /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
        if (N->getFlags().hasNoFPExcept()) {
          SDNodeFlags Flags = Result->getFlags();
          Flags.setNoFPExcept(true);
          Result->setFlags(Flags);
        assert(DstVT == MemVT && "Unexpected VT!");
        Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.

  // Remove any dead nodes that may have been left behind.
  CurDAG->RemoveDeadNodes();
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
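// The 8-bit division result is extracted from AH/AL with a *_NOREX extend, so
// a second sign/zero extend of that value is usually redundant and can reuse
// the first extend's result instead.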
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower bit of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
    ReplaceUses(N, Extend);
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOptLevel::None)

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())

    if (tryOptimizeRem8Extend(N)) {

    unsigned Opc = N->getMachineOpcode();
    // ANDrr/rm + TESTrr+ -> TESTrr/TESTmr
    // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
    case X86::CTEST16rr:
    case X86::CTEST32rr:
    case X86::CTEST64rr: {
      auto &Op0 = N->getOperand(0);
      if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
          !Op0.isMachineOpcode())
      SDValue And = N->getOperand(0);
#define CASE_ND(OP) \
      switch (And.getMachineOpcode()) {
        if (And->hasAnyUseOfValue(1))
        SmallVector<SDValue> Ops(N->op_values());
        Ops[0] = And.getOperand(0);
        Ops[1] = And.getOperand(1);
        MachineSDNode *Test =
            CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
        ReplaceUses(N, Test);
        if (And->hasAnyUseOfValue(1))
        bool IsCTESTCC = X86::isCTESTCC(Opc);
#define FROM_TO(A, B) \
  CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
        switch (And.getMachineOpcode()) {
          FROM_TO(AND8rm, TEST8mr);
          FROM_TO(AND16rm, TEST16mr);
          FROM_TO(AND32rm, TEST32mr);
          FROM_TO(AND64rm, TEST64mr);
        // Need to swap the memory and register operand.
        SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
                                    And.getOperand(3), And.getOperand(4),
                                    And.getOperand(5), And.getOperand(0)};
          Ops.push_back(N->getOperand(2));
          Ops.push_back(N->getOperand(3));
        // Chain of memory load
        Ops.push_back(And.getOperand(6));
          Ops.push_back(N->getOperand(4));

        MachineSDNode *Test = CurDAG->getMachineNode(
            NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
        CurDAG->setNodeMemRefs(
            Test, cast<MachineSDNode>(And.getNode())->memoperands());
        ReplaceUses(And.getValue(2), SDValue(Test, 1));
        ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    case X86::KORTESTBkk:
    case X86::KORTESTWkk:
    case X86::KORTESTDkk:
    case X86::KORTESTQkk: {
      SDValue Op0 = N->getOperand(0);
      if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
          !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))

      switch (Op0.getMachineOpcode()) {
#define FROM_TO(A, B) \
        FROM_TO(KORTESTBkk, KTESTBkk)
        FROM_TO(KORTESTWkk, KTESTWkk)
        FROM_TO(KORTESTDkk, KTESTDkk)
        FROM_TO(KORTESTQkk, KTESTQkk)
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())

      MachineSDNode *KTest = CurDAG->getMachineNode(
          NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
      ReplaceUses(N, KTest);
    // Attempt to remove vectors moves that were inserted to zero upper bits.
    case TargetOpcode::SUBREG_TO_REG: {
      unsigned SubRegIdx = N->getConstantOperandVal(2);
      if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)

      SDValue Move = N->getOperand(1);
      if (!Move.isMachineOpcode())

      // Make sure its one of the move opcodes we recognize.
      switch (Move.getMachineOpcode()) {
      CASE(VMOVAPDrr)       CASE(VMOVUPDrr)
      CASE(VMOVAPSrr)       CASE(VMOVUPSrr)
      CASE(VMOVDQArr)       CASE(VMOVDQUrr)
      CASE(VMOVAPDYrr)      CASE(VMOVUPDYrr)
      CASE(VMOVAPSYrr)      CASE(VMOVUPSYrr)
      CASE(VMOVDQAYrr)      CASE(VMOVDQUYrr)
      CASE(VMOVAPDZ128rr)   CASE(VMOVUPDZ128rr)
      CASE(VMOVAPSZ128rr)   CASE(VMOVUPSZ128rr)
      CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
      CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
      CASE(VMOVAPDZ256rr)   CASE(VMOVUPDZ256rr)
      CASE(VMOVAPSZ256rr)   CASE(VMOVUPSZ256rr)
      CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
      CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)

      SDValue In = Move.getOperand(0);
      if (!In.isMachineOpcode() ||
          In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)

      // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
      // the SHA instructions which use a legacy encoding.
      uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
      if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
          (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
          (TSFlags & X86II::EncodingMask) != X86II::XOP)

      // Producing instruction is another vector instruction. We can drop the
      CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
  CurDAG->RemoveDeadNodes();
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);
void X86DAGToDAGISel::emitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();
static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
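// I.e. only displacements that fit in 31 bits are accepted here, leaving room
// for the frame object's own offset to be folded in later without overflowing
// the signed 32-bit displacement field.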
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means that only the low 2GB of the address
    // space is directly addressable; we need indirect addressing for the high
    // 2GB of the address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
    if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  }
  AM.Disp = Val;
  return false;
}
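
// Worked example (illustrative): with AM.Disp == 16 and Offset == 24, Val
// becomes 40; on a 64-bit target in the small code model this passes
// isOffsetSuitableForCodeModel, so AM.Disp is updated to 40 and the function
// returns false (success). If AM carried an ExternalSymbol (AM.ES) or an
// MCSymbol, the same nonzero Val would be rejected by the check above.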
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetFuchsia())) {
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      return false;
      // Address space X86AS::SS is not handled here, because it is not used to
      // address TLS areas.
    }
  }

  return true;
}
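
// Illustrative example: IR such as
//   %val = load i64, ptr addrspace(256) null   ; address space 256 is GS
// reaches here as a load of address 0 in X86AS::GS, and the match above folds
// it into a "mov %gs:0, %reg"-style access by setting AM.Segment to GS while
// leaving Base, Index and Disp empty.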
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use such a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(AM.GV)) {
    AM = Backup;
    return true;
  }

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
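
// Illustrative example: (X86ISD::WrapperRIP (TargetGlobalAddress @g, 8)) with
// an otherwise empty AM ends up as {Base = %rip, GV = @g, Disp = 8}, i.e. the
// operand form "g+8(%rip)". If AM already held a base or index register, the
// RIP-relative form is rejected above (the function returns true) and the
// caller falls back to treating the wrapper value as a plain register.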
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
      AM.Base_Reg = SDValue();
      if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    // However, when GV is a local function symbol and in the same section as
    // the current instruction, and AM.Disp is negative and near INT32_MIN,
    // referencing GV+Disp generates a relocation referencing the section symbol
    // with an even smaller offset, which might underflow. We should bail out if
    // the negative offset is too close to INT32_MIN. Actually, we are more
    // conservative here, using a smaller magic number also used by
    // isOffsetSuitableForCodeModel.
    if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
      return true;

    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
  }

  return false;
}
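
// Illustrative example of the second post-processing step above: matching
// "x << 1" alone first yields {Index = x, Scale = 2}; since no base register
// was used, the rewrite turns it into {Base = x, Index = x, Scale = 1}, which
// selects as "leal (%reg,%reg), %dst", a shorter encoding than
// "leal (,%reg,2), %dst" because the latter needs a 32-bit displacement.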
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth + 1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                               Depth + 1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    return false;
  }
  N = Handle.getValue();
  return true;
}
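
// Illustrative example: for (add (shl %x, 3), %y) the first recursive attempt
// folds the shift as {Index = %x, Scale = 8} and then places %y as the base,
// giving the operand form "(%y,%x,8)". If neither operand had been foldable,
// the fallback above simply uses {Base = op0, Index = op1, Scale = 1}.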
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// function is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}
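
// Usage sketch (illustrative): callers that synthesize a replacement chain,
// such as foldMaskAndShiftToExtract below, create the new SRL/AND/SHL nodes
// and then call insertDAGNode(DAG, N, NewNode) for each one in def-before-use
// order, so every new node ends up topologically no later than N before
// ReplaceAllUsesWith(N, ...) and RemoveDeadNode(N) run.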
2079 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2080 // safe. This allows us to convert the shift and and into an h-register
2081 // extract and a scaled index. Returns false if the simplification is
2083 static bool foldMaskAndShiftToExtract(SelectionDAG
&DAG
, SDValue N
,
2085 SDValue Shift
, SDValue X
,
2086 X86ISelAddressMode
&AM
) {
2087 if (Shift
.getOpcode() != ISD::SRL
||
2088 !isa
<ConstantSDNode
>(Shift
.getOperand(1)) ||
2092 int ScaleLog
= 8 - Shift
.getConstantOperandVal(1);
2093 if (ScaleLog
<= 0 || ScaleLog
>= 4 ||
2094 Mask
!= (0xffu
<< ScaleLog
))
2097 MVT XVT
= X
.getSimpleValueType();
2098 MVT VT
= N
.getSimpleValueType();
2100 SDValue Eight
= DAG
.getConstant(8, DL
, MVT::i8
);
2101 SDValue NewMask
= DAG
.getConstant(0xff, DL
, XVT
);
2102 SDValue Srl
= DAG
.getNode(ISD::SRL
, DL
, XVT
, X
, Eight
);
2103 SDValue And
= DAG
.getNode(ISD::AND
, DL
, XVT
, Srl
, NewMask
);
2104 SDValue Ext
= DAG
.getZExtOrTrunc(And
, DL
, VT
);
2105 SDValue ShlCount
= DAG
.getConstant(ScaleLog
, DL
, MVT::i8
);
2106 SDValue Shl
= DAG
.getNode(ISD::SHL
, DL
, VT
, Ext
, ShlCount
);
2108 // Insert the new nodes into the topological ordering. We must do this in
2109 // a valid topological ordering as nothing is going to go back and re-sort
2110 // these nodes. We continually insert before 'N' in sequence as this is
2111 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2112 // hierarchy left to express.
2113 insertDAGNode(DAG
, N
, Eight
);
2114 insertDAGNode(DAG
, N
, NewMask
);
2115 insertDAGNode(DAG
, N
, Srl
);
2116 insertDAGNode(DAG
, N
, And
);
2117 insertDAGNode(DAG
, N
, Ext
);
2118 insertDAGNode(DAG
, N
, ShlCount
);
2119 insertDAGNode(DAG
, N
, Shl
);
2120 DAG
.ReplaceAllUsesWith(N
, Shl
);
2121 DAG
.RemoveDeadNode(N
.getNode());
2123 AM
.Scale
= (1 << ScaleLog
);
2127 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2128 // allows us to fold the shift into this addressing mode. Returns false if the
2129 // transform succeeded.
2130 static bool foldMaskedShiftToScaledMask(SelectionDAG
&DAG
, SDValue N
,
2131 X86ISelAddressMode
&AM
) {
2132 SDValue Shift
= N
.getOperand(0);
2134 // Use a signed mask so that shifting right will insert sign bits. These
2135 // bits will be removed when we shift the result left so it doesn't matter
2136 // what we use. This might allow a smaller immediate encoding.
2137 int64_t Mask
= cast
<ConstantSDNode
>(N
->getOperand(1))->getSExtValue();
2139 // If we have an any_extend feeding the AND, look through it to see if there
2140 // is a shift behind it. But only if the AND doesn't use the extended bits.
2141 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2142 bool FoundAnyExtend
= false;
2143 if (Shift
.getOpcode() == ISD::ANY_EXTEND
&& Shift
.hasOneUse() &&
2144 Shift
.getOperand(0).getSimpleValueType() == MVT::i32
&&
2146 FoundAnyExtend
= true;
2147 Shift
= Shift
.getOperand(0);
2150 if (Shift
.getOpcode() != ISD::SHL
||
2151 !isa
<ConstantSDNode
>(Shift
.getOperand(1)))
2154 SDValue X
= Shift
.getOperand(0);
2156 // Not likely to be profitable if either the AND or SHIFT node has more
2157 // than one use (unless all uses are for address computation). Besides,
2158 // isel mechanism requires their node ids to be reused.
2159 if (!N
.hasOneUse() || !Shift
.hasOneUse())
2162 // Verify that the shift amount is something we can fold.
2163 unsigned ShiftAmt
= Shift
.getConstantOperandVal(1);
2164 if (ShiftAmt
!= 1 && ShiftAmt
!= 2 && ShiftAmt
!= 3)
2167 MVT VT
= N
.getSimpleValueType();
2169 if (FoundAnyExtend
) {
2170 SDValue NewX
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VT
, X
);
2171 insertDAGNode(DAG
, N
, NewX
);
2175 SDValue NewMask
= DAG
.getSignedConstant(Mask
>> ShiftAmt
, DL
, VT
);
2176 SDValue NewAnd
= DAG
.getNode(ISD::AND
, DL
, VT
, X
, NewMask
);
2177 SDValue NewShift
= DAG
.getNode(ISD::SHL
, DL
, VT
, NewAnd
, Shift
.getOperand(1));
2179 // Insert the new nodes into the topological ordering. We must do this in
2180 // a valid topological ordering as nothing is going to go back and re-sort
2181 // these nodes. We continually insert before 'N' in sequence as this is
2182 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2183 // hierarchy left to express.
2184 insertDAGNode(DAG
, N
, NewMask
);
2185 insertDAGNode(DAG
, N
, NewAnd
);
2186 insertDAGNode(DAG
, N
, NewShift
);
2187 DAG
.ReplaceAllUsesWith(N
, NewShift
);
2188 DAG
.RemoveDeadNode(N
.getNode());
2190 AM
.Scale
= 1 << ShiftAmt
;
2191 AM
.IndexReg
= NewAnd
;
2195 // Implement some heroics to detect shifts of masked values where the mask can
2196 // be replaced by extending the shift and undoing that in the addressing mode
2197 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2198 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2199 // the addressing mode. This results in code such as:
2201 // int f(short *y, int *lookup_table) {
2203 // return *y + lookup_table[*y >> 11];
2207 // movzwl (%rdi), %eax
2210 // addl (%rsi,%rcx,4), %eax
2213 // movzwl (%rdi), %eax
2217 // addl (%rsi,%rcx), %eax
2219 // Note that this function assumes the mask is provided as a mask *after* the
2220 // value is shifted. The input chain may or may not match that, but computing
2221 // such a mask is trivial.
2222 static bool foldMaskAndShiftToScale(SelectionDAG
&DAG
, SDValue N
,
2224 SDValue Shift
, SDValue X
,
2225 X86ISelAddressMode
&AM
) {
2226 if (Shift
.getOpcode() != ISD::SRL
|| !Shift
.hasOneUse() ||
2227 !isa
<ConstantSDNode
>(Shift
.getOperand(1)))
2230 // We need to ensure that mask is a continuous run of bits.
2231 unsigned MaskIdx
, MaskLen
;
2232 if (!isShiftedMask_64(Mask
, MaskIdx
, MaskLen
))
2234 unsigned MaskLZ
= 64 - (MaskIdx
+ MaskLen
);
2236 unsigned ShiftAmt
= Shift
.getConstantOperandVal(1);
2238 // The amount of shift we're trying to fit into the addressing mode is taken
2239 // from the shifted mask index (number of trailing zeros of the mask).
2240 unsigned AMShiftAmt
= MaskIdx
;
2242 // There is nothing we can do here unless the mask is removing some bits.
2243 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2244 if (AMShiftAmt
== 0 || AMShiftAmt
> 3) return true;
2246 // Scale the leading zero count down based on the actual size of the value.
2247 // Also scale it down based on the size of the shift.
2248 unsigned ScaleDown
= (64 - X
.getSimpleValueType().getSizeInBits()) + ShiftAmt
;
2249 if (MaskLZ
< ScaleDown
)
2251 MaskLZ
-= ScaleDown
;
2253 // The final check is to ensure that any masked out high bits of X are
2254 // already known to be zero. Otherwise, the mask has a semantic impact
2255 // other than masking out a couple of low bits. Unfortunately, because of
2256 // the mask, zero extensions will be removed from operands in some cases.
2257 // This code works extra hard to look through extensions because we can
2258 // replace them with zero extensions cheaply if necessary.
2259 bool ReplacingAnyExtend
= false;
2260 if (X
.getOpcode() == ISD::ANY_EXTEND
) {
2261 unsigned ExtendBits
= X
.getSimpleValueType().getSizeInBits() -
2262 X
.getOperand(0).getSimpleValueType().getSizeInBits();
2263 // Assume that we'll replace the any-extend with a zero-extend, and
2264 // narrow the search to the extended value.
2265 X
= X
.getOperand(0);
2266 MaskLZ
= ExtendBits
> MaskLZ
? 0 : MaskLZ
- ExtendBits
;
2267 ReplacingAnyExtend
= true;
2269 APInt MaskedHighBits
=
2270 APInt::getHighBitsSet(X
.getSimpleValueType().getSizeInBits(), MaskLZ
);
2271 if (!DAG
.MaskedValueIsZero(X
, MaskedHighBits
))
2274 // We've identified a pattern that can be transformed into a single shift
2275 // and an addressing mode. Make it so.
2276 MVT VT
= N
.getSimpleValueType();
2277 if (ReplacingAnyExtend
) {
2278 assert(X
.getValueType() != VT
);
2279 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2280 SDValue NewX
= DAG
.getNode(ISD::ZERO_EXTEND
, SDLoc(X
), VT
, X
);
2281 insertDAGNode(DAG
, N
, NewX
);
2285 MVT XVT
= X
.getSimpleValueType();
2287 SDValue NewSRLAmt
= DAG
.getConstant(ShiftAmt
+ AMShiftAmt
, DL
, MVT::i8
);
2288 SDValue NewSRL
= DAG
.getNode(ISD::SRL
, DL
, XVT
, X
, NewSRLAmt
);
2289 SDValue NewExt
= DAG
.getZExtOrTrunc(NewSRL
, DL
, VT
);
2290 SDValue NewSHLAmt
= DAG
.getConstant(AMShiftAmt
, DL
, MVT::i8
);
2291 SDValue NewSHL
= DAG
.getNode(ISD::SHL
, DL
, VT
, NewExt
, NewSHLAmt
);
2293 // Insert the new nodes into the topological ordering. We must do this in
2294 // a valid topological ordering as nothing is going to go back and re-sort
2295 // these nodes. We continually insert before 'N' in sequence as this is
2296 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2297 // hierarchy left to express.
2298 insertDAGNode(DAG
, N
, NewSRLAmt
);
2299 insertDAGNode(DAG
, N
, NewSRL
);
2300 insertDAGNode(DAG
, N
, NewExt
);
2301 insertDAGNode(DAG
, N
, NewSHLAmt
);
2302 insertDAGNode(DAG
, N
, NewSHL
);
2303 DAG
.ReplaceAllUsesWith(N
, NewSHL
);
2304 DAG
.RemoveDeadNode(N
.getNode());
2306 AM
.Scale
= 1 << AMShiftAmt
;
2307 AM
.IndexReg
= NewExt
;
2311 // Transform "(X >> SHIFT) & (MASK << C1)" to
2312 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2313 // matched to a BEXTR later. Returns false if the simplification is performed.
2314 static bool foldMaskedShiftToBEXTR(SelectionDAG
&DAG
, SDValue N
,
2316 SDValue Shift
, SDValue X
,
2317 X86ISelAddressMode
&AM
,
2318 const X86Subtarget
&Subtarget
) {
2319 if (Shift
.getOpcode() != ISD::SRL
||
2320 !isa
<ConstantSDNode
>(Shift
.getOperand(1)) ||
2321 !Shift
.hasOneUse() || !N
.hasOneUse())
2324 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2325 if (!Subtarget
.hasTBM() &&
2326 !(Subtarget
.hasBMI() && Subtarget
.hasFastBEXTR()))
2329 // We need to ensure that mask is a continuous run of bits.
2330 unsigned MaskIdx
, MaskLen
;
2331 if (!isShiftedMask_64(Mask
, MaskIdx
, MaskLen
))
2334 unsigned ShiftAmt
= Shift
.getConstantOperandVal(1);
2336 // The amount of shift we're trying to fit into the addressing mode is taken
2337 // from the shifted mask index (number of trailing zeros of the mask).
2338 unsigned AMShiftAmt
= MaskIdx
;
2340 // There is nothing we can do here unless the mask is removing some bits.
2341 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2342 if (AMShiftAmt
== 0 || AMShiftAmt
> 3) return true;
2344 MVT XVT
= X
.getSimpleValueType();
2345 MVT VT
= N
.getSimpleValueType();
2347 SDValue NewSRLAmt
= DAG
.getConstant(ShiftAmt
+ AMShiftAmt
, DL
, MVT::i8
);
2348 SDValue NewSRL
= DAG
.getNode(ISD::SRL
, DL
, XVT
, X
, NewSRLAmt
);
2349 SDValue NewMask
= DAG
.getConstant(Mask
>> AMShiftAmt
, DL
, XVT
);
2350 SDValue NewAnd
= DAG
.getNode(ISD::AND
, DL
, XVT
, NewSRL
, NewMask
);
2351 SDValue NewExt
= DAG
.getZExtOrTrunc(NewAnd
, DL
, VT
);
2352 SDValue NewSHLAmt
= DAG
.getConstant(AMShiftAmt
, DL
, MVT::i8
);
2353 SDValue NewSHL
= DAG
.getNode(ISD::SHL
, DL
, VT
, NewExt
, NewSHLAmt
);
2355 // Insert the new nodes into the topological ordering. We must do this in
2356 // a valid topological ordering as nothing is going to go back and re-sort
2357 // these nodes. We continually insert before 'N' in sequence as this is
2358 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2359 // hierarchy left to express.
2360 insertDAGNode(DAG
, N
, NewSRLAmt
);
2361 insertDAGNode(DAG
, N
, NewSRL
);
2362 insertDAGNode(DAG
, N
, NewMask
);
2363 insertDAGNode(DAG
, N
, NewAnd
);
2364 insertDAGNode(DAG
, N
, NewExt
);
2365 insertDAGNode(DAG
, N
, NewSHLAmt
);
2366 insertDAGNode(DAG
, N
, NewSHL
);
2367 DAG
.ReplaceAllUsesWith(N
, NewSHL
);
2368 DAG
.RemoveDeadNode(N
.getNode());
2370 AM
.Scale
= 1 << AMShiftAmt
;
2371 AM
.IndexReg
= NewExt
;
2375 // Attempt to peek further into a scaled index register, collecting additional
2376 // extensions / offsets / etc. Returns /p N if we can't peek any further.
2377 SDValue
X86DAGToDAGISel::matchIndexRecursively(SDValue N
,
2378 X86ISelAddressMode
&AM
,
2380 assert(AM
.IndexReg
.getNode() == nullptr && "IndexReg already matched");
2381 assert((AM
.Scale
== 1 || AM
.Scale
== 2 || AM
.Scale
== 4 || AM
.Scale
== 8) &&
2382 "Illegal index scale");
2385 if (Depth
>= SelectionDAG::MaxRecursionDepth
)
2388 EVT VT
= N
.getValueType();
2389 unsigned Opc
= N
.getOpcode();
2391 // index: add(x,c) -> index: x, disp + c
2392 if (CurDAG
->isBaseWithConstantOffset(N
)) {
2393 auto *AddVal
= cast
<ConstantSDNode
>(N
.getOperand(1));
2394 uint64_t Offset
= (uint64_t)AddVal
->getSExtValue() * AM
.Scale
;
2395 if (!foldOffsetIntoAddress(Offset
, AM
))
2396 return matchIndexRecursively(N
.getOperand(0), AM
, Depth
+ 1);
2399 // index: add(x,x) -> index: x, scale * 2
2400 if (Opc
== ISD::ADD
&& N
.getOperand(0) == N
.getOperand(1)) {
2401 if (AM
.Scale
<= 4) {
2403 return matchIndexRecursively(N
.getOperand(0), AM
, Depth
+ 1);
2407 // index: shl(x,i) -> index: x, scale * (1 << i)
2408 if (Opc
== X86ISD::VSHLI
) {
2409 uint64_t ShiftAmt
= N
.getConstantOperandVal(1);
2410 uint64_t ScaleAmt
= 1ULL << ShiftAmt
;
2411 if ((AM
.Scale
* ScaleAmt
) <= 8) {
2412 AM
.Scale
*= ScaleAmt
;
2413 return matchIndexRecursively(N
.getOperand(0), AM
, Depth
+ 1);
2417 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2418 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2419 if (Opc
== ISD::SIGN_EXTEND
&& !VT
.isVector() && N
.hasOneUse()) {
2420 SDValue Src
= N
.getOperand(0);
2421 if (Src
.getOpcode() == ISD::ADD
&& Src
->getFlags().hasNoSignedWrap() &&
2423 if (CurDAG
->isBaseWithConstantOffset(Src
)) {
2424 SDValue AddSrc
= Src
.getOperand(0);
2425 auto *AddVal
= cast
<ConstantSDNode
>(Src
.getOperand(1));
2426 int64_t Offset
= AddVal
->getSExtValue();
2427 if (!foldOffsetIntoAddress((uint64_t)Offset
* AM
.Scale
, AM
)) {
2429 SDValue ExtSrc
= CurDAG
->getNode(Opc
, DL
, VT
, AddSrc
);
2430 SDValue ExtVal
= CurDAG
->getSignedConstant(Offset
, DL
, VT
);
2431 SDValue ExtAdd
= CurDAG
->getNode(ISD::ADD
, DL
, VT
, ExtSrc
, ExtVal
);
2432 insertDAGNode(*CurDAG
, N
, ExtSrc
);
2433 insertDAGNode(*CurDAG
, N
, ExtVal
);
2434 insertDAGNode(*CurDAG
, N
, ExtAdd
);
2435 CurDAG
->ReplaceAllUsesWith(N
, ExtAdd
);
2436 CurDAG
->RemoveDeadNode(N
.getNode());
2443 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2444 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2445 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2446 if (Opc
== ISD::ZERO_EXTEND
&& !VT
.isVector() && N
.hasOneUse()) {
2447 SDValue Src
= N
.getOperand(0);
2448 unsigned SrcOpc
= Src
.getOpcode();
2449 if (((SrcOpc
== ISD::ADD
&& Src
->getFlags().hasNoUnsignedWrap()) ||
2450 CurDAG
->isADDLike(Src
, /*NoWrap=*/true)) &&
2452 if (CurDAG
->isBaseWithConstantOffset(Src
)) {
2453 SDValue AddSrc
= Src
.getOperand(0);
2454 uint64_t Offset
= Src
.getConstantOperandVal(1);
2455 if (!foldOffsetIntoAddress(Offset
* AM
.Scale
, AM
)) {
2458 // If we're also scaling, see if we can use that as well.
2459 if (AddSrc
.getOpcode() == ISD::SHL
&&
2460 isa
<ConstantSDNode
>(AddSrc
.getOperand(1))) {
2461 SDValue ShVal
= AddSrc
.getOperand(0);
2462 uint64_t ShAmt
= AddSrc
.getConstantOperandVal(1);
2464 APInt::getHighBitsSet(AddSrc
.getScalarValueSizeInBits(), ShAmt
);
2465 uint64_t ScaleAmt
= 1ULL << ShAmt
;
2466 if ((AM
.Scale
* ScaleAmt
) <= 8 &&
2467 (AddSrc
->getFlags().hasNoUnsignedWrap() ||
2468 CurDAG
->MaskedValueIsZero(ShVal
, HiBits
))) {
2469 AM
.Scale
*= ScaleAmt
;
2470 SDValue ExtShVal
= CurDAG
->getNode(Opc
, DL
, VT
, ShVal
);
2471 SDValue ExtShift
= CurDAG
->getNode(ISD::SHL
, DL
, VT
, ExtShVal
,
2472 AddSrc
.getOperand(1));
2473 insertDAGNode(*CurDAG
, N
, ExtShVal
);
2474 insertDAGNode(*CurDAG
, N
, ExtShift
);
2479 SDValue ExtSrc
= CurDAG
->getNode(Opc
, DL
, VT
, AddSrc
);
2480 SDValue ExtVal
= CurDAG
->getConstant(Offset
, DL
, VT
);
2481 SDValue ExtAdd
= CurDAG
->getNode(SrcOpc
, DL
, VT
, ExtSrc
, ExtVal
);
2482 insertDAGNode(*CurDAG
, N
, ExtSrc
);
2483 insertDAGNode(*CurDAG
, N
, ExtVal
);
2484 insertDAGNode(*CurDAG
, N
, ExtAdd
);
2485 CurDAG
->ReplaceAllUsesWith(N
, ExtAdd
);
2486 CurDAG
->RemoveDeadNode(N
.getNode());
2487 return Res
? Res
: ExtSrc
;
2493 // TODO: Handle extensions, shifted masks etc.
2497 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N
, X86ISelAddressMode
&AM
,
2501 dbgs() << "MatchAddress: ";
2505 if (Depth
>= SelectionDAG::MaxRecursionDepth
)
2506 return matchAddressBase(N
, AM
);
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM
.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol address currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2515 if (!(AM
.ES
|| AM
.MCSym
) && AM
.JT
!= -1)
2518 if (auto *Cst
= dyn_cast
<ConstantSDNode
>(N
))
2519 if (!foldOffsetIntoAddress(Cst
->getSExtValue(), AM
))
2524 switch (N
.getOpcode()) {
2526 case ISD::LOCAL_RECOVER
: {
2527 if (!AM
.hasSymbolicDisplacement() && AM
.Disp
== 0)
2528 if (const auto *ESNode
= dyn_cast
<MCSymbolSDNode
>(N
.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM
.MCSym
= ESNode
->getMCSymbol();
2535 case ISD::Constant
: {
2536 uint64_t Val
= cast
<ConstantSDNode
>(N
)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val
, AM
))
2542 case X86ISD::Wrapper
:
2543 case X86ISD::WrapperRIP
:
2544 if (!matchWrapper(N
, AM
))
2549 if (!matchLoadInAddress(cast
<LoadSDNode
>(N
), AM
))
2553 case ISD::FrameIndex
:
2554 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&&
2555 AM
.Base_Reg
.getNode() == nullptr &&
2556 (!Subtarget
->is64Bit() || isDispSafeForFrameIndex(AM
.Disp
))) {
2557 AM
.BaseType
= X86ISelAddressMode::FrameIndexBase
;
2558 AM
.Base_FrameIndex
= cast
<FrameIndexSDNode
>(N
)->getIndex();
2564 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1)
2567 if (auto *CN
= dyn_cast
<ConstantSDNode
>(N
.getOperand(1))) {
2568 unsigned Val
= CN
->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val
== 1 || Val
== 2 || Val
== 3) {
2574 SDValue ShVal
= N
.getOperand(0);
2575 AM
.Scale
= 1 << Val
;
2576 AM
.IndexReg
= matchIndexRecursively(ShVal
, AM
, Depth
+ 1);
2583 // Scale must not be used already.
2584 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1) break;
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N
.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2591 SDValue And
= N
.getOperand(0);
2592 if (And
.getOpcode() != ISD::AND
) break;
2593 SDValue X
= And
.getOperand(0);
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2598 if (!isa
<ConstantSDNode
>(N
.getOperand(1)) ||
2599 !isa
<ConstantSDNode
>(And
.getOperand(1)))
2601 uint64_t Mask
= And
.getConstantOperandVal(1) >> N
.getConstantOperandVal(1);
2603 // Try to fold the mask and shift into the scale, and return false if we
2605 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
, N
, X
, AM
))
2610 case ISD::SMUL_LOHI
:
2611 case ISD::UMUL_LOHI
:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N
.getResNo() != 0) break;
2616 case X86ISD::MUL_IMM
:
2617 // X*[3,5,9] -> X+X*[2,4,8]
2618 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&&
2619 AM
.Base_Reg
.getNode() == nullptr &&
2620 AM
.IndexReg
.getNode() == nullptr) {
2621 if (auto *CN
= dyn_cast
<ConstantSDNode
>(N
.getOperand(1)))
2622 if (CN
->getZExtValue() == 3 || CN
->getZExtValue() == 5 ||
2623 CN
->getZExtValue() == 9) {
2624 AM
.Scale
= unsigned(CN
->getZExtValue())-1;
2626 SDValue MulVal
= N
.getOperand(0);
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal
.getNode()->getOpcode() == ISD::ADD
&& MulVal
.hasOneUse() &&
2633 isa
<ConstantSDNode
>(MulVal
.getOperand(1))) {
2634 Reg
= MulVal
.getOperand(0);
2635 auto *AddVal
= cast
<ConstantSDNode
>(MulVal
.getOperand(1));
2636 uint64_t Disp
= AddVal
->getSExtValue() * CN
->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp
, AM
))
2638 Reg
= N
.getOperand(0);
2640 Reg
= N
.getOperand(0);
2643 AM
.IndexReg
= AM
.Base_Reg
= Reg
;
2650 // Given A-B, if A can be completely folded into the address and
2651 // the index field with the index field unused, use -B as the index.
2652 // This is a win if a has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode
Handle(N
);
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup
= AM
;
2663 if (matchAddressRecursively(N
.getOperand(0), AM
, Depth
+1)) {
2664 N
= Handle
.getValue();
2668 N
= Handle
.getValue();
2669 // Test if the index field is free for use.
2670 if (AM
.IndexReg
.getNode() || AM
.isRIPRelative()) {
2676 SDValue RHS
= N
.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS
.getNode()->hasOneUse() ||
2681 RHS
.getNode()->getOpcode() == ISD::CopyFromReg
||
2682 RHS
.getNode()->getOpcode() == ISD::TRUNCATE
||
2683 RHS
.getNode()->getOpcode() == ISD::ANY_EXTEND
||
2684 (RHS
.getNode()->getOpcode() == ISD::ZERO_EXTEND
&&
2685 RHS
.getOperand(0).getValueType() == MVT::i32
))
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM
.BaseType
== X86ISelAddressMode::RegBase
&& AM
.Base_Reg
.getNode() &&
2690 !AM
.Base_Reg
.getNode()->hasOneUse()) ||
2691 AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM
.hasSymbolicDisplacement() && !Backup
.hasSymbolicDisplacement()) +
2696 ((AM
.Disp
!= 0) && (Backup
.Disp
== 0)) +
2697 (AM
.Segment
.getNode() && !Backup
.Segment
.getNode()) >= 2)
2699 // If it doesn't look like it may be an overall win, don't do it.
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2709 AM
.NegateIndex
= true;
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG
->isADDLike(N
))
2721 if (!matchAdd(N
, AM
, Depth
))
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2729 // Scale must not be used already.
2730 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1) break;
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N
.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2737 if (!isa
<ConstantSDNode
>(N
.getOperand(1)))
2740 if (N
.getOperand(0).getOpcode() == ISD::SRL
) {
2741 SDValue Shift
= N
.getOperand(0);
2742 SDValue X
= Shift
.getOperand(0);
2744 uint64_t Mask
= N
.getConstantOperandVal(1);
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
, Shift
, X
, AM
))
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG
, N
, Mask
, Shift
, X
, AM
, *Subtarget
))
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG
, N
, AM
))
2766 case ISD::ZERO_EXTEND
: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM
.IndexReg
.getNode() != nullptr || AM
.Scale
!= 1)
2772 SDValue Src
= N
.getOperand(0);
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src
.getOpcode() == ISD::ADD
|| Src
.getOpcode() == ISD::OR
)
2777 if (SDValue Index
= matchIndexRecursively(N
, AM
, Depth
+ 1))
2779 AM
.IndexReg
= Index
;
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask
= APInt::getAllOnes(Src
.getScalarValueSizeInBits());
2785 if (Src
.getOpcode() == ISD::AND
&& Src
.hasOneUse())
2786 if (auto *MaskC
= dyn_cast
<ConstantSDNode
>(Src
.getOperand(1))) {
2787 Mask
= MaskC
->getAPIntValue();
2788 Src
= Src
.getOperand(0);
2791 if (Src
.getOpcode() == ISD::SHL
&& Src
.hasOneUse() && N
->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc
= Src
.getOperand(0);
2794 SDValue ShlAmt
= Src
.getOperand(1);
2795 auto *ShAmtC
= dyn_cast
<ConstantSDNode
>(ShlAmt
);
2798 unsigned ShAmtV
= ShAmtC
->getZExtValue();
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2805 APInt::getHighBitsSet(ShlSrc
.getValueSizeInBits(), ShAmtV
);
2806 if (!Src
->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG
->MaskedValueIsZero(ShlSrc
, HighZeros
& Mask
))
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT
= ShlSrc
.getSimpleValueType();
2815 MVT VT
= N
.getSimpleValueType();
2818 SDValue Res
= ShlSrc
;
2819 if (!Mask
.isAllOnes()) {
2820 Res
= CurDAG
->getConstant(Mask
.lshr(ShAmtV
), DL
, SrcVT
);
2821 insertDAGNode(*CurDAG
, N
, Res
);
2822 Res
= CurDAG
->getNode(ISD::AND
, DL
, SrcVT
, ShlSrc
, Res
);
2823 insertDAGNode(*CurDAG
, N
, Res
);
2825 SDValue Zext
= CurDAG
->getNode(ISD::ZERO_EXTEND
, DL
, VT
, Res
);
2826 insertDAGNode(*CurDAG
, N
, Zext
);
2827 SDValue NewShl
= CurDAG
->getNode(ISD::SHL
, DL
, VT
, Zext
, ShlAmt
);
2828 insertDAGNode(*CurDAG
, N
, NewShl
);
2829 CurDAG
->ReplaceAllUsesWith(N
, NewShl
);
2830 CurDAG
->RemoveDeadNode(N
.getNode());
2832 // Convert the shift to scale factor.
2833 AM
.Scale
= 1 << ShAmtV
;
2834 // If matchIndexRecursively is not called here,
2835 // Zext may be replaced by other nodes but later used to call a builder
2837 AM
.IndexReg
= matchIndexRecursively(Zext
, AM
, Depth
+ 1);
2841 if (Src
.getOpcode() == ISD::SRL
&& !Mask
.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG
, N
, Mask
.getZExtValue(), Src
,
2844 Src
.getOperand(0), AM
))
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG
, N
, Mask
.getZExtValue(), Src
,
2849 Src
.getOperand(0), AM
))
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG
, N
, Mask
.getZExtValue(), Src
,
2854 Src
.getOperand(0), AM
, *Subtarget
))
2862 return matchAddressBase(N
, AM
);
/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
    // If so, check to see if the scale index register is set.
    if (!AM.IndexReg.getNode()) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base_Reg = N;
  return false;
}
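
// Illustrative example: if the base register is already occupied (say
// Base = %a from an earlier fold) and the index is still free, a leftover
// operand %b is placed as {Index = %b, Scale = 1}, yielding "(%a,%b)". If both
// base and index are taken, the function reports failure and the caller gives
// up on folding that operand into the address.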
2887 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N
,
2888 X86ISelAddressMode
&AM
,
2892 dbgs() << "MatchVectorAddress: ";
2896 if (Depth
>= SelectionDAG::MaxRecursionDepth
)
2897 return matchAddressBase(N
, AM
);
2899 // TODO: Support other operations.
2900 switch (N
.getOpcode()) {
2901 case ISD::Constant
: {
2902 uint64_t Val
= cast
<ConstantSDNode
>(N
)->getSExtValue();
2903 if (!foldOffsetIntoAddress(Val
, AM
))
2907 case X86ISD::Wrapper
:
2908 if (!matchWrapper(N
, AM
))
2912 // Add an artificial use to this node so that we can keep track of
2913 // it if it gets CSE'd with a different node.
2914 HandleSDNode
Handle(N
);
2916 X86ISelAddressMode Backup
= AM
;
2917 if (!matchVectorAddressRecursively(N
.getOperand(0), AM
, Depth
+ 1) &&
2918 !matchVectorAddressRecursively(Handle
.getValue().getOperand(1), AM
,
2923 // Try again after commuting the operands.
2924 if (!matchVectorAddressRecursively(Handle
.getValue().getOperand(1), AM
,
2926 !matchVectorAddressRecursively(Handle
.getValue().getOperand(0), AM
,
2931 N
= Handle
.getValue();
2936 return matchAddressBase(N
, AM
);
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  return matchVectorAddressRecursively(N, AM, 0);
}
2946 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode
*Parent
, SDValue BasePtr
,
2947 SDValue IndexOp
, SDValue ScaleOp
,
2948 SDValue
&Base
, SDValue
&Scale
,
2949 SDValue
&Index
, SDValue
&Disp
,
2951 X86ISelAddressMode AM
;
2952 AM
.Scale
= ScaleOp
->getAsZExtVal();
2954 // Attempt to match index patterns, as long as we're not relying on implicit
2955 // sign-extension, which is performed BEFORE scale.
2956 if (IndexOp
.getScalarValueSizeInBits() == BasePtr
.getScalarValueSizeInBits())
2957 AM
.IndexReg
= matchIndexRecursively(IndexOp
, AM
, 0);
2959 AM
.IndexReg
= IndexOp
;
2961 unsigned AddrSpace
= Parent
->getPointerInfo().getAddrSpace();
2962 if (AddrSpace
== X86AS::GS
)
2963 AM
.Segment
= CurDAG
->getRegister(X86::GS
, MVT::i16
);
2964 if (AddrSpace
== X86AS::FS
)
2965 AM
.Segment
= CurDAG
->getRegister(X86::FS
, MVT::i16
);
2966 if (AddrSpace
== X86AS::SS
)
2967 AM
.Segment
= CurDAG
->getRegister(X86::SS
, MVT::i16
);
2970 MVT VT
= BasePtr
.getSimpleValueType();
2972 // Try to match into the base and displacement fields.
2973 if (matchVectorAddress(BasePtr
, AM
))
2976 getAddressOperands(AM
, DL
, VT
, Base
, Scale
, Index
, Disp
, Segment
);
2980 /// Returns true if it is able to pattern match an addressing mode.
2981 /// It returns the operands which make up the maximal addressing mode it can
2982 /// match by reference.
2984 /// Parent is the parent node of the addr operand that is being matched. It
2985 /// is always a load, store, atomic node, or null. It is only null when
2986 /// checking memory operands for inline asm nodes.
2987 bool X86DAGToDAGISel::selectAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
2988 SDValue
&Scale
, SDValue
&Index
,
2989 SDValue
&Disp
, SDValue
&Segment
) {
2990 X86ISelAddressMode AM
;
2993 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2994 // that are not a MemSDNode, and thus don't have proper addrspace info.
2995 Parent
->getOpcode() != ISD::INTRINSIC_W_CHAIN
&& // unaligned loads, fixme
2996 Parent
->getOpcode() != ISD::INTRINSIC_VOID
&& // nontemporal stores
2997 Parent
->getOpcode() != X86ISD::TLSCALL
&& // Fixme
2998 Parent
->getOpcode() != X86ISD::ENQCMD
&& // Fixme
2999 Parent
->getOpcode() != X86ISD::ENQCMDS
&& // Fixme
3000 Parent
->getOpcode() != X86ISD::EH_SJLJ_SETJMP
&& // setjmp
3001 Parent
->getOpcode() != X86ISD::EH_SJLJ_LONGJMP
) { // longjmp
3002 unsigned AddrSpace
=
3003 cast
<MemSDNode
>(Parent
)->getPointerInfo().getAddrSpace();
3004 if (AddrSpace
== X86AS::GS
)
3005 AM
.Segment
= CurDAG
->getRegister(X86::GS
, MVT::i16
);
3006 if (AddrSpace
== X86AS::FS
)
3007 AM
.Segment
= CurDAG
->getRegister(X86::FS
, MVT::i16
);
3008 if (AddrSpace
== X86AS::SS
)
3009 AM
.Segment
= CurDAG
->getRegister(X86::SS
, MVT::i16
);
3012 // Save the DL and VT before calling matchAddress, it can invalidate N.
3014 MVT VT
= N
.getSimpleValueType();
3016 if (matchAddress(N
, AM
))
3019 getAddressOperands(AM
, DL
, VT
, Base
, Scale
, Index
, Disp
, Segment
);
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  // Cannot use 32 bit constants to reference objects in kernel/large code
  // models.
  if (TM.getCodeModel() == CodeModel::Kernel ||
      TM.getCodeModel() == CodeModel::Large)
    return false;

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'.
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  Imm = N;

  // Small/medium code model can reference non-TargetGlobalAddress objects with
  // 32 bit constants.
  if (N->getOpcode() != ISD::TargetGlobalAddress) {
    return TM.getCodeModel() == CodeModel::Small ||
           TM.getCodeModel() == CodeModel::Medium;
  }

  const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
  if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
    return CR->getUnsignedMax().ult(1ull << 32);

  return !TM.isLargeGlobalValue(GV);
}
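
// Illustrative example: in the small code model, (X86ISD::Wrapper
// (TargetGlobalAddress @g)) can be materialized as "movl $g, %eax", relying on
// the implicit zero-extension of 32-bit writes to fill the full 64-bit
// register. A TLS TPOFF symbol or a global known to be "large" fails the
// checks above and keeps the full 64-bit immediate (movabsq) form instead.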
3057 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N
, SDValue
&Base
,
3058 SDValue
&Scale
, SDValue
&Index
,
3059 SDValue
&Disp
, SDValue
&Segment
) {
3060 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3063 if (!selectLEAAddr(N
, Base
, Scale
, Index
, Disp
, Segment
))
3066 auto *RN
= dyn_cast
<RegisterSDNode
>(Base
);
3067 if (RN
&& RN
->getReg() == 0)
3068 Base
= CurDAG
->getRegister(0, MVT::i64
);
3069 else if (Base
.getValueType() == MVT::i32
&& !isa
<FrameIndexSDNode
>(Base
)) {
3070 // Base could already be %rip, particularly in the x32 ABI.
3071 SDValue ImplDef
= SDValue(CurDAG
->getMachineNode(X86::IMPLICIT_DEF
, DL
,
3073 Base
= CurDAG
->getTargetInsertSubreg(X86::sub_32bit
, DL
, MVT::i64
, ImplDef
,
3077 RN
= dyn_cast
<RegisterSDNode
>(Index
);
3078 if (RN
&& RN
->getReg() == 0)
3079 Index
= CurDAG
->getRegister(0, MVT::i64
);
3081 assert(Index
.getValueType() == MVT::i32
&&
3082 "Expect to be extending 32-bit registers for use in LEA");
3083 SDValue ImplDef
= SDValue(CurDAG
->getMachineNode(X86::IMPLICIT_DEF
, DL
,
3085 Index
= CurDAG
->getTargetInsertSubreg(X86::sub_32bit
, DL
, MVT::i64
, ImplDef
,
3092 /// Calls SelectAddr and determines if the maximal addressing
3093 /// mode it matches can be cost effectively emitted as an LEA instruction.
3094 bool X86DAGToDAGISel::selectLEAAddr(SDValue N
,
3095 SDValue
&Base
, SDValue
&Scale
,
3096 SDValue
&Index
, SDValue
&Disp
,
3098 X86ISelAddressMode AM
;
3100 // Save the DL and VT before calling matchAddress, it can invalidate N.
3102 MVT VT
= N
.getSimpleValueType();
3104 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3106 SDValue Copy
= AM
.Segment
;
3107 SDValue T
= CurDAG
->getRegister(0, MVT::i32
);
3109 if (matchAddress(N
, AM
))
3111 assert (T
== AM
.Segment
);
3114 unsigned Complexity
= 0;
3115 if (AM
.BaseType
== X86ISelAddressMode::RegBase
&& AM
.Base_Reg
.getNode())
3117 else if (AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
3120 if (AM
.IndexReg
.getNode())
3123 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
3128 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3129 // to a LEA. This is determined with some experimentation but is by no means
3130 // optimal (especially for code size consideration). LEA is nice because of
3131 // its three-address nature. Tweak the cost function again when we can run
3132 // convertToThreeAddress() at register allocation time.
3133 if (AM
.hasSymbolicDisplacement()) {
3134 // For X86-64, always use LEA to materialize RIP-relative addresses.
3135 if (Subtarget
->is64Bit())
3141 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3142 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3143 // duplicating flag-producing instructions later in the pipeline.
3144 if (N
.getOpcode() == ISD::ADD
) {
3145 auto isMathWithFlags
= [](SDValue V
) {
3146 switch (V
.getOpcode()) {
3153 /* TODO: These opcodes can be added safely, but we may want to justify
3154 their inclusion for different reasons (better for reg-alloc).
3159 // Value 1 is the flag output of the node - verify it's not dead.
3160 return !SDValue(V
.getNode(), 1).use_empty();
3165 // TODO: We might want to factor in whether there's a load folding
3166 // opportunity for the math op that disappears with LEA.
3167 if (isMathWithFlags(N
.getOperand(0)) || isMathWithFlags(N
.getOperand(1)))
3174 // If it isn't worth using an LEA, reject it.
3175 if (Complexity
<= 2)
3178 getAddressOperands(AM
, DL
, VT
, Base
, Scale
, Index
, Disp
, Segment
);
3182 /// This is only run on TargetGlobalTLSAddress nodes.
3183 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N
, SDValue
&Base
,
3184 SDValue
&Scale
, SDValue
&Index
,
3185 SDValue
&Disp
, SDValue
&Segment
) {
3186 assert(N
.getOpcode() == ISD::TargetGlobalTLSAddress
||
3187 N
.getOpcode() == ISD::TargetExternalSymbol
);
3189 X86ISelAddressMode AM
;
3190 if (auto *GA
= dyn_cast
<GlobalAddressSDNode
>(N
)) {
3191 AM
.GV
= GA
->getGlobal();
3192 AM
.Disp
+= GA
->getOffset();
3193 AM
.SymbolFlags
= GA
->getTargetFlags();
3195 auto *SA
= cast
<ExternalSymbolSDNode
>(N
);
3196 AM
.ES
= SA
->getSymbol();
3197 AM
.SymbolFlags
= SA
->getTargetFlags();
3200 if (Subtarget
->is32Bit()) {
3202 AM
.IndexReg
= CurDAG
->getRegister(X86::EBX
, MVT::i32
);
3205 MVT VT
= N
.getSimpleValueType();
3206 getAddressOperands(AM
, SDLoc(N
), VT
, Base
, Scale
, Index
, Disp
, Segment
);
3210 bool X86DAGToDAGISel::selectRelocImm(SDValue N
, SDValue
&Op
) {
3211 // Keep track of the original value type and whether this value was
3212 // truncated. If we see a truncation from pointer type to VT that truncates
3213 // bits that are known to be zero, we can use a narrow reference.
3214 EVT VT
= N
.getValueType();
3215 bool WasTruncated
= false;
3216 if (N
.getOpcode() == ISD::TRUNCATE
) {
3217 WasTruncated
= true;
3218 N
= N
.getOperand(0);
3221 if (N
.getOpcode() != X86ISD::Wrapper
)
3224 // We can only use non-GlobalValues as immediates if they were not truncated,
3225 // as we do not have any range information. If we have a GlobalValue and the
3226 // address was not truncated, we can select it as an operand directly.
3227 unsigned Opc
= N
.getOperand(0)->getOpcode();
3228 if (Opc
!= ISD::TargetGlobalAddress
|| !WasTruncated
) {
3229 Op
= N
.getOperand(0);
3230 // We can only select the operand directly if we didn't have to look past a
3232 return !WasTruncated
;
3235 // Check that the global's range fits into VT.
3236 auto *GA
= cast
<GlobalAddressSDNode
>(N
.getOperand(0));
3237 std::optional
<ConstantRange
> CR
= GA
->getGlobal()->getAbsoluteSymbolRange();
3238 if (!CR
|| CR
->getUnsignedMax().uge(1ull << VT
.getSizeInBits()))
3241 // Okay, we can use a narrow reference.
3242 Op
= CurDAG
->getTargetGlobalAddress(GA
->getGlobal(), SDLoc(N
), VT
,
3243 GA
->getOffset(), GA
->getTargetFlags());
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  auto &DL = MF->getDataLayout();
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    return CR->getSignedMin().sge(-1ull << Width) &&
           CR->getSignedMax().slt(1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return Width == 32 && !TM.isLargeGlobalValue(GV);
}
X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
  assert(N->isMachineOpcode() && "Unexpected node");
  unsigned Opc = N->getMachineOpcode();
  const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
  int CondNo = X86::getCondSrcNoFromDesc(MCID);
  if (CondNo < 0)
    return X86::COND_INVALID;

  return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
}
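
// Illustrative example: for a selected conditional instruction such as a SETCC
// or CMOV form, the condition code is carried as an immediate operand whose
// index X86::getCondSrcNoFromDesc recovers from the MCInstrDesc, so this
// returns, e.g., X86::COND_E for a sete/cmove variant; machine nodes whose
// descriptions carry no condition operand yield X86::COND_INVALID.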
3317 /// Test whether the given X86ISD::CMP node has any users that use a flag
3319 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags
) const {
3320 // Examine each user of the node.
3321 for (SDUse
&Use
: Flags
->uses()) {
3322 // Only check things that use the flags.
3323 if (Use
.getResNo() != Flags
.getResNo())
3325 SDNode
*User
= Use
.getUser();
3326 // Only examine CopyToReg uses that copy to EFLAGS.
3327 if (User
->getOpcode() != ISD::CopyToReg
||
3328 cast
<RegisterSDNode
>(User
->getOperand(1))->getReg() != X86::EFLAGS
)
3330 // Examine each user of the CopyToReg use.
3331 for (SDUse
&FlagUse
: User
->uses()) {
3332 // Only examine the Flag result.
3333 if (FlagUse
.getResNo() != 1)
3335 // Anything unusual: assume conservatively.
3336 if (!FlagUse
.getUser()->isMachineOpcode())
3338 // Examine the condition code of the user.
3339 X86::CondCode CC
= getCondFromNode(FlagUse
.getUser());
3342 // Comparisons which only use the zero flag.
3343 case X86::COND_E
: case X86::COND_NE
:
3345 // Anything else: assume conservatively.
3354 /// Test whether the given X86ISD::CMP node has any uses which require the SF
3355 /// flag to be accurate.
3356 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags
) const {
3357 // Examine each user of the node.
3358 for (SDUse
&Use
: Flags
->uses()) {
3359 // Only check things that use the flags.
3360 if (Use
.getResNo() != Flags
.getResNo())
3362 SDNode
*User
= Use
.getUser();
3363 // Only examine CopyToReg uses that copy to EFLAGS.
3364 if (User
->getOpcode() != ISD::CopyToReg
||
3365 cast
<RegisterSDNode
>(User
->getOperand(1))->getReg() != X86::EFLAGS
)
3367 // Examine each user of the CopyToReg use.
3368 for (SDUse
&FlagUse
: User
->uses()) {
3369 // Only examine the Flag result.
3370 if (FlagUse
.getResNo() != 1)
3372 // Anything unusual: assume conservatively.
3373 if (!FlagUse
.getUser()->isMachineOpcode())
3375 // Examine the condition code of the user.
3376 X86::CondCode CC
= getCondFromNode(FlagUse
.getUser());
3379 // Comparisons which don't examine the SF flag.
3380 case X86::COND_A
: case X86::COND_AE
:
3381 case X86::COND_B
: case X86::COND_BE
:
3382 case X86::COND_E
: case X86::COND_NE
:
3383 case X86::COND_O
: case X86::COND_NO
:
3384 case X86::COND_P
: case X86::COND_NP
:
3386 // Anything else: assume conservatively.
static bool mayUseCarryFlag(X86::CondCode CC) {
  switch (CC) {
  // Comparisons which don't examine the CF flag.
  case X86::COND_O: case X86::COND_NO:
  case X86::COND_E: case X86::COND_NE:
  case X86::COND_S: case X86::COND_NS:
  case X86::COND_P: case X86::COND_NP:
  case X86::COND_L: case X86::COND_GE:
  case X86::COND_G: case X86::COND_LE:
    return false;
  // Anything else: assume conservatively.
  default:
    return true;
  }
}
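
// Illustrative example: COND_B ("jb", unsigned less-than) is decided purely by
// CF and is not in the list above, so mayUseCarryFlag(X86::COND_B) returns
// true and a transform that would clobber CF must be skipped; COND_E only
// reads ZF, so it returns false and the transform remains safe for that user.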
3411 /// Test whether the given node which sets flags has any uses which require the
3412 /// CF flag to be accurate.
3413 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags
) const {
3414 // Examine each user of the node.
3415 for (SDUse
&Use
: Flags
->uses()) {
3416 // Only check things that use the flags.
3417 if (Use
.getResNo() != Flags
.getResNo())
3420 SDNode
*User
= Use
.getUser();
3421 unsigned UserOpc
= User
->getOpcode();
3423 if (UserOpc
== ISD::CopyToReg
) {
3424 // Only examine CopyToReg uses that copy to EFLAGS.
3425 if (cast
<RegisterSDNode
>(User
->getOperand(1))->getReg() != X86::EFLAGS
)
3427 // Examine each user of the CopyToReg use.
3428 for (SDUse
&FlagUse
: User
->uses()) {
3429 // Only examine the Flag result.
3430 if (FlagUse
.getResNo() != 1)
3432 // Anything unusual: assume conservatively.
3433 if (!FlagUse
.getUser()->isMachineOpcode())
3435 // Examine the condition code of the user.
3436 X86::CondCode CC
= getCondFromNode(FlagUse
.getUser());
3438 if (mayUseCarryFlag(CC
))
3442 // This CopyToReg is ok. Move on to the next user.
3446 // This might be an unselected node. So look for the pre-isel opcodes that
3451 // Something unusual. Be conservative.
3453 case X86ISD::SETCC
: CCOpNo
= 0; break;
3454 case X86ISD::SETCC_CARRY
: CCOpNo
= 0; break;
3455 case X86ISD::CMOV
: CCOpNo
= 2; break;
3456 case X86ISD::BRCOND
: CCOpNo
= 2; break;
3459 X86::CondCode CC
= (X86::CondCode
)User
->getConstantOperandVal(CCOpNo
);
3460 if (mayUseCarryFlag(CC
))
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
                                        SDValue StoredVal, SelectionDAG *CurDAG,
                                        unsigned LoadOpNo,
                                        LoadSDNode *&LoadNode,
                                        SDValue &InputChain) {
  // Is the stored value result 0 of the operation?
  if (StoredVal.getResNo() != 0) return false;

  // Are there other uses of the operation other than the store?
  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;

  // Is the store non-extending and non-indexed?
  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
    return false;

  SDValue Load = StoredVal->getOperand(LoadOpNo);
  // Is the stored value a non-extending and non-indexed load?
  if (!ISD::isNormalLoad(Load.getNode())) return false;

  // Return LoadNode by reference.
  LoadNode = cast<LoadSDNode>(Load);

  // Is store the only read of the loaded value?
  if (!Load.hasOneUse())
    return false;

  // Is the address of the store the same as the load?
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
      LoadNode->getOffset() != StoreNode->getOffset())
    return false;

  bool FoundLoad = false;
  SmallVector<SDValue, 4> ChainOps;
  SmallVector<const SDNode *, 4> LoopWorklist;
  SmallPtrSet<const SDNode *, 16> Visited;
  const unsigned int Max = 1024;

  // Visualization of Load-Op-Store fusion:
  // -------------------------
  // Legend:
  //   *-lines = Chain operand dependencies.
  //   |-lines = Normal operand dependencies.
  //   Dependencies flow down and right. n-suffix references multiple nodes.
  //
  // (ASCII diagram of the chains before and after the merge omitted; the
  //  merged node is shown there as A--LD_OP_ST.)
  //
  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
  //                                      #2: Yn -> LD
  //                                      #3: ST -> Zn
  //
  // Ensure the transform is safe by checking for the dual
  // dependencies to make sure we do not induce a loop.
  //
  // As LD is a predecessor to both OP and ST we can do this by checking:
  //  a). if LD is a predecessor to a member of Xn or Yn.
  //  b). if a Zn is a predecessor to ST.
  //
  // However, (b) can only occur through being a chain predecessor to
  // ST, which is the same as Zn being a member or predecessor of Xn,
  // which is a subset of LD being a predecessor of Xn. So it's
  // subsumed by check (a).

  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(1)) {
    FoundLoad = true;
    ChainOps.push_back(Load.getOperand(0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Load.getOperand(0));
        continue;
      }
      LoopWorklist.push_back(Op.getNode());
      ChainOps.push_back(Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
                                   true))
    return false;

  InputChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  return true;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen pattern for the memory operand is currently not able to match
// the case where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//                   [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
// but maybe need something like this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//                   [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
//                    (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// manually.
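//
// As a rough illustration of the transform (assuming no other users of the
// loaded value or of the flags): a DAG like
//   t = load i32 p;  u = add t, 5;  store u -> p
// is selected as a single read-modify-write instruction such as
// "add dword ptr [p], 5", and an add/sub of +/-1 can further become inc/dec
// when the carry flag is unused.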
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
  auto *StoreNode = cast<StoreSDNode>(Node);
  SDValue StoredVal = StoreNode->getOperand(1);
  unsigned Opc = StoredVal->getOpcode();

  // Before we try to select anything, make sure this is memory operand size
  // and opcode we can handle. Note that this must match the code below that
  // actually lowers the opcodes.
  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
      MemVT != MVT::i8)
    return false;

  bool IsCommutable = false;
  bool IsNegate = false;
  switch (Opc) {
  default:
    return false;
  case X86ISD::SUB:
    IsNegate = isNullConstant(StoredVal.getOperand(0));
    break;
  case X86ISD::SBB:
    break;
  case X86ISD::ADD:
  case X86ISD::ADC:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR:
    IsCommutable = true;
    break;
  }

  unsigned LoadOpNo = IsNegate ? 1 : 0;
  LoadSDNode *LoadNode = nullptr;
  SDValue InputChain;
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                   LoadNode, InputChain)) {
    if (!IsCommutable)
      return false;

    // This operation is commutable, try the other operand.
    LoadOpNo = 1;
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
                                     LoadNode, InputChain))
      return false;
  }

  SDValue Base, Scale, Index, Disp, Segment;
  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
                  Segment))
    return false;

  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
                          unsigned Opc8) {
    switch (MemVT.getSimpleVT().SimpleTy) {
    case MVT::i64:
      return Opc64;
    case MVT::i32:
      return Opc32;
    case MVT::i16:
      return Opc16;
    case MVT::i8:
      return Opc8;
    default:
      llvm_unreachable("Invalid size!");
    }
  };
  MachineSDNode *Result;
  switch (Opc) {
  case X86ISD::SUB:
    // Handle negate.
    if (IsNegate) {
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
                                     X86::NEG8m);
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                      MVT::Other, Ops);
      break;
    }
    [[fallthrough]];
  case X86ISD::ADD:
    // Try to match inc/dec.
    if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
      bool IsOne = isOneConstant(StoredVal.getOperand(1));
      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
      // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
      if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
        unsigned NewOpc =
            ((Opc == X86ISD::ADD) == IsOne)
                ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m,
                               X86::INC8m)
                : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m,
                               X86::DEC8m);
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
                                        MVT::Other, Ops);
        break;
      }
    }
    [[fallthrough]];
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::AND:
  case X86ISD::OR:
  case X86ISD::XOR: {
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
                            X86::ADD8mr);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
                            X86::ADC8mr);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
                            X86::SUB8mr);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
                            X86::SBB8mr);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
                            X86::AND8mr);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
                            X86::XOR8mr);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
      switch (Opc) {
      case X86ISD::ADD:
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
                            X86::ADD8mi);
      case X86ISD::ADC:
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
                            X86::ADC8mi);
      case X86ISD::SUB:
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
                            X86::SUB8mi);
      case X86ISD::SBB:
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
                            X86::SBB8mi);
      case X86ISD::AND:
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
                            X86::AND8mi);
      case X86ISD::OR:
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
                            X86::OR8mi);
      case X86ISD::XOR:
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
                            X86::XOR8mi);
      default:
        llvm_unreachable("Invalid opcode!");
      }
    };
    unsigned NewOpc = SelectRegOpcode(Opc);
    SDValue Operand = StoredVal->getOperand(1 - LoadOpNo);

    // See if the operand is a constant that we can fold into an immediate
    // operand.
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
      int64_t OperandV = OperandC->getSExtValue();

      // Check if we can shrink the operand enough to fit in an immediate (or
      // fit into a smaller immediate) by negating it and switching the
      // operation.
      if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
          ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
           (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
            isInt<32>(-OperandV))) &&
          hasNoCarryFlagUses(StoredVal.getValue(1))) {
        OperandV = -OperandV;
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
      }

      if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
        Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
        NewOpc = SelectImmOpcode(Opc);
      }
    }

    if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
      SDValue CopyTo =
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
                               StoredVal.getOperand(2), SDValue());

      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    } else {
      const SDValue Ops[] = {Base, Scale, Index, Disp,
                             Segment, Operand, InputChain};
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
                                      Ops);
    }
    break;
  }
  default:
    llvm_unreachable("Invalid opcode!");
  }
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
                                 LoadNode->getMemOperand()};
  CurDAG->setNodeMemRefs(Result, MemOps);

  // Update Load Chain uses as well.
  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
  CurDAG->RemoveDeadNode(Node);
  return true;
}
// See if this is an X & Mask that we can match to BEXTR/BZHI.
// Where Mask is one of the following patterns:
//   a) x &  (1 << nbits) - 1
//   b) x & ~(-1 << nbits)
//   c) x &  (-1 >> (32 - y))
//   d) x << (32 - y) >> (32 - y)
//   e) (1 << nbits) - 1
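//
// For instance, pattern a) with nbits = 13, i.e. x & ((1 << 13) - 1), keeps
// the 13 low bits of x; with BMI2 this becomes BZHI(x, 13), and with only
// BMI1 it becomes BEXTR with a control value of (13 << 8) | 0 = 0x0D00
// (start bit 0, length 13).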
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
  assert(
      (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
       Node->getOpcode() == ISD::SRL) &&
      "Should be either an and-mask, or right-shift after clearing high bits.");

  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
    return false;

  MVT NVT = Node->getSimpleValueType(0);

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  SDValue NBits;
  bool NegateNBits;

  // If we have BMI2's BZHI, we are ok with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
  const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
  auto checkUses = [AllowExtraUsesByDefault](
                       SDValue Op, unsigned NUses,
                       std::optional<bool> AllowExtraUses) {
    return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
  };
  auto checkOneUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 1, AllowExtraUses);
  };
  auto checkTwoUse = [checkUses](SDValue Op,
                                 std::optional<bool> AllowExtraUses =
                                     std::nullopt) {
    return checkUses(Op, 2, AllowExtraUses);
  };

  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
    if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
      assert(V.getSimpleValueType() == MVT::i32 &&
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
             "Expected i64 -> i32 truncation");
      V = V.getOperand(0);
    }
    return V;
  };
  // a) x & ((1 << nbits) + (-1))
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
                        &NegateNBits](SDValue Mask) -> bool {
    // Match `add`. Must only have one use!
    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
      return false;
    // We should be adding all-ones constant (i.e. subtracting one.)
    if (!isAllOnesConstant(Mask->getOperand(1)))
      return false;
    // Match `1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    if (!isOneConstant(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };
  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
    V = peekThroughOneUseTruncation(V);
    return CurDAG->MaskedValueIsAllOnes(
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
                                NVT.getSizeInBits()));
  };

  // b) x & ~(-1 << nbits)
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
                        &NBits, &NegateNBits](SDValue Mask) -> bool {
    // Match `~()`. Must only have one use!
    if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(Mask->getOperand(1)))
      return false;
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
      return false;
    // The -1 only has to be all-ones for the final Node's NVT.
    if (!isAllOnes(M0->getOperand(0)))
      return false;
    NBits = M0->getOperand(1);
    NegateNBits = false;
    return true;
  };
  // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
  // or leave the shift amount as-is, but then we'll have to negate it.
  auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
                                                     unsigned Bitwidth) {
    NBits = ShiftAmt;
    NegateNBits = true;

    // Skip over a truncate of the shift amount, if any.
    if (NBits.getOpcode() == ISD::TRUNCATE)
      NBits = NBits.getOperand(0);

    // Try to match the shift amount as (bitwidth - y). It should go away, too.
    // If it doesn't match, that's fine, we'll just negate it ourselves.
    if (NBits.getOpcode() != ISD::SUB)
      return;
    auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
    if (!V0 || V0->getZExtValue() != Bitwidth)
      return;
    NBits = NBits.getOperand(1);
    NegateNBits = false;
  };
  // c) x &  (-1 >> z)  but then we'll have to subtract z from bitwidth
  //   or
  // c) x &  (-1 >> (32 - y))
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
                        canonicalizeShiftAmt](SDValue Mask) -> bool {
    // The mask itself may be truncated.
    Mask = peekThroughOneUseTruncation(Mask);
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
    // Match `l>>`. Must only have one use!
    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
      return false;
    // We should be shifting truly all-ones constant.
    if (!isAllOnesConstant(Mask.getOperand(0)))
      return false;
    SDValue M1 = Mask.getOperand(1);
    // The shift amount should not be used externally.
    if (!checkOneUse(M1))
      return false;
    canonicalizeShiftAmt(M1, Bitwidth);
    // Pattern c. is non-canonical, and is expanded into pattern d. iff there
    // is no extra use of the mask. Clearly, there was one since we are here.
    // But at the same time, if we need to negate the shift amount,
    // then we don't want the mask to stick around, else it's unprofitable.
    return !NegateNBits;
  };

  SDValue X;
  // d) x << z >> z  but then we'll have to subtract z from bitwidth
  //   or
  // d) x << (32 - y) >> (32 - y)
  auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
                        AllowExtraUsesByDefault, &NegateNBits,
                        &X](SDNode *Node) -> bool {
    if (Node->getOpcode() != ISD::SRL)
      return false;
    SDValue N0 = Node->getOperand(0);
    if (N0->getOpcode() != ISD::SHL)
      return false;
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
    SDValue N1 = Node->getOperand(1);
    SDValue N01 = N0->getOperand(1);
    // Both of the shifts must be by the exact same value.
    if (N1 != N01)
      return false;
    canonicalizeShiftAmt(N1, Bitwidth);
    // There should not be any external uses of the inner shift / shift amount.
    // Note that while we are generally okay with external uses given BMI2,
    // iff we need to negate the shift amount, we are not okay with extra uses.
    const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
    if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
      return false;
    X = N0->getOperand(0);
    return true;
  };
  auto matchLowBitMask = [matchPatternA, matchPatternB,
                          matchPatternC](SDValue Mask) -> bool {
    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
  };

  if (Node->getOpcode() == ISD::AND) {
    X = Node->getOperand(0);
    SDValue Mask = Node->getOperand(1);

    if (matchLowBitMask(Mask)) {
      // Great.
    } else {
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (matchLowBitMask(SDValue(Node, 0))) {
    X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
  } else if (!matchPatternD(Node))
    return false;

  // If we need to negate the shift amount, require BMI2 BZHI support.
  // It's just too unprofitable for BMI1 BEXTR.
  if (NegateNBits && !Subtarget->hasBMI2())
    return false;

  SDLoc DL(Node);
  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                         MVT::i32, ImplDef, NBits, SRIdxVal),
                  0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // We might have matched the amount of high bits to be cleared,
  // but we want the amount of low bits to be kept, so negate it then.
  if (NegateNBits) {
    SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
    insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);

    NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
    insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
  }
  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI..
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }
  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with one-use trunc inbetween),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();
  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
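  // Another worked example: a control of 0x0504 (bit count 5, shift 4)
  // extracts bits [8:4], i.e. (x >> 4) & 0b11111.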
  //
  // Shift NBits left by 8 bits, thus producing 'control'.
  // This makes the low 8 bits zero.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }
  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated. Do that now.
  if (XVT != NVT) {
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
  }

  ReplaceNode(Node, Extract.getNode());
  SelectCode(Extract.getNode());

  return true;
}
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;
4152 if (N0
->getOpcode() != ISD::SRL
&& N0
->getOpcode() != ISD::SRA
)
4155 // Shift can't have additional users.
4156 if (!N0
->hasOneUse())
4159 // Only supported for 32 and 64 bits.
4160 if (NVT
!= MVT::i32
&& NVT
!= MVT::i64
)
4163 // Shift amount and RHS of and must be constant.
4164 auto *MaskCst
= dyn_cast
<ConstantSDNode
>(N1
);
4165 auto *ShiftCst
= dyn_cast
<ConstantSDNode
>(N0
->getOperand(1));
4166 if (!MaskCst
|| !ShiftCst
)
4169 // And RHS must be a mask.
4170 uint64_t Mask
= MaskCst
->getZExtValue();
4171 if (!isMask_64(Mask
))
4174 uint64_t Shift
= ShiftCst
->getZExtValue();
4175 uint64_t MaskSize
= llvm::popcount(Mask
);
4177 // Don't interfere with something that can be handled by extracting AH.
4178 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4179 if (Shift
== 8 && MaskSize
== 8)
4182 // Make sure we are only using bits that were in the original value, not
4184 if (Shift
+ MaskSize
> NVT
.getSizeInBits())
4187 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4188 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4189 // does not fit into 32 bits. Load folding is not a sufficient reason.
4190 if (!PreferBEXTR
&& MaskSize
<= 32)
  SDValue Control;
  unsigned ROpc, MOpc;

#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
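    // For example, for (x >> 4) & ((1ULL << 40) - 1) we emit BZHI with a bit
    // count of 4 + 40 = 44 followed by a shift right by 4, which yields the
    // same 40-bit field.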
    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
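    // E.g. Shift = 8 and MaskSize = 16 gives Control = (16 << 8) | 8 = 0x1008,
    // which extracts bits [23:8].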
    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
    if (Subtarget->hasTBM()) {
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }
  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
    unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
                                      : GET_ND_IF_ENABLED(X86::SHR32ri);
    NewNode =
        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
}
// Emit a PCMPISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
// need to emit a second instruction after this one. This is needed since we
// have two copyToReg nodes glued before this and we need to continue that
// glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InGlue };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
  EVT VT = N->getValueType(0);

  // Only handle scalar shifts.
  if (VT.isVector())
    return false;

  // Narrower shifts only mask to 5 bits in hardware.
  unsigned Size = VT == MVT::i64 ? 64 : 32;

  SDValue OrigShiftAmt = N->getOperand(1);
  SDValue ShiftAmt = OrigShiftAmt;
  SDLoc DL(N);

  // Skip over a truncate of the shift amount.
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
    ShiftAmt = ShiftAmt->getOperand(0);

  // This function is called after X86DAGToDAGISel::matchBitExtract(),
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.

  SDValue NewShiftAmt;
  if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
      ShiftAmt->getOpcode() == ISD::XOR) {
    SDValue Add0 = ShiftAmt->getOperand(0);
    SDValue Add1 = ShiftAmt->getOperand(1);
    auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
    auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
    // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
    // to avoid the ADD/SUB/XOR.
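    // (The hardware masks the shift amount to Size-1 bits, so e.g. a 64-bit
    // shift by (x + 64) produces the same result as a shift by x.)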
    if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
      NewShiftAmt = Add0;
    } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
               ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
                (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
      // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
      // we can replace it with a NOT. In the XOR case it may save some code
      // size, in the SUB case it also may save a move.
      assert(Add0C == nullptr || Add1C == nullptr);

      // We can only do N-X, not X-N
      if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
        return false;

      EVT OpVT = ShiftAmt.getValueType();

      SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
      NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
                                    Add0C == nullptr ? Add0 : Add1, AllOnes);
      insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
      insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
      // If we are shifting by N-X where N == 0 mod Size, then just shift by
      // -X to generate a NEG instead of a SUB of a constant.
    } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
               Add0C->getZExtValue() != 0) {
      EVT SubVT = ShiftAmt.getValueType();
      SDValue X;
      if (Add0C->getZExtValue() % Size == 0)
        X = Add1;
      else if (ShiftAmt.hasOneUse() && Size == 64 &&
               Add0C->getZExtValue() % 32 == 0) {
        // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
        // This is mainly beneficial if we already compute (x+n*32).
        if (Add1.getOpcode() == ISD::TRUNCATE) {
          Add1 = Add1.getOperand(0);
          SubVT = Add1.getValueType();
        }
        if (Add0.getValueType() != SubVT) {
          Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
          insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
        }

        X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
        insertDAGNode(*CurDAG, OrigShiftAmt, X);
      } else
        return false;

      // Insert a negate op.
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
      // that uses it that's not a shift.
      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
      NewShiftAmt = Neg;

      // Insert these operands into a valid topological order so they can
      // get selected independently.
      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
    } else
      return false;
  } else
    return false;

  if (NewShiftAmt.getValueType() != MVT::i8) {
    // Need to truncate the shift amount.
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
    // Add to a correct topological ordering.
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
  }

  // Insert a new mask to keep the shift amount legal. This should be removed
  // by isel patterns.
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
  // Place in a correct topological ordering.
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);

  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
                                                   NewShiftAmt);
  if (UpdatedNode != N) {
    // If we found an existing node, we should replace ourselves with that node
    // and wait for it to be selected after its other users.
    ReplaceNode(N, UpdatedNode);
    return true;
  }

  // If the original shift amount is now dead, delete it so that we don't run
  // it through isel.
  if (OrigShiftAmt.getNode()->use_empty())
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());

  // Now that we've optimized the shift amount, defer to normal isel to get
  // load folding and legacy vs BMI2 selection without repeating it here.
  SelectCode(N);
  return true;
}
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
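  // For example, (x << 8) | 0x3F00 can be rewritten as (x | 0x3F) << 8, where
  // 0x3F fits in a sign-extended 8-bit immediate while 0x3F00 does not.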
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Do this check late to defer the MaskedValueIsZero call as long as
  // possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
                                            ZExtWidth);
    NeededMask &= ~Cst->getAPIntValue();

    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
      return false;
  }

  SDValue X = Shift.getOperand(0);
  if (FoundAnyExtend) {
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
    X = NewX;
  }

  SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
                                   Shift.getOperand(1));
  ReplaceNode(N, NewSHL.getNode());
  SelectCode(NewSHL.getNode());
  return true;
}
bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
                                     SDNode *ParentB, SDNode *ParentC,
                                     SDValue A, SDValue B, SDValue C,
                                     uint8_t Imm) {
  assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
         C.isOperandOf(ParentC) && "Incorrect parent node");

  auto tryFoldLoadOrBCast =
      [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
             SDValue &Index, SDValue &Disp, SDValue &Segment) {
        if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
          return true;

        // Not a load, check for broadcast which may be behind a bitcast.
        if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
          P = L.getNode();
          L = L.getOperand(0);
        }

        if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
          return false;

        // Only 32 and 64 bit broadcasts are supported.
        auto *MemIntr = cast<MemIntrinsicSDNode>(L);
        unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
        if (Size != 32 && Size != 64)
          return false;

        return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
      };
  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    FoldedLoad = true;
  } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(A, C);
    // Swap bits 1/4 and 3/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0xa5;
    if (OldImm & 0x02) Imm |= 0x10;
    if (OldImm & 0x10) Imm |= 0x02;
    if (OldImm & 0x08) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x08;
  } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
                                Tmp4)) {
    FoldedLoad = true;
    std::swap(B, C);
    // Swap bits 1/2 and 5/6.
    uint8_t OldImm = Imm;
    Imm = OldImm & 0x99;
    if (OldImm & 0x02) Imm |= 0x04;
    if (OldImm & 0x04) Imm |= 0x02;
    if (OldImm & 0x20) Imm |= 0x40;
    if (OldImm & 0x40) Imm |= 0x20;
  }

  SDLoc DL(Root);
  SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);

  MVT NVT = Root->getSimpleValueType(0);

  MachineSDNode *MNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);

    unsigned Opc;
    if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
      auto *MemIntr = cast<MemIntrinsicSDNode>(C);
      unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
      assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");

      bool UseD = EltSize == 32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
      else
        llvm_unreachable("Unexpected vector size!");
    } else {
      bool UseD = NVT.getVectorElementType() == MVT::i32;
      if (NVT.is128BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
      else if (NVT.is256BitVector())
        Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
      else if (NVT.is512BitVector())
        Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
      else
        llvm_unreachable("Unexpected vector size!");
    }

    SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
    MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);

    // Update the chain.
    ReplaceUses(C.getValue(1), SDValue(MNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
  } else {
    bool UseD = NVT.getVectorElementType() == MVT::i32;
    unsigned Opc;
    if (NVT.is128BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
    else if (NVT.is256BitVector())
      Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
    else if (NVT.is512BitVector())
      Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
    else
      llvm_unreachable("Unexpected vector size!");

    MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
  }

  ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  SDValue A, FoldableOp;
  if ((FoldableOp = getFoldableLogicOp(N1))) {
    A = N0;
  } else if ((FoldableOp = getFoldableLogicOp(N0))) {
    A = N1;
  } else
    return false;

  SDValue B = FoldableOp.getOperand(0);
  SDValue C = FoldableOp.getOperand(1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;
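  // For example, for N = A | (B & C) the immediate works out to
  // (0xcc & 0xaa) | 0xf0 = 0xf8, which is exactly the truth table of
  // a | (b & c) evaluated on the three magic constants.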
  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR:  Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
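///
/// For example, for i32, x & 0x0FFFFFF0 needs a 32-bit immediate, but if the
/// top 4 bits of x are known zero it can become x & 0xFFFFFFF0 (-16), which
/// encodes as a sign-extended 8-bit immediate.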
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  // TODO: Handle constant folding?
  KnownBits Known0 = CurDAG->computeKnownBits(And0);
  if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;


#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
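// For an eq/ne-with-zero compare of (and X, Y), each result mask bit ends up
// as ((X & Y) != 0) via VPTESTM, or ((X & Y) == 0) via VPTESTNM for the SETEQ
// case.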
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
                                 SDValue InMask) {
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected VT!");

  // Look for equal and not equal compares.
  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return false;

  SDValue SetccOp0 = Setcc.getOperand(0);
  SDValue SetccOp1 = Setcc.getOperand(1);

  // Canonicalize the all zero vector to the RHS.
  if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
    std::swap(SetccOp0, SetccOp1);

  // See if we're comparing against zero.
  if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
    return false;

  SDValue N0 = SetccOp0;

  MVT CmpVT = N0.getSimpleValueType();
  MVT CmpSVT = CmpVT.getVectorElementType();

  // Start with both operands the same. We'll try to refine this.
  SDValue Src0 = N0;
  SDValue Src1 = N0;

  {
    // Look through single use bitcasts.
    SDValue N0Temp = N0;
    if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
      N0Temp = N0.getOperand(0);

    // Look for single use AND.
    if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
      Src0 = N0Temp.getOperand(0);
      Src1 = N0Temp.getOperand(1);
    }
  }

  // Without VLX we need to widen the operation.
  bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();

  auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
                                SDValue &Base, SDValue &Scale, SDValue &Index,
                                SDValue &Disp, SDValue &Segment) {
    // If we need to widen, we can't fold the load.
    if (!Widen)
      if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
        return true;

    // If we didn't fold a load, try to match broadcast. No widening limitation
    // for this. But only 32 and 64 bit types are supported.
    if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
      return false;

    // Look through single use bitcasts.
    if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
      P = L.getNode();
      L = L.getOperand(0);
    }

    if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
      return false;

    auto *MemIntr = cast<MemIntrinsicSDNode>(L);
    if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
      return false;

    return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
  };

  // We can only fold loads if the sources are unique.
  bool CanFoldLoads = Src0 != Src1;

  bool FoldedLoad = false;
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (CanFoldLoads) {
    FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
                                    Tmp3, Tmp4);
    if (!FoldedLoad) {
      // And is commutative.
      FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
                                      Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(Src0, Src1);
    }
  }

  bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;

  bool IsMasked = InMask.getNode() != nullptr;

  SDLoc dl(Root);

  MVT ResVT = Setcc.getSimpleValueType();
  MVT MaskVT = ResVT;
  if (Widen) {
    // Widen the inputs using insert_subreg or copy_to_regclass.
    unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
    unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
                                                     CmpVT), 0);
    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);

    if (!FoldedBCast)
      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);

    if (IsMasked) {
      // Widen the mask.
      unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              dl, MaskVT, InMask, RC), 0);
    }
  }

  bool IsTestN = CC == ISD::SETEQ;
  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                               IsMasked);

  MachineSDNode *CNode;
  if (FoldedLoad) {
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);

    if (IsMasked) {
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    } else {
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
                        Src1.getOperand(0) };
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
    }

    // Update the chain.
    ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
  } else {
    if (IsMasked)
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
    else
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
  }

  // If we widened, we need to shrink the mask VT.
  if (Widen) {
    unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                   dl, ResVT, SDValue(CNode, 0), RC);
  }

  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
  CurDAG->RemoveDeadNode(Root);
  return true;
}
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
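// The 0xCA immediate used below is just that expression evaluated on the
// VPTERNLOG magic constants: (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xc0 | 0x0a.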
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");

  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512())
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND ||
      N1.getOpcode() != X86ISD::ANDNP ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return false;

  // ANDN is not commutable, use it to pick down A and C.
  SDValue A = N1.getOperand(0);
  SDValue C = N1.getOperand(1);

  // AND is commutable, if one operand matches A, the other operand is B.
  // Otherwise this isn't a match.
  SDValue B;
  if (N0.getOperand(0) == A)
    B = N0.getOperand(1);
  else if (N0.getOperand(1) == A)
    B = N0.getOperand(0);
  else
    return false;

  SDLoc dl(N);
  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  ReplaceNode(N, Ternlog.getNode());

  return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
                        Ternlog.getNode(), A, B, C, 0xCA);
}
void X86DAGToDAGISel::Select(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  unsigned Opcode = Node->getOpcode();
  SDLoc dl(Node);

  if (Node->isMachineOpcode()) {
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
    Node->setNodeId(-1);
    return; // Already selected.
  }

  switch (Opcode) {
  default:
    break;
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default:
      break;
    case Intrinsic::x86_encodekey128:
    case Intrinsic::x86_encodekey256: {
      if (!Subtarget->hasKL())
        break;

      unsigned Opcode;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_encodekey128:
        Opcode = X86::ENCODEKEY128;
        break;
      case Intrinsic::x86_encodekey256:
        Opcode = X86::ENCODEKEY256;
        break;
      }

      SDValue Chain = Node->getOperand(0);
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
                                   SDValue());
      if (Opcode == X86::ENCODEKEY256)
        Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
                                     Chain.getValue(1));

      MachineSDNode *Res = CurDAG->getMachineNode(
          Opcode, dl, Node->getVTList(),
          {Node->getOperand(2), Chain, Chain.getValue(1)});
      ReplaceNode(Node, Res);
      return;
    }
    case Intrinsic::x86_tileloaddrs64_internal:
    case Intrinsic::x86_tileloaddrst164_internal:
      if (!Subtarget->hasAMXMOVRS())
        break;
      [[fallthrough]];
    case Intrinsic::x86_tileloadd64_internal:
    case Intrinsic::x86_tileloaddt164_internal: {
      if (!Subtarget->hasAMXTILE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
      unsigned Opc;
      switch (IntNo) {
      default:
        llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_tileloaddrs64_internal:
        Opc = X86::PTILELOADDRSV;
        break;
      case Intrinsic::x86_tileloaddrst164_internal:
        Opc = X86::PTILELOADDRST1V;
        break;
      case Intrinsic::x86_tileloadd64_internal:
        Opc = X86::PTILELOADDV;
        break;
      case Intrinsic::x86_tileloaddt164_internal:
        Opc = X86::PTILELOADDT1V;
        break;
      }
      // _tile_loadd_internal(row, col, buf, STRIDE)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
      ReplaceNode(Node, CNode);
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse3_monitor:
    case Intrinsic::x86_monitorx:
    case Intrinsic::x86_clzero: {
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

      unsigned Opc = 0;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_sse3_monitor:
        if (!Subtarget->hasSSE3())
          break;
        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
        break;
      case Intrinsic::x86_monitorx:
        if (!Subtarget->hasMWAITX())
          break;
        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
        break;
      case Intrinsic::x86_clzero:
        if (!Subtarget->hasCLZERO())
          break;
        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
        break;
      }

      if (Opc) {
        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
                                             Node->getOperand(2), SDValue());
        SDValue InGlue = Chain.getValue(1);

        if (IntNo == Intrinsic::x86_sse3_monitor ||
            IntNo == Intrinsic::x86_monitorx) {
          // Copy the other two operands to ECX and EDX.
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
                                       InGlue);
          InGlue = Chain.getValue(1);
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
                                       InGlue);
          InGlue = Chain.getValue(1);
        }

        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
                                                      {Chain, InGlue});
        ReplaceNode(Node, CNode);
        return;
      }

      break;
    }
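    // Hardware register conventions behind the copies above: MONITOR and
    // MONITORX take the linear address in rAX/EAX, extensions in ECX and
    // hints in EDX, while CLZERO zeroes the cache line addressed by rAX/EAX.
    // The selected machine instructions only read these fixed registers, so
    // the operands must be copied into them explicitly before emission.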
    case Intrinsic::x86_tilestored64_internal: {
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
      unsigned Opc = X86::PTILESTOREDV;
      // _tile_stored_internal(row, col, buf, STRIDE, c)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Node->getOperand(6),
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_tileloaddrs64:
    case Intrinsic::x86_tileloaddrst164:
      if (!Subtarget->hasAMXMOVRS())
        break;
      [[fallthrough]];
    case Intrinsic::x86_tileloadd64:
    case Intrinsic::x86_tileloaddt164:
    case Intrinsic::x86_tilestored64: {
      if (!Subtarget->hasAMXTILE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
      unsigned Opc;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
      case Intrinsic::x86_tileloaddrs64:
        Opc = X86::PTILELOADDRS;
        break;
      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
      case Intrinsic::x86_tileloaddrst164:
        Opc = X86::PTILELOADDRST1;
        break;
      case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
      }
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      if (Opc == X86::PTILESTORED) {
        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      } else {
        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      }
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_t2rpntlvwz0rs:
    case Intrinsic::x86_t2rpntlvwz0rst1:
    case Intrinsic::x86_t2rpntlvwz1rs:
    case Intrinsic::x86_t2rpntlvwz1rst1:
      if (!Subtarget->hasAMXMOVRS())
        break;
      [[fallthrough]];
    case Intrinsic::x86_t2rpntlvwz0:
    case Intrinsic::x86_t2rpntlvwz0t1:
    case Intrinsic::x86_t2rpntlvwz1:
    case Intrinsic::x86_t2rpntlvwz1t1: {
      if (!Subtarget->hasAMXTRANSPOSE())
        break;
      auto *MFI =
          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
      unsigned Opc;
      switch (IntNo) {
      default:
        llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_t2rpntlvwz0:
        Opc = X86::PT2RPNTLVWZ0;
        break;
      case Intrinsic::x86_t2rpntlvwz0t1:
        Opc = X86::PT2RPNTLVWZ0T1;
        break;
      case Intrinsic::x86_t2rpntlvwz1:
        Opc = X86::PT2RPNTLVWZ1;
        break;
      case Intrinsic::x86_t2rpntlvwz1t1:
        Opc = X86::PT2RPNTLVWZ1T1;
        break;
      case Intrinsic::x86_t2rpntlvwz0rs:
        Opc = X86::PT2RPNTLVWZ0RS;
        break;
      case Intrinsic::x86_t2rpntlvwz0rst1:
        Opc = X86::PT2RPNTLVWZ0RST1;
        break;
      case Intrinsic::x86_t2rpntlvwz1rs:
        Opc = X86::PT2RPNTLVWZ1RS;
        break;
      case Intrinsic::x86_t2rpntlvwz1rst1:
        Opc = X86::PT2RPNTLVWZ1RST1;
        break;
      }
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
      MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case X86ISD::NT_BRIND: {
    if (Subtarget->isTargetNaCl())
      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
      // leave the instruction alone.
      break;
    if (Subtarget->isTarget64BitILP32()) {
      // Converts a 32-bit register to a 64-bit, zero-extended version of
      // it. This is needed because x86-64 can do many things, but jmp %r32
      // ain't one of them.
      SDValue Target = Node->getOperand(1);
      assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
      SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
                                      Node->getOperand(0), ZextTarget);
      ReplaceNode(Node, Brind.getNode());
      SelectCode(ZextTarget.getNode());
      SelectCode(Brind.getNode());
      return;
    }
    break;
  }
  case X86ISD::GlobalBaseReg:
    ReplaceNode(Node, getGlobalBaseReg());
    return;

  case ISD::BITCAST:
    // Just drop all 128/256/512-bit bitcasts.
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case X86ISD::VPTERNLOG: {
    uint8_t Imm = Node->getConstantOperandVal(3);
    if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
                       Node->getOperand(1), Node->getOperand(2), Imm))
      return;
    break;
  }

  case X86ISD::ANDNP:
    if (tryVPTERNLOG(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    [[fallthrough]];
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;
    if (tryVPTERNLOG(Node))
      return;

    [[fallthrough]];
  case ISD::ADD:
    if (Opcode == ISD::ADD && matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the patterns on the add/sub/and/or/xor with immediate patterns in the
    // tablegen files to check immediate use count without making the patterns
    // unavailable to the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    auto *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
        break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
        break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
        break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD:
        ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
        MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
        break;
      case ISD::SUB:
        ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
        MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
        break;
      case ISD::AND:
        ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
        MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
        break;
      case ISD::OR:
        ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
        break;
      case ISD::XOR:
        ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
        MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
        break;
      }
      break;
    }

    // Ok this is an AND/OR/XOR/ADD/SUB with constant.

    // If this is not a subtract, we can still try to fold a load.
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }
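  // Rough size math behind the optsize heuristic above: a 32-bit-immediate
  // ALU op such as "andl $imm32, %eax" encodes in about 6 bytes, while the
  // register form is 2 bytes. With three users of the same constant, folding
  // costs ~18 bytes, whereas one 5-byte "movl $imm32, %ecx" plus three 2-byte
  // register ops costs ~11 bytes (exact numbers vary with REX prefixes and
  // operand sizes).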
  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    [[fallthrough]];
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
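  // The i8 path above relies on the hardware convention that MUL/IMUL r/m8
  // multiplies AL by the operand and leaves the 16-bit product in AX, so only
  // AL has to be seeded via CopyToReg and the flags come back as an extra
  // result of the machine node.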
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    unsigned LoReg, HiReg;
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi  ? X86::MULX32Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned ? X86::IMUL32r
                       : X86::MUL32r;
      MOpc = UseMULXHi  ? X86::MULX32Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned ? X86::IMUL32m
                        : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi  ? X86::MULX64Hrr
            : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned ? X86::IMUL64r
                       : X86::MUL64r;
      MOpc = UseMULXHi  ? X86::MULX64Hrm
             : UseMULX  ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned ? X86::IMUL64m
                        : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                       NVT, InGlue);
        InGlue = ResLo.getValue(2);
      }
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                       NVT, InGlue);
        InGlue = ResHi.getValue(2);
      }
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }
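  // Why LoReg is EDX/RDX when MULX is used: BMI2 MULX reads its implicit
  // multiplicand from EDX/RDX, writes the low and high halves to two explicit
  // destination registers, and does not modify EFLAGS, which is why the MULX
  // paths above need no glue for flags.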
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InGlue = Chain.getValue(1);
    } else {
      InGlue =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                               LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
            SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode =
            SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InGlue).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      MachineSDNode *CNode =
          CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }
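  // Register conventions that drive the case above: DIV/IDIV r/m8 divide the
  // 16-bit dividend in AX and return the quotient in AL and the remainder in
  // AH (hence the MOVZX/MOVSX widening into AX and HiReg == AH), while the
  // wider forms divide DX:AX / EDX:EAX / RDX:RAX, which is why the high half
  // is either sign-extended with CWD/CDQ/CQO or explicitly zeroed first.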
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
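  // The sequence built above is the classic pre-FCOMI idiom: FCOM/FUCOM sets
  // C0/C2/C3 in the x87 status word, FNSTSW AX copies the status word into
  // AX, and SAHF loads AH into EFLAGS so that C0/C2/C3 land in CF/PF/ZF,
  // the same flags FCOMI/FUCOMI would have produced directly on newer CPUs.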
  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE;
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Mask);
        unsigned TrailingZeros = llvm::countr_zero(Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
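  // Illustration of the shifted-mask rewrite above: for a 64-bit compare such
  // as "(x & 0xFF00000000000000) == 0", instead of materializing the mask
  // with a movabsq plus TEST, the code emits "shrq $56, %reg" followed by a
  // register TEST; when the field sits in the middle of the value and is
  // exactly 8/16/32 bits wide, the TEST is instead done on the matching
  // subregister (TEST8rr/TEST16rr/TEST32rr) after the shift.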
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
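  // Two instructions may be needed above because the hardware splits the
  // results: PCMPISTRI returns the match index in ECX while PCMPISTRM returns
  // the match mask in XMM0, and both set EFLAGS, so the flag result is wired
  // to whichever instruction was emitted last.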
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
      CNode =
          emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
      unsigned MOpc =
          Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;
  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
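  // SETB_C32r/SETB_C64r are pseudos that are expanded after isel to
  // "sbb reg, reg", which materializes 0 or -1 from the carry flag in a
  // single instruction. On subtargets that do not treat the same-register
  // SBB as dependency breaking, getSBBZero() is used instead so the input
  // register is explicitly zeroed first.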
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on what a type
    // constraint would say, just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
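  // Decoding the gather opcode names picked above: the letter right after
  // VGATHER/VPGATHER gives the index element width (D = dword indices,
  // Q = qword indices), and the data-element letters that follow (PS/PD for
  // float/double, D/Q for 32/64-bit integers) give the element type, so
  // IndexVT, NumElts and EltSize select exactly one instruction. The
  // Z128/Z256/Z suffixes are the EVEX-encoded AVX-512 forms; the two narrower
  // ones are why VLX is required for 128/256-bit gathers.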
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here based on what a type
    // constraint would say, just like table based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}
bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}
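// The five operands pushed above are the standard X86 memory-operand tuple
// used throughout this file: base register, scale, index register,
// displacement and segment register, in that order, exactly as produced by
// selectAddr().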
X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
    : SelectionDAGISelPass(
          std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISelLegacy(TM, OptLevel);
}