1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to a X86 dag.
12 //===----------------------------------------------------------------------===//
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineModuleInfo.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/Config/llvm-config.h"
23 #include "llvm/IR/ConstantRange.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Support/MathExtras.h"
37 #define DEBUG_TYPE "x86-isel"
38 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
40 STATISTIC(NumLoadMoved
, "Number of loads moved below TokenFactor");
42 static cl::opt
<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
43 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
46 static cl::opt
<bool> EnablePromoteAnyextLoad(
47 "x86-promote-anyext-load", cl::init(true),
48 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden
);
50 extern cl::opt
<bool> IndirectBranchTracking
;
52 //===----------------------------------------------------------------------===//
53 // Pattern Matcher Implementation
54 //===----------------------------------------------------------------------===//
57 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
58 /// numbers for the leaves of the matched tree.
59 struct X86ISelAddressMode
{
65 // This is really a union, discriminated by BaseType!
67 int Base_FrameIndex
= 0;
73 const GlobalValue
*GV
= nullptr;
74 const Constant
*CP
= nullptr;
75 const BlockAddress
*BlockAddr
= nullptr;
76 const char *ES
= nullptr;
77 MCSymbol
*MCSym
= nullptr;
79 Align Alignment
; // CP alignment.
80 unsigned char SymbolFlags
= X86II::MO_NO_FLAG
; // X86II::MO_*
81 bool NegateIndex
= false;
83 X86ISelAddressMode() = default;
85 bool hasSymbolicDisplacement() const {
86 return GV
!= nullptr || CP
!= nullptr || ES
!= nullptr ||
87 MCSym
!= nullptr || JT
!= -1 || BlockAddr
!= nullptr;
90 bool hasBaseOrIndexReg() const {
91 return BaseType
== FrameIndexBase
||
92 IndexReg
.getNode() != nullptr || Base_Reg
.getNode() != nullptr;
95 /// Return true if this addressing mode is already RIP-relative.
96 bool isRIPRelative() const {
97 if (BaseType
!= RegBase
) return false;
98 if (RegisterSDNode
*RegNode
=
99 dyn_cast_or_null
<RegisterSDNode
>(Base_Reg
.getNode()))
100 return RegNode
->getReg() == X86::RIP
;
104 void setBaseReg(SDValue Reg
) {
109 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
110 void dump(SelectionDAG
*DAG
= nullptr) {
111 dbgs() << "X86ISelAddressMode " << this << '\n';
112 dbgs() << "Base_Reg ";
113 if (Base_Reg
.getNode())
114 Base_Reg
.getNode()->dump(DAG
);
117 if (BaseType
== FrameIndexBase
)
118 dbgs() << " Base.FrameIndex " << Base_FrameIndex
<< '\n';
119 dbgs() << " Scale " << Scale
<< '\n'
123 if (IndexReg
.getNode())
124 IndexReg
.getNode()->dump(DAG
);
127 dbgs() << " Disp " << Disp
<< '\n'
149 dbgs() << " JT" << JT
<< " Align" << Alignment
.value() << '\n';
156 //===--------------------------------------------------------------------===//
157 /// ISel - X86-specific code to select X86 machine instructions for
158 /// SelectionDAG operations.
160 class X86DAGToDAGISel final
: public SelectionDAGISel
{
161 /// Keep a pointer to the X86Subtarget around so that we can
162 /// make the right decision when generating code for different targets.
163 const X86Subtarget
*Subtarget
;
165 /// If true, selector should try to optimize for minimum code size.
168 /// Disable direct TLS access through segment registers.
169 bool IndirectTlsSegRefs
;
174 X86DAGToDAGISel() = delete;
176 explicit X86DAGToDAGISel(X86TargetMachine
&tm
, CodeGenOptLevel OptLevel
)
177 : SelectionDAGISel(ID
, tm
, OptLevel
), Subtarget(nullptr),
178 OptForMinSize(false), IndirectTlsSegRefs(false) {}
180 bool runOnMachineFunction(MachineFunction
&MF
) override
{
181 // Reset the subtarget each time through.
182 Subtarget
= &MF
.getSubtarget
<X86Subtarget
>();
183 IndirectTlsSegRefs
= MF
.getFunction().hasFnAttribute(
184 "indirect-tls-seg-refs");
186 // OptFor[Min]Size are used in pattern predicates that isel is matching.
187 OptForMinSize
= MF
.getFunction().hasMinSize();
188 assert((!OptForMinSize
|| MF
.getFunction().hasOptSize()) &&
189 "OptForMinSize implies OptForSize");
191 SelectionDAGISel::runOnMachineFunction(MF
);
195 void emitFunctionEntryCode() override
;
197 bool IsProfitableToFold(SDValue N
, SDNode
*U
, SDNode
*Root
) const override
;
199 void PreprocessISelDAG() override
;
200 void PostprocessISelDAG() override
;
202 // Include the pieces autogenerated from the target description.
203 #include "X86GenDAGISel.inc"
206 void Select(SDNode
*N
) override
;
208 bool foldOffsetIntoAddress(uint64_t Offset
, X86ISelAddressMode
&AM
);
209 bool matchLoadInAddress(LoadSDNode
*N
, X86ISelAddressMode
&AM
,
210 bool AllowSegmentRegForX32
= false);
211 bool matchWrapper(SDValue N
, X86ISelAddressMode
&AM
);
212 bool matchAddress(SDValue N
, X86ISelAddressMode
&AM
);
213 bool matchVectorAddress(SDValue N
, X86ISelAddressMode
&AM
);
214 bool matchAdd(SDValue
&N
, X86ISelAddressMode
&AM
, unsigned Depth
);
215 SDValue
matchIndexRecursively(SDValue N
, X86ISelAddressMode
&AM
,
217 bool matchAddressRecursively(SDValue N
, X86ISelAddressMode
&AM
,
219 bool matchVectorAddressRecursively(SDValue N
, X86ISelAddressMode
&AM
,
221 bool matchAddressBase(SDValue N
, X86ISelAddressMode
&AM
);
222 bool selectAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
223 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
225 bool selectVectorAddr(MemSDNode
*Parent
, SDValue BasePtr
, SDValue IndexOp
,
226 SDValue ScaleOp
, SDValue
&Base
, SDValue
&Scale
,
227 SDValue
&Index
, SDValue
&Disp
, SDValue
&Segment
);
228 bool selectMOV64Imm32(SDValue N
, SDValue
&Imm
);
229 bool selectLEAAddr(SDValue N
, SDValue
&Base
,
230 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
232 bool selectLEA64_32Addr(SDValue N
, SDValue
&Base
,
233 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
235 bool selectTLSADDRAddr(SDValue N
, SDValue
&Base
,
236 SDValue
&Scale
, SDValue
&Index
, SDValue
&Disp
,
238 bool selectRelocImm(SDValue N
, SDValue
&Op
);
240 bool tryFoldLoad(SDNode
*Root
, SDNode
*P
, SDValue N
,
241 SDValue
&Base
, SDValue
&Scale
,
242 SDValue
&Index
, SDValue
&Disp
,
245 // Convenience method where P is also root.
246 bool tryFoldLoad(SDNode
*P
, SDValue N
,
247 SDValue
&Base
, SDValue
&Scale
,
248 SDValue
&Index
, SDValue
&Disp
,
250 return tryFoldLoad(P
, P
, N
, Base
, Scale
, Index
, Disp
, Segment
);
253 bool tryFoldBroadcast(SDNode
*Root
, SDNode
*P
, SDValue N
,
254 SDValue
&Base
, SDValue
&Scale
,
255 SDValue
&Index
, SDValue
&Disp
,
258 bool isProfitableToFormMaskedOp(SDNode
*N
) const;
260 /// Implement addressing mode selection for inline asm expressions.
261 bool SelectInlineAsmMemoryOperand(const SDValue
&Op
,
262 InlineAsm::ConstraintCode ConstraintID
,
263 std::vector
<SDValue
> &OutOps
) override
;
265 void emitSpecialCodeForMain();
267 inline void getAddressOperands(X86ISelAddressMode
&AM
, const SDLoc
&DL
,
268 MVT VT
, SDValue
&Base
, SDValue
&Scale
,
269 SDValue
&Index
, SDValue
&Disp
,
271 if (AM
.BaseType
== X86ISelAddressMode::FrameIndexBase
)
272 Base
= CurDAG
->getTargetFrameIndex(
273 AM
.Base_FrameIndex
, TLI
->getPointerTy(CurDAG
->getDataLayout()));
274 else if (AM
.Base_Reg
.getNode())
277 Base
= CurDAG
->getRegister(0, VT
);
279 Scale
= getI8Imm(AM
.Scale
, DL
);
281 // Negate the index if needed.
282 if (AM
.NegateIndex
) {
283 unsigned NegOpc
= VT
== MVT::i64
? X86::NEG64r
: X86::NEG32r
;
284 SDValue Neg
= SDValue(CurDAG
->getMachineNode(NegOpc
, DL
, VT
, MVT::i32
,
289 if (AM
.IndexReg
.getNode())
292 Index
= CurDAG
->getRegister(0, VT
);
294 // These are 32-bit even in 64-bit mode since RIP-relative offset
297 Disp
= CurDAG
->getTargetGlobalAddress(AM
.GV
, SDLoc(),
301 Disp
= CurDAG
->getTargetConstantPool(AM
.CP
, MVT::i32
, AM
.Alignment
,
302 AM
.Disp
, AM
.SymbolFlags
);
304 assert(!AM
.Disp
&& "Non-zero displacement is ignored with ES.");
305 Disp
= CurDAG
->getTargetExternalSymbol(AM
.ES
, MVT::i32
, AM
.SymbolFlags
);
306 } else if (AM
.MCSym
) {
307 assert(!AM
.Disp
&& "Non-zero displacement is ignored with MCSym.");
308 assert(AM
.SymbolFlags
== 0 && "oo");
309 Disp
= CurDAG
->getMCSymbol(AM
.MCSym
, MVT::i32
);
310 } else if (AM
.JT
!= -1) {
311 assert(!AM
.Disp
&& "Non-zero displacement is ignored with JT.");
312 Disp
= CurDAG
->getTargetJumpTable(AM
.JT
, MVT::i32
, AM
.SymbolFlags
);
313 } else if (AM
.BlockAddr
)
314 Disp
= CurDAG
->getTargetBlockAddress(AM
.BlockAddr
, MVT::i32
, AM
.Disp
,
317 Disp
= CurDAG
->getTargetConstant(AM
.Disp
, DL
, MVT::i32
);
319 if (AM
.Segment
.getNode())
320 Segment
= AM
.Segment
;
322 Segment
= CurDAG
->getRegister(0, MVT::i16
);
325 // Utility function to determine whether we should avoid selecting
326 // immediate forms of instructions for better code size or not.
327 // At a high level, we'd like to avoid such instructions when
328 // we have similar constants used within the same basic block
329 // that can be kept in a register.
331 bool shouldAvoidImmediateInstFormsForSize(SDNode
*N
) const {
332 uint32_t UseCount
= 0;
334 // Do not want to hoist if we're not optimizing for size.
335 // TODO: We'd like to remove this restriction.
336 // See the comment in X86InstrInfo.td for more info.
337 if (!CurDAG
->shouldOptForSize())
340 // Walk all the users of the immediate.
341 for (const SDNode
*User
: N
->uses()) {
345 // This user is already selected. Count it as a legitimate use and
347 if (User
->isMachineOpcode()) {
352 // We want to count stores of immediates as real uses.
353 if (User
->getOpcode() == ISD::STORE
&&
354 User
->getOperand(1).getNode() == N
) {
359 // We don't currently match users that have > 2 operands (except
360 // for stores, which are handled above)
361 // Those instruction won't match in ISEL, for now, and would
362 // be counted incorrectly.
363 // This may change in the future as we add additional instruction
365 if (User
->getNumOperands() != 2)
368 // If this is a sign-extended 8-bit integer immediate used in an ALU
369 // instruction, there is probably an opcode encoding to save space.
370 auto *C
= dyn_cast
<ConstantSDNode
>(N
);
371 if (C
&& isInt
<8>(C
->getSExtValue()))
374 // Immediates that are used for offsets as part of stack
375 // manipulation should be left alone. These are typically
376 // used to indicate SP offsets for argument passing and
377 // will get pulled into stores/pushes (implicitly).
378 if (User
->getOpcode() == X86ISD::ADD
||
379 User
->getOpcode() == ISD::ADD
||
380 User
->getOpcode() == X86ISD::SUB
||
381 User
->getOpcode() == ISD::SUB
) {
383 // Find the other operand of the add/sub.
384 SDValue OtherOp
= User
->getOperand(0);
385 if (OtherOp
.getNode() == N
)
386 OtherOp
= User
->getOperand(1);
388 // Don't count if the other operand is SP.
389 RegisterSDNode
*RegNode
;
390 if (OtherOp
->getOpcode() == ISD::CopyFromReg
&&
391 (RegNode
= dyn_cast_or_null
<RegisterSDNode
>(
392 OtherOp
->getOperand(1).getNode())))
393 if ((RegNode
->getReg() == X86::ESP
) ||
394 (RegNode
->getReg() == X86::RSP
))
398 // ... otherwise, count this and move on.
402 // If we have more than 1 use, then recommend for hoisting.
403 return (UseCount
> 1);
406 /// Return a target constant with the specified value of type i8.
407 inline SDValue
getI8Imm(unsigned Imm
, const SDLoc
&DL
) {
408 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i8
);
411 /// Return a target constant with the specified value, of type i32.
412 inline SDValue
getI32Imm(unsigned Imm
, const SDLoc
&DL
) {
413 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i32
);
416 /// Return a target constant with the specified value, of type i64.
417 inline SDValue
getI64Imm(uint64_t Imm
, const SDLoc
&DL
) {
418 return CurDAG
->getTargetConstant(Imm
, DL
, MVT::i64
);
421 SDValue
getExtractVEXTRACTImmediate(SDNode
*N
, unsigned VecWidth
,
423 assert((VecWidth
== 128 || VecWidth
== 256) && "Unexpected vector width");
424 uint64_t Index
= N
->getConstantOperandVal(1);
425 MVT VecVT
= N
->getOperand(0).getSimpleValueType();
426 return getI8Imm((Index
* VecVT
.getScalarSizeInBits()) / VecWidth
, DL
);
429 SDValue
getInsertVINSERTImmediate(SDNode
*N
, unsigned VecWidth
,
431 assert((VecWidth
== 128 || VecWidth
== 256) && "Unexpected vector width");
432 uint64_t Index
= N
->getConstantOperandVal(2);
433 MVT VecVT
= N
->getSimpleValueType(0);
434 return getI8Imm((Index
* VecVT
.getScalarSizeInBits()) / VecWidth
, DL
);
437 SDValue
getPermuteVINSERTCommutedImmediate(SDNode
*N
, unsigned VecWidth
,
439 assert(VecWidth
== 128 && "Unexpected vector width");
440 uint64_t Index
= N
->getConstantOperandVal(2);
441 MVT VecVT
= N
->getSimpleValueType(0);
442 uint64_t InsertIdx
= (Index
* VecVT
.getScalarSizeInBits()) / VecWidth
;
443 assert((InsertIdx
== 0 || InsertIdx
== 1) && "Bad insertf128 index");
444 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
445 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
446 return getI8Imm(InsertIdx
? 0x02 : 0x30, DL
);
449 SDValue
getSBBZero(SDNode
*N
) {
451 MVT VT
= N
->getSimpleValueType(0);
454 SDVTList VTs
= CurDAG
->getVTList(MVT::i32
, MVT::i32
);
455 SDValue Zero
= SDValue(
456 CurDAG
->getMachineNode(X86::MOV32r0
, dl
, VTs
, std::nullopt
), 0);
457 if (VT
== MVT::i64
) {
459 CurDAG
->getMachineNode(
460 TargetOpcode::SUBREG_TO_REG
, dl
, MVT::i64
,
461 CurDAG
->getTargetConstant(0, dl
, MVT::i64
), Zero
,
462 CurDAG
->getTargetConstant(X86::sub_32bit
, dl
, MVT::i32
)),
466 // Copy flags to the EFLAGS register and glue it to next node.
467 unsigned Opcode
= N
->getOpcode();
468 assert((Opcode
== X86ISD::SBB
|| Opcode
== X86ISD::SETCC_CARRY
) &&
469 "Unexpected opcode for SBB materialization");
470 unsigned FlagOpIndex
= Opcode
== X86ISD::SBB
? 2 : 1;
472 CurDAG
->getCopyToReg(CurDAG
->getEntryNode(), dl
, X86::EFLAGS
,
473 N
->getOperand(FlagOpIndex
), SDValue());
475 // Create a 64-bit instruction if the result is 64-bits otherwise use the
477 unsigned Opc
= VT
== MVT::i64
? X86::SBB64rr
: X86::SBB32rr
;
478 MVT SBBVT
= VT
== MVT::i64
? MVT::i64
: MVT::i32
;
479 VTs
= CurDAG
->getVTList(SBBVT
, MVT::i32
);
481 CurDAG
->getMachineNode(Opc
, dl
, VTs
,
482 {Zero
, Zero
, EFLAGS
, EFLAGS
.getValue(1)}),
486 // Helper to detect unneeded and instructions on shift amounts. Called
487 // from PatFrags in tablegen.
488 bool isUnneededShiftMask(SDNode
*N
, unsigned Width
) const {
489 assert(N
->getOpcode() == ISD::AND
&& "Unexpected opcode");
490 const APInt
&Val
= N
->getConstantOperandAPInt(1);
492 if (Val
.countr_one() >= Width
)
495 APInt Mask
= Val
| CurDAG
->computeKnownBits(N
->getOperand(0)).Zero
;
496 return Mask
.countr_one() >= Width
;
499 /// Return an SDNode that returns the value of the global base register.
500 /// Output instructions required to initialize the global base register,
502 SDNode
*getGlobalBaseReg();
504 /// Return a reference to the TargetMachine, casted to the target-specific
506 const X86TargetMachine
&getTargetMachine() const {
507 return static_cast<const X86TargetMachine
&>(TM
);
510 /// Return a reference to the TargetInstrInfo, casted to the target-specific
512 const X86InstrInfo
*getInstrInfo() const {
513 return Subtarget
->getInstrInfo();
516 /// Return a condition code of the given SDNode
517 X86::CondCode
getCondFromNode(SDNode
*N
) const;
519 /// Address-mode matching performs shift-of-and to and-of-shift
520 /// reassociation in order to expose more scaled addressing
522 bool ComplexPatternFuncMutatesDAG() const override
{
526 bool isSExtAbsoluteSymbolRef(unsigned Width
, SDNode
*N
) const;
528 // Indicates we should prefer to use a non-temporal load for this load.
529 bool useNonTemporalLoad(LoadSDNode
*N
) const {
530 if (!N
->isNonTemporal())
533 unsigned StoreSize
= N
->getMemoryVT().getStoreSize();
535 if (N
->getAlign().value() < StoreSize
)
539 default: llvm_unreachable("Unsupported store size");
544 return Subtarget
->hasSSE41();
546 return Subtarget
->hasAVX2();
548 return Subtarget
->hasAVX512();
552 bool foldLoadStoreIntoMemOperand(SDNode
*Node
);
553 MachineSDNode
*matchBEXTRFromAndImm(SDNode
*Node
);
554 bool matchBitExtract(SDNode
*Node
);
555 bool shrinkAndImmediate(SDNode
*N
);
556 bool isMaskZeroExtended(SDNode
*N
) const;
557 bool tryShiftAmountMod(SDNode
*N
);
558 bool tryShrinkShlLogicImm(SDNode
*N
);
559 bool tryVPTERNLOG(SDNode
*N
);
560 bool matchVPTERNLOG(SDNode
*Root
, SDNode
*ParentA
, SDNode
*ParentB
,
561 SDNode
*ParentC
, SDValue A
, SDValue B
, SDValue C
,
563 bool tryVPTESTM(SDNode
*Root
, SDValue Setcc
, SDValue Mask
);
564 bool tryMatchBitSelect(SDNode
*N
);
566 MachineSDNode
*emitPCMPISTR(unsigned ROpc
, unsigned MOpc
, bool MayFoldLoad
,
567 const SDLoc
&dl
, MVT VT
, SDNode
*Node
);
568 MachineSDNode
*emitPCMPESTR(unsigned ROpc
, unsigned MOpc
, bool MayFoldLoad
,
569 const SDLoc
&dl
, MVT VT
, SDNode
*Node
,
572 bool tryOptimizeRem8Extend(SDNode
*N
);
574 bool onlyUsesZeroFlag(SDValue Flags
) const;
575 bool hasNoSignFlagUses(SDValue Flags
) const;
576 bool hasNoCarryFlagUses(SDValue Flags
) const;
580 char X86DAGToDAGISel::ID
= 0;
582 INITIALIZE_PASS(X86DAGToDAGISel
, DEBUG_TYPE
, PASS_NAME
, false, false)
584 // Returns true if this masked compare can be implemented legally with this
586 static bool isLegalMaskCompare(SDNode
*N
, const X86Subtarget
*Subtarget
) {
587 unsigned Opcode
= N
->getOpcode();
588 if (Opcode
== X86ISD::CMPM
|| Opcode
== X86ISD::CMPMM
||
589 Opcode
== X86ISD::STRICT_CMPM
|| Opcode
== ISD::SETCC
||
590 Opcode
== X86ISD::CMPMM_SAE
|| Opcode
== X86ISD::VFPCLASS
) {
591 // We can get 256-bit 8 element types here without VLX being enabled. When
592 // this happens we will use 512-bit operations and the mask will not be
594 EVT OpVT
= N
->getOperand(0).getValueType();
595 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
597 if (Opcode
== X86ISD::STRICT_CMPM
)
598 OpVT
= N
->getOperand(1).getValueType();
599 if (OpVT
.is256BitVector() || OpVT
.is128BitVector())
600 return Subtarget
->hasVLX();
604 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
605 if (Opcode
== X86ISD::VFPCLASSS
|| Opcode
== X86ISD::FSETCCM
||
606 Opcode
== X86ISD::FSETCCM_SAE
)
612 // Returns true if we can assume the writer of the mask has zero extended it
614 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode
*N
) const {
615 // If this is an AND, check if we have a compare on either side. As long as
616 // one side guarantees the mask is zero extended, the AND will preserve those
618 if (N
->getOpcode() == ISD::AND
)
619 return isLegalMaskCompare(N
->getOperand(0).getNode(), Subtarget
) ||
620 isLegalMaskCompare(N
->getOperand(1).getNode(), Subtarget
);
622 return isLegalMaskCompare(N
, Subtarget
);
626 X86DAGToDAGISel::IsProfitableToFold(SDValue N
, SDNode
*U
, SDNode
*Root
) const {
627 if (OptLevel
== CodeGenOptLevel::None
)
633 if (N
.getOpcode() != ISD::LOAD
)
636 // Don't fold non-temporal loads if we have an instruction for them.
637 if (useNonTemporalLoad(cast
<LoadSDNode
>(N
)))
640 // If N is a load, do additional profitability checks.
642 switch (U
->getOpcode()) {
652 case ISD::UADDO_CARRY
:
656 SDValue Op1
= U
->getOperand(1);
658 // If the other operand is a 8-bit immediate we should fold the immediate
659 // instead. This reduces code size.
661 // movl 4(%esp), %eax
665 // addl 4(%esp), %eax
666 // The former is 2 bytes shorter. In case where the increment is 1, then
667 // the saving can be 4 bytes (by using incl %eax).
668 if (auto *Imm
= dyn_cast
<ConstantSDNode
>(Op1
)) {
669 if (Imm
->getAPIntValue().isSignedIntN(8))
672 // If this is a 64-bit AND with an immediate that fits in 32-bits,
673 // prefer using the smaller and over folding the load. This is needed to
674 // make sure immediates created by shrinkAndImmediate are always folded.
675 // Ideally we would narrow the load during DAG combine and get the
676 // best of both worlds.
677 if (U
->getOpcode() == ISD::AND
&&
678 Imm
->getAPIntValue().getBitWidth() == 64 &&
679 Imm
->getAPIntValue().isIntN(32))
682 // If this really a zext_inreg that can be represented with a movzx
683 // instruction, prefer that.
684 // TODO: We could shrink the load and fold if it is non-volatile.
685 if (U
->getOpcode() == ISD::AND
&&
686 (Imm
->getAPIntValue() == UINT8_MAX
||
687 Imm
->getAPIntValue() == UINT16_MAX
||
688 Imm
->getAPIntValue() == UINT32_MAX
))
691 // ADD/SUB with can negate the immediate and use the opposite operation
692 // to fit 128 into a sign extended 8 bit immediate.
693 if ((U
->getOpcode() == ISD::ADD
|| U
->getOpcode() == ISD::SUB
) &&
694 (-Imm
->getAPIntValue()).isSignedIntN(8))
697 if ((U
->getOpcode() == X86ISD::ADD
|| U
->getOpcode() == X86ISD::SUB
) &&
698 (-Imm
->getAPIntValue()).isSignedIntN(8) &&
699 hasNoCarryFlagUses(SDValue(U
, 1)))
703 // If the other operand is a TLS address, we should fold it instead.
706 // leal i@NTPOFF(%eax), %eax
708 // movl $i@NTPOFF, %eax
710 // if the block also has an access to a second TLS address this will save
712 // FIXME: This is probably also true for non-TLS addresses.
713 if (Op1
.getOpcode() == X86ISD::Wrapper
) {
714 SDValue Val
= Op1
.getOperand(0);
715 if (Val
.getOpcode() == ISD::TargetGlobalTLSAddress
)
719 // Don't fold load if this matches the BTS/BTR/BTC patterns.
720 // BTS: (or X, (shl 1, n))
721 // BTR: (and X, (rotl -2, n))
722 // BTC: (xor X, (shl 1, n))
723 if (U
->getOpcode() == ISD::OR
|| U
->getOpcode() == ISD::XOR
) {
724 if (U
->getOperand(0).getOpcode() == ISD::SHL
&&
725 isOneConstant(U
->getOperand(0).getOperand(0)))
728 if (U
->getOperand(1).getOpcode() == ISD::SHL
&&
729 isOneConstant(U
->getOperand(1).getOperand(0)))
732 if (U
->getOpcode() == ISD::AND
) {
733 SDValue U0
= U
->getOperand(0);
734 SDValue U1
= U
->getOperand(1);
735 if (U0
.getOpcode() == ISD::ROTL
) {
736 auto *C
= dyn_cast
<ConstantSDNode
>(U0
.getOperand(0));
737 if (C
&& C
->getSExtValue() == -2)
741 if (U1
.getOpcode() == ISD::ROTL
) {
742 auto *C
= dyn_cast
<ConstantSDNode
>(U1
.getOperand(0));
743 if (C
&& C
->getSExtValue() == -2)
753 // Don't fold a load into a shift by immediate. The BMI2 instructions
754 // support folding a load, but not an immediate. The legacy instructions
755 // support folding an immediate, but can't fold a load. Folding an
756 // immediate is preferable to folding a load.
757 if (isa
<ConstantSDNode
>(U
->getOperand(1)))
764 // Prevent folding a load if this can implemented with an insert_subreg or
765 // a move that implicitly zeroes.
766 if (Root
->getOpcode() == ISD::INSERT_SUBVECTOR
&&
767 isNullConstant(Root
->getOperand(2)) &&
768 (Root
->getOperand(0).isUndef() ||
769 ISD::isBuildVectorAllZeros(Root
->getOperand(0).getNode())))
775 // Indicates it is profitable to form an AVX512 masked operation. Returning
776 // false will favor a masked register-register masked move or vblendm and the
777 // operation will be selected separately.
778 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode
*N
) const {
780 (N
->getOpcode() == ISD::VSELECT
|| N
->getOpcode() == X86ISD::SELECTS
) &&
781 "Unexpected opcode!");
783 // If the operation has additional users, the operation will be duplicated.
784 // Check the use count to prevent that.
785 // FIXME: Are there cheap opcodes we might want to duplicate?
786 return N
->getOperand(1).hasOneUse();
789 /// Replace the original chain operand of the call with
790 /// load's chain operand and move load below the call's chain operand.
791 static void moveBelowOrigChain(SelectionDAG
*CurDAG
, SDValue Load
,
792 SDValue Call
, SDValue OrigChain
) {
793 SmallVector
<SDValue
, 8> Ops
;
794 SDValue Chain
= OrigChain
.getOperand(0);
795 if (Chain
.getNode() == Load
.getNode())
796 Ops
.push_back(Load
.getOperand(0));
798 assert(Chain
.getOpcode() == ISD::TokenFactor
&&
799 "Unexpected chain operand");
800 for (unsigned i
= 0, e
= Chain
.getNumOperands(); i
!= e
; ++i
)
801 if (Chain
.getOperand(i
).getNode() == Load
.getNode())
802 Ops
.push_back(Load
.getOperand(0));
804 Ops
.push_back(Chain
.getOperand(i
));
806 CurDAG
->getNode(ISD::TokenFactor
, SDLoc(Load
), MVT::Other
, Ops
);
808 Ops
.push_back(NewChain
);
810 Ops
.append(OrigChain
->op_begin() + 1, OrigChain
->op_end());
811 CurDAG
->UpdateNodeOperands(OrigChain
.getNode(), Ops
);
812 CurDAG
->UpdateNodeOperands(Load
.getNode(), Call
.getOperand(0),
813 Load
.getOperand(1), Load
.getOperand(2));
816 Ops
.push_back(SDValue(Load
.getNode(), 1));
817 Ops
.append(Call
->op_begin() + 1, Call
->op_end());
818 CurDAG
->UpdateNodeOperands(Call
.getNode(), Ops
);
821 /// Return true if call address is a load and it can be
822 /// moved below CALLSEQ_START and the chains leading up to the call.
823 /// Return the CALLSEQ_START by reference as a second output.
824 /// In the case of a tail call, there isn't a callseq node between the call
825 /// chain and the load.
826 static bool isCalleeLoad(SDValue Callee
, SDValue
&Chain
, bool HasCallSeq
) {
827 // The transformation is somewhat dangerous if the call's chain was glued to
828 // the call. After MoveBelowOrigChain the load is moved between the call and
829 // the chain, this can create a cycle if the load is not folded. So it is
830 // *really* important that we are sure the load will be folded.
831 if (Callee
.getNode() == Chain
.getNode() || !Callee
.hasOneUse())
833 auto *LD
= dyn_cast
<LoadSDNode
>(Callee
.getNode());
836 LD
->getAddressingMode() != ISD::UNINDEXED
||
837 LD
->getExtensionType() != ISD::NON_EXTLOAD
)
840 // Now let's find the callseq_start.
841 while (HasCallSeq
&& Chain
.getOpcode() != ISD::CALLSEQ_START
) {
842 if (!Chain
.hasOneUse())
844 Chain
= Chain
.getOperand(0);
847 if (!Chain
.getNumOperands())
849 // Since we are not checking for AA here, conservatively abort if the chain
850 // writes to memory. It's not safe to move the callee (a load) across a store.
851 if (isa
<MemSDNode
>(Chain
.getNode()) &&
852 cast
<MemSDNode
>(Chain
.getNode())->writeMem())
854 if (Chain
.getOperand(0).getNode() == Callee
.getNode())
856 if (Chain
.getOperand(0).getOpcode() == ISD::TokenFactor
&&
857 Callee
.getValue(1).isOperandOf(Chain
.getOperand(0).getNode()) &&
858 Callee
.getValue(1).hasOneUse())
863 static bool isEndbrImm64(uint64_t Imm
) {
864 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA.
865 // i.g: 0xF3660F1EFA, 0xF3670F1EFA
866 if ((Imm
& 0x00FFFFFF) != 0x0F1EFA)
869 uint8_t OptionalPrefixBytes
[] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
870 0x65, 0x66, 0x67, 0xf0, 0xf2};
871 int i
= 24; // 24bit 0x0F1EFA has matched
873 uint8_t Byte
= (Imm
>> i
) & 0xFF;
876 if (!llvm::is_contained(OptionalPrefixBytes
, Byte
))
884 static bool needBWI(MVT VT
) {
885 return (VT
== MVT::v32i16
|| VT
== MVT::v32f16
|| VT
== MVT::v64i8
);
888 void X86DAGToDAGISel::PreprocessISelDAG() {
889 bool MadeChange
= false;
890 for (SelectionDAG::allnodes_iterator I
= CurDAG
->allnodes_begin(),
891 E
= CurDAG
->allnodes_end(); I
!= E
; ) {
892 SDNode
*N
= &*I
++; // Preincrement iterator to avoid invalidation issues.
894 // This is for CET enhancement.
896 // ENDBR32 and ENDBR64 have specific opcodes:
897 // ENDBR32: F3 0F 1E FB
898 // ENDBR64: F3 0F 1E FA
899 // And we want that attackers won’t find unintended ENDBR32/64
900 // opcode matches in the binary
901 // Here’s an example:
902 // If the compiler had to generate asm for the following code:
904 // it could, for example, generate:
905 // mov 0xF30F1EFA, dword ptr[a]
906 // In such a case, the binary would include a gadget that starts
907 // with a fake ENDBR64 opcode. Therefore, we split such generation
908 // into multiple operations, let it not shows in the binary
909 if (N
->getOpcode() == ISD::Constant
) {
910 MVT VT
= N
->getSimpleValueType(0);
911 int64_t Imm
= cast
<ConstantSDNode
>(N
)->getSExtValue();
912 int32_t EndbrImm
= Subtarget
->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
913 if (Imm
== EndbrImm
|| isEndbrImm64(Imm
)) {
914 // Check that the cf-protection-branch is enabled.
915 Metadata
*CFProtectionBranch
=
916 MF
->getMMI().getModule()->getModuleFlag("cf-protection-branch");
917 if (CFProtectionBranch
|| IndirectBranchTracking
) {
919 SDValue Complement
= CurDAG
->getConstant(~Imm
, dl
, VT
, false, true);
920 Complement
= CurDAG
->getNOT(dl
, Complement
, VT
);
922 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Complement
);
930 // If this is a target specific AND node with no flag usages, turn it back
931 // into ISD::AND to enable test instruction matching.
932 if (N
->getOpcode() == X86ISD::AND
&& !N
->hasAnyUseOfValue(1)) {
933 SDValue Res
= CurDAG
->getNode(ISD::AND
, SDLoc(N
), N
->getValueType(0),
934 N
->getOperand(0), N
->getOperand(1));
936 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Res
);
942 // Convert vector increment or decrement to sub/add with an all-ones
944 // add X, <1, 1...> --> sub X, <-1, -1...>
945 // sub X, <1, 1...> --> add X, <-1, -1...>
946 // The all-ones vector constant can be materialized using a pcmpeq
947 // instruction that is commonly recognized as an idiom (has no register
948 // dependency), so that's better/smaller than loading a splat 1 constant.
950 // But don't do this if it would inhibit a potentially profitable load
951 // folding opportunity for the other operand. That only occurs with the
953 // (1) The other operand (op0) is load foldable.
954 // (2) The op is an add (otherwise, we are *creating* an add and can still
955 // load fold the other op).
956 // (3) The target has AVX (otherwise, we have a destructive add and can't
957 // load fold the other op without killing the constant op).
958 // (4) The constant 1 vector has multiple uses (so it is profitable to load
959 // into a register anyway).
960 auto mayPreventLoadFold
= [&]() {
961 return X86::mayFoldLoad(N
->getOperand(0), *Subtarget
) &&
962 N
->getOpcode() == ISD::ADD
&& Subtarget
->hasAVX() &&
963 !N
->getOperand(1).hasOneUse();
965 if ((N
->getOpcode() == ISD::ADD
|| N
->getOpcode() == ISD::SUB
) &&
966 N
->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
968 if (X86::isConstantSplat(N
->getOperand(1), SplatVal
) &&
972 MVT VT
= N
->getSimpleValueType(0);
973 unsigned NumElts
= VT
.getSizeInBits() / 32;
975 CurDAG
->getAllOnesConstant(DL
, MVT::getVectorVT(MVT::i32
, NumElts
));
976 AllOnes
= CurDAG
->getBitcast(VT
, AllOnes
);
978 unsigned NewOpcode
= N
->getOpcode() == ISD::ADD
? ISD::SUB
: ISD::ADD
;
980 CurDAG
->getNode(NewOpcode
, DL
, VT
, N
->getOperand(0), AllOnes
);
982 CurDAG
->ReplaceAllUsesWith(N
, Res
.getNode());
989 switch (N
->getOpcode()) {
990 case X86ISD::VBROADCAST
: {
991 MVT VT
= N
->getSimpleValueType(0);
992 // Emulate v32i16/v64i8 broadcast without BWI.
993 if (!Subtarget
->hasBWI() && needBWI(VT
)) {
994 MVT NarrowVT
= VT
.getHalfNumVectorElementsVT();
996 SDValue NarrowBCast
=
997 CurDAG
->getNode(X86ISD::VBROADCAST
, dl
, NarrowVT
, N
->getOperand(0));
999 CurDAG
->getNode(ISD::INSERT_SUBVECTOR
, dl
, VT
, CurDAG
->getUNDEF(VT
),
1000 NarrowBCast
, CurDAG
->getIntPtrConstant(0, dl
));
1001 unsigned Index
= NarrowVT
.getVectorMinNumElements();
1002 Res
= CurDAG
->getNode(ISD::INSERT_SUBVECTOR
, dl
, VT
, Res
, NarrowBCast
,
1003 CurDAG
->getIntPtrConstant(Index
, dl
));
1006 CurDAG
->ReplaceAllUsesWith(N
, Res
.getNode());
1014 case X86ISD::VBROADCAST_LOAD
: {
1015 MVT VT
= N
->getSimpleValueType(0);
1016 // Emulate v32i16/v64i8 broadcast without BWI.
1017 if (!Subtarget
->hasBWI() && needBWI(VT
)) {
1018 MVT NarrowVT
= VT
.getHalfNumVectorElementsVT();
1019 auto *MemNode
= cast
<MemSDNode
>(N
);
1021 SDVTList VTs
= CurDAG
->getVTList(NarrowVT
, MVT::Other
);
1022 SDValue Ops
[] = {MemNode
->getChain(), MemNode
->getBasePtr()};
1023 SDValue NarrowBCast
= CurDAG
->getMemIntrinsicNode(
1024 X86ISD::VBROADCAST_LOAD
, dl
, VTs
, Ops
, MemNode
->getMemoryVT(),
1025 MemNode
->getMemOperand());
1027 CurDAG
->getNode(ISD::INSERT_SUBVECTOR
, dl
, VT
, CurDAG
->getUNDEF(VT
),
1028 NarrowBCast
, CurDAG
->getIntPtrConstant(0, dl
));
1029 unsigned Index
= NarrowVT
.getVectorMinNumElements();
1030 Res
= CurDAG
->getNode(ISD::INSERT_SUBVECTOR
, dl
, VT
, Res
, NarrowBCast
,
1031 CurDAG
->getIntPtrConstant(Index
, dl
));
1034 SDValue To
[] = {Res
, NarrowBCast
.getValue(1)};
1035 CurDAG
->ReplaceAllUsesWith(N
, To
);
1044 // If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1045 // load, then just extract the lower subvector and avoid the second load.
1046 auto *Ld
= cast
<LoadSDNode
>(N
);
1047 MVT VT
= N
->getSimpleValueType(0);
1048 if (!ISD::isNormalLoad(Ld
) || !Ld
->isSimple() ||
1049 !(VT
.is128BitVector() || VT
.is256BitVector()))
1053 SDNode
*MaxLd
= nullptr;
1054 SDValue Ptr
= Ld
->getBasePtr();
1055 SDValue Chain
= Ld
->getChain();
1056 for (SDNode
*User
: Ptr
->uses()) {
1057 auto *UserLd
= dyn_cast
<LoadSDNode
>(User
);
1058 MVT UserVT
= User
->getSimpleValueType(0);
1059 if (User
!= N
&& UserLd
&& ISD::isNormalLoad(User
) &&
1060 UserLd
->getBasePtr() == Ptr
&& UserLd
->getChain() == Chain
&&
1061 !User
->hasAnyUseOfValue(1) &&
1062 (UserVT
.is256BitVector() || UserVT
.is512BitVector()) &&
1063 UserVT
.getSizeInBits() > VT
.getSizeInBits() &&
1064 (!MaxLd
|| UserVT
.getSizeInBits() > MaxVT
.getSizeInBits())) {
1071 unsigned NumSubElts
= VT
.getSizeInBits() / MaxVT
.getScalarSizeInBits();
1072 MVT SubVT
= MVT::getVectorVT(MaxVT
.getScalarType(), NumSubElts
);
1073 SDValue Extract
= CurDAG
->getNode(ISD::EXTRACT_SUBVECTOR
, dl
, SubVT
,
1075 CurDAG
->getIntPtrConstant(0, dl
));
1076 SDValue Res
= CurDAG
->getBitcast(VT
, Extract
);
1079 SDValue To
[] = {Res
, SDValue(MaxLd
, 1)};
1080 CurDAG
->ReplaceAllUsesWith(N
, To
);
1087 case ISD::VSELECT
: {
1088 // Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
1089 EVT EleVT
= N
->getOperand(0).getValueType().getVectorElementType();
1090 if (EleVT
== MVT::i1
)
1093 assert(Subtarget
->hasSSE41() && "Expected SSE4.1 support!");
1094 assert(N
->getValueType(0).getVectorElementType() != MVT::i16
&&
1095 "We can't replace VSELECT with BLENDV in vXi16!");
1097 if (Subtarget
->hasVLX() && CurDAG
->ComputeNumSignBits(N
->getOperand(0)) ==
1098 EleVT
.getSizeInBits()) {
1099 R
= CurDAG
->getNode(X86ISD::VPTERNLOG
, SDLoc(N
), N
->getValueType(0),
1100 N
->getOperand(0), N
->getOperand(1), N
->getOperand(2),
1101 CurDAG
->getTargetConstant(0xCA, SDLoc(N
), MVT::i8
));
1103 R
= CurDAG
->getNode(X86ISD::BLENDV
, SDLoc(N
), N
->getValueType(0),
1104 N
->getOperand(0), N
->getOperand(1),
1108 CurDAG
->ReplaceAllUsesWith(N
, R
.getNode());
1114 case ISD::STRICT_FP_ROUND
:
1115 case ISD::FP_TO_SINT
:
1116 case ISD::FP_TO_UINT
:
1117 case ISD::STRICT_FP_TO_SINT
:
1118 case ISD::STRICT_FP_TO_UINT
: {
1119 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1120 // don't need 2 sets of patterns.
1121 if (!N
->getSimpleValueType(0).isVector())
1125 switch (N
->getOpcode()) {
1126 default: llvm_unreachable("Unexpected opcode!");
1127 case ISD::FP_ROUND
: NewOpc
= X86ISD::VFPROUND
; break;
1128 case ISD::STRICT_FP_ROUND
: NewOpc
= X86ISD::STRICT_VFPROUND
; break;
1129 case ISD::STRICT_FP_TO_SINT
: NewOpc
= X86ISD::STRICT_CVTTP2SI
; break;
1130 case ISD::FP_TO_SINT
: NewOpc
= X86ISD::CVTTP2SI
; break;
1131 case ISD::STRICT_FP_TO_UINT
: NewOpc
= X86ISD::STRICT_CVTTP2UI
; break;
1132 case ISD::FP_TO_UINT
: NewOpc
= X86ISD::CVTTP2UI
; break;
1135 if (N
->isStrictFPOpcode())
1137 CurDAG
->getNode(NewOpc
, SDLoc(N
), {N
->getValueType(0), MVT::Other
},
1138 {N
->getOperand(0), N
->getOperand(1)});
1141 CurDAG
->getNode(NewOpc
, SDLoc(N
), N
->getValueType(0),
1144 CurDAG
->ReplaceAllUsesWith(N
, Res
.getNode());
1152 // Replace vector shifts with their X86 specific equivalent so we don't
1153 // need 2 sets of patterns.
1154 if (!N
->getValueType(0).isVector())
1158 switch (N
->getOpcode()) {
1159 default: llvm_unreachable("Unexpected opcode!");
1160 case ISD::SHL
: NewOpc
= X86ISD::VSHLV
; break;
1161 case ISD::SRA
: NewOpc
= X86ISD::VSRAV
; break;
1162 case ISD::SRL
: NewOpc
= X86ISD::VSRLV
; break;
1164 SDValue Res
= CurDAG
->getNode(NewOpc
, SDLoc(N
), N
->getValueType(0),
1165 N
->getOperand(0), N
->getOperand(1));
1167 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Res
);
1172 case ISD::ANY_EXTEND
:
1173 case ISD::ANY_EXTEND_VECTOR_INREG
: {
1174 // Replace vector any extend with the zero extend equivalents so we don't
1175 // need 2 sets of patterns. Ignore vXi1 extensions.
1176 if (!N
->getValueType(0).isVector())
1180 if (N
->getOperand(0).getScalarValueSizeInBits() == 1) {
1181 assert(N
->getOpcode() == ISD::ANY_EXTEND
&&
1182 "Unexpected opcode for mask vector!");
1183 NewOpc
= ISD::SIGN_EXTEND
;
1185 NewOpc
= N
->getOpcode() == ISD::ANY_EXTEND
1187 : ISD::ZERO_EXTEND_VECTOR_INREG
;
1190 SDValue Res
= CurDAG
->getNode(NewOpc
, SDLoc(N
), N
->getValueType(0),
1193 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Res
);
1199 case ISD::STRICT_FCEIL
:
1201 case ISD::STRICT_FFLOOR
:
1203 case ISD::STRICT_FTRUNC
:
1204 case ISD::FROUNDEVEN
:
1205 case ISD::STRICT_FROUNDEVEN
:
1206 case ISD::FNEARBYINT
:
1207 case ISD::STRICT_FNEARBYINT
:
1209 case ISD::STRICT_FRINT
: {
1210 // Replace fp rounding with their X86 specific equivalent so we don't
1211 // need 2 sets of patterns.
1213 switch (N
->getOpcode()) {
1214 default: llvm_unreachable("Unexpected opcode!");
1215 case ISD::STRICT_FCEIL
:
1216 case ISD::FCEIL
: Imm
= 0xA; break;
1217 case ISD::STRICT_FFLOOR
:
1218 case ISD::FFLOOR
: Imm
= 0x9; break;
1219 case ISD::STRICT_FTRUNC
:
1220 case ISD::FTRUNC
: Imm
= 0xB; break;
1221 case ISD::STRICT_FROUNDEVEN
:
1222 case ISD::FROUNDEVEN
: Imm
= 0x8; break;
1223 case ISD::STRICT_FNEARBYINT
:
1224 case ISD::FNEARBYINT
: Imm
= 0xC; break;
1225 case ISD::STRICT_FRINT
:
1226 case ISD::FRINT
: Imm
= 0x4; break;
1229 bool IsStrict
= N
->isStrictFPOpcode();
1232 Res
= CurDAG
->getNode(X86ISD::STRICT_VRNDSCALE
, dl
,
1233 {N
->getValueType(0), MVT::Other
},
1234 {N
->getOperand(0), N
->getOperand(1),
1235 CurDAG
->getTargetConstant(Imm
, dl
, MVT::i32
)});
1237 Res
= CurDAG
->getNode(X86ISD::VRNDSCALE
, dl
, N
->getValueType(0),
1239 CurDAG
->getTargetConstant(Imm
, dl
, MVT::i32
));
1241 CurDAG
->ReplaceAllUsesWith(N
, Res
.getNode());
1249 case X86ISD::FXOR
: {
1250 // Widen scalar fp logic ops to vector to reduce isel patterns.
1251 // FIXME: Can we do this during lowering/combine.
1252 MVT VT
= N
->getSimpleValueType(0);
1253 if (VT
.isVector() || VT
== MVT::f128
)
1256 MVT VecVT
= VT
== MVT::f64
? MVT::v2f64
1257 : VT
== MVT::f32
? MVT::v4f32
1261 SDValue Op0
= CurDAG
->getNode(ISD::SCALAR_TO_VECTOR
, dl
, VecVT
,
1263 SDValue Op1
= CurDAG
->getNode(ISD::SCALAR_TO_VECTOR
, dl
, VecVT
,
1267 if (Subtarget
->hasSSE2()) {
1268 EVT IntVT
= EVT(VecVT
).changeVectorElementTypeToInteger();
1269 Op0
= CurDAG
->getNode(ISD::BITCAST
, dl
, IntVT
, Op0
);
1270 Op1
= CurDAG
->getNode(ISD::BITCAST
, dl
, IntVT
, Op1
);
1272 switch (N
->getOpcode()) {
1273 default: llvm_unreachable("Unexpected opcode!");
1274 case X86ISD::FANDN
: Opc
= X86ISD::ANDNP
; break;
1275 case X86ISD::FAND
: Opc
= ISD::AND
; break;
1276 case X86ISD::FOR
: Opc
= ISD::OR
; break;
1277 case X86ISD::FXOR
: Opc
= ISD::XOR
; break;
1279 Res
= CurDAG
->getNode(Opc
, dl
, IntVT
, Op0
, Op1
);
1280 Res
= CurDAG
->getNode(ISD::BITCAST
, dl
, VecVT
, Res
);
1282 Res
= CurDAG
->getNode(N
->getOpcode(), dl
, VecVT
, Op0
, Op1
);
1284 Res
= CurDAG
->getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, VT
, Res
,
1285 CurDAG
->getIntPtrConstant(0, dl
));
1287 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Res
);
1294 if (OptLevel
!= CodeGenOptLevel::None
&&
1295 // Only do this when the target can fold the load into the call or
1297 !Subtarget
->useIndirectThunkCalls() &&
1298 ((N
->getOpcode() == X86ISD::CALL
&& !Subtarget
->slowTwoMemOps()) ||
1299 (N
->getOpcode() == X86ISD::TC_RETURN
&&
1300 (Subtarget
->is64Bit() ||
1301 !getTargetMachine().isPositionIndependent())))) {
1302 /// Also try moving call address load from outside callseq_start to just
1303 /// before the call to allow it to be folded.
1313 ///[CALLSEQ_START] |
1321 bool HasCallSeq
= N
->getOpcode() == X86ISD::CALL
;
1322 SDValue Chain
= N
->getOperand(0);
1323 SDValue Load
= N
->getOperand(1);
1324 if (!isCalleeLoad(Load
, Chain
, HasCallSeq
))
1326 moveBelowOrigChain(CurDAG
, Load
, SDValue(N
, 0), Chain
);
1332 // Lower fpround and fpextend nodes that target the FP stack to be store and
1333 // load to the stack. This is a gross hack. We would like to simply mark
1334 // these as being illegal, but when we do that, legalize produces these when
1335 // it expands calls, then expands these in the same legalize pass. We would
1336 // like dag combine to be able to hack on these between the call expansion
1337 // and the node legalization. As such this pass basically does "really
1338 // late" legalization of these inline with the X86 isel pass.
1339 // FIXME: This should only happen when not compiled with -O0.
1340 switch (N
->getOpcode()) {
1343 case ISD::FP_EXTEND
:
1345 MVT SrcVT
= N
->getOperand(0).getSimpleValueType();
1346 MVT DstVT
= N
->getSimpleValueType(0);
1348 // If any of the sources are vectors, no fp stack involved.
1349 if (SrcVT
.isVector() || DstVT
.isVector())
1352 // If the source and destination are SSE registers, then this is a legal
1353 // conversion that should not be lowered.
1354 const X86TargetLowering
*X86Lowering
=
1355 static_cast<const X86TargetLowering
*>(TLI
);
1356 bool SrcIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(SrcVT
);
1357 bool DstIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(DstVT
);
1358 if (SrcIsSSE
&& DstIsSSE
)
1361 if (!SrcIsSSE
&& !DstIsSSE
) {
1362 // If this is an FPStack extension, it is a noop.
1363 if (N
->getOpcode() == ISD::FP_EXTEND
)
1365 // If this is a value-preserving FPStack truncation, it is a noop.
1366 if (N
->getConstantOperandVal(1))
1370 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1371 // FPStack has extload and truncstore. SSE can fold direct loads into other
1372 // operations. Based on this, decide what we want to do.
1373 MVT MemVT
= (N
->getOpcode() == ISD::FP_ROUND
) ? DstVT
: SrcVT
;
1374 SDValue MemTmp
= CurDAG
->CreateStackTemporary(MemVT
);
1375 int SPFI
= cast
<FrameIndexSDNode
>(MemTmp
)->getIndex();
1376 MachinePointerInfo MPI
=
1377 MachinePointerInfo::getFixedStack(CurDAG
->getMachineFunction(), SPFI
);
1380 // FIXME: optimize the case where the src/dest is a load or store?
1382 SDValue Store
= CurDAG
->getTruncStore(
1383 CurDAG
->getEntryNode(), dl
, N
->getOperand(0), MemTmp
, MPI
, MemVT
);
1384 SDValue Result
= CurDAG
->getExtLoad(ISD::EXTLOAD
, dl
, DstVT
, Store
,
1385 MemTmp
, MPI
, MemVT
);
1387 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1388 // extload we created. This will cause general havok on the dag because
1389 // anything below the conversion could be folded into other existing nodes.
1390 // To avoid invalidating 'I', back it up to the convert node.
1392 CurDAG
->ReplaceAllUsesOfValueWith(SDValue(N
, 0), Result
);
1396 //The sequence of events for lowering STRICT_FP versions of these nodes requires
1397 //dealing with the chain differently, as there is already a preexisting chain.
1398 case ISD::STRICT_FP_ROUND
:
1399 case ISD::STRICT_FP_EXTEND
:
1401 MVT SrcVT
= N
->getOperand(1).getSimpleValueType();
1402 MVT DstVT
= N
->getSimpleValueType(0);
1404 // If any of the sources are vectors, no fp stack involved.
1405 if (SrcVT
.isVector() || DstVT
.isVector())
1408 // If the source and destination are SSE registers, then this is a legal
1409 // conversion that should not be lowered.
1410 const X86TargetLowering
*X86Lowering
=
1411 static_cast<const X86TargetLowering
*>(TLI
);
1412 bool SrcIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(SrcVT
);
1413 bool DstIsSSE
= X86Lowering
->isScalarFPTypeInSSEReg(DstVT
);
1414 if (SrcIsSSE
&& DstIsSSE
)
1417 if (!SrcIsSSE
&& !DstIsSSE
) {
1418 // If this is an FPStack extension, it is a noop.
1419 if (N
->getOpcode() == ISD::STRICT_FP_EXTEND
)
1421 // If this is a value-preserving FPStack truncation, it is a noop.
1422 if (N
->getConstantOperandVal(2))
1426 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1427 // FPStack has extload and truncstore. SSE can fold direct loads into other
1428 // operations. Based on this, decide what we want to do.
1429 MVT MemVT
= (N
->getOpcode() == ISD::STRICT_FP_ROUND
) ? DstVT
: SrcVT
;
1430 SDValue MemTmp
= CurDAG
->CreateStackTemporary(MemVT
);
1431 int SPFI
= cast
<FrameIndexSDNode
>(MemTmp
)->getIndex();
1432 MachinePointerInfo MPI
=
1433 MachinePointerInfo::getFixedStack(CurDAG
->getMachineFunction(), SPFI
);
1436 // FIXME: optimize the case where the src/dest is a load or store?
1438 //Since the operation is StrictFP, use the preexisting chain.
1439 SDValue Store
, Result
;
1441 SDVTList VTs
= CurDAG
->getVTList(MVT::Other
);
1442 SDValue Ops
[] = {N
->getOperand(0), N
->getOperand(1), MemTmp
};
1443 Store
= CurDAG
->getMemIntrinsicNode(X86ISD::FST
, dl
, VTs
, Ops
, MemVT
,
1444 MPI
, /*Align*/ std::nullopt
,
1445 MachineMemOperand::MOStore
);
1446 if (N
->getFlags().hasNoFPExcept()) {
1447 SDNodeFlags Flags
= Store
->getFlags();
1448 Flags
.setNoFPExcept(true);
1449 Store
->setFlags(Flags
);
1452 assert(SrcVT
== MemVT
&& "Unexpected VT!");
1453 Store
= CurDAG
->getStore(N
->getOperand(0), dl
, N
->getOperand(1), MemTmp
,
1458 SDVTList VTs
= CurDAG
->getVTList(DstVT
, MVT::Other
);
1459 SDValue Ops
[] = {Store
, MemTmp
};
1460 Result
= CurDAG
->getMemIntrinsicNode(
1461 X86ISD::FLD
, dl
, VTs
, Ops
, MemVT
, MPI
,
1462 /*Align*/ std::nullopt
, MachineMemOperand::MOLoad
);
1463 if (N
->getFlags().hasNoFPExcept()) {
1464 SDNodeFlags Flags
= Result
->getFlags();
1465 Flags
.setNoFPExcept(true);
1466 Result
->setFlags(Flags
);
1469 assert(DstVT
== MemVT
&& "Unexpected VT!");
1470 Result
= CurDAG
->getLoad(DstVT
, dl
, Store
, MemTmp
, MPI
);
1473 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1474 // extload we created. This will cause general havok on the dag because
1475 // anything below the conversion could be folded into other existing nodes.
1476 // To avoid invalidating 'I', back it up to the convert node.
1478 CurDAG
->ReplaceAllUsesWith(N
, Result
.getNode());
1484 // Now that we did that, the node is dead. Increment the iterator to the
1485 // next node to process, then delete N.
1490 // Remove any dead nodes that may have been left behind.
1492 CurDAG
->RemoveDeadNodes();
1495 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1496 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode
*N
) {
1497 unsigned Opc
= N
->getMachineOpcode();
1498 if (Opc
!= X86::MOVZX32rr8
&& Opc
!= X86::MOVSX32rr8
&&
1499 Opc
!= X86::MOVSX64rr8
)
1502 SDValue N0
= N
->getOperand(0);
1504 // We need to be extracting the lower bit of an extend.
1505 if (!N0
.isMachineOpcode() ||
1506 N0
.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG
||
1507 N0
.getConstantOperandVal(1) != X86::sub_8bit
)
1510 // We're looking for either a movsx or movzx to match the original opcode.
1511 unsigned ExpectedOpc
= Opc
== X86::MOVZX32rr8
? X86::MOVZX32rr8_NOREX
1512 : X86::MOVSX32rr8_NOREX
;
1513 SDValue N00
= N0
.getOperand(0);
1514 if (!N00
.isMachineOpcode() || N00
.getMachineOpcode() != ExpectedOpc
)
1517 if (Opc
== X86::MOVSX64rr8
) {
1518 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1520 MachineSDNode
*Extend
= CurDAG
->getMachineNode(X86::MOVSX64rr32
, SDLoc(N
),
1522 ReplaceUses(N
, Extend
);
1524 // Ok we can drop this extend and just use the original extend.
1525 ReplaceUses(N
, N00
.getNode());
1531 void X86DAGToDAGISel::PostprocessISelDAG() {
1532 // Skip peepholes at -O0.
1533 if (TM
.getOptLevel() == CodeGenOptLevel::None
)
1536 SelectionDAG::allnodes_iterator Position
= CurDAG
->allnodes_end();
1538 bool MadeChange
= false;
1539 while (Position
!= CurDAG
->allnodes_begin()) {
1540 SDNode
*N
= &*--Position
;
1541 // Skip dead nodes and any non-machine opcodes.
1542 if (N
->use_empty() || !N
->isMachineOpcode())
1545 if (tryOptimizeRem8Extend(N
)) {
1550 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1551 // the same. Rewrite to remove the AND.
1552 unsigned Opc
= N
->getMachineOpcode();
1553 if ((Opc
== X86::TEST8rr
|| Opc
== X86::TEST16rr
||
1554 Opc
== X86::TEST32rr
|| Opc
== X86::TEST64rr
) &&
1555 N
->getOperand(0) == N
->getOperand(1) &&
1556 N
->getOperand(0)->hasNUsesOfValue(2, N
->getOperand(0).getResNo()) &&
1557 N
->getOperand(0).isMachineOpcode()) {
1558 SDValue And
= N
->getOperand(0);
1559 unsigned N0Opc
= And
.getMachineOpcode();
1560 if ((N0Opc
== X86::AND8rr
|| N0Opc
== X86::AND16rr
||
1561 N0Opc
== X86::AND32rr
|| N0Opc
== X86::AND64rr
) &&
1562 !And
->hasAnyUseOfValue(1)) {
1563 MachineSDNode
*Test
= CurDAG
->getMachineNode(Opc
, SDLoc(N
),
1567 ReplaceUses(N
, Test
);
1571 if ((N0Opc
== X86::AND8rm
|| N0Opc
== X86::AND16rm
||
1572 N0Opc
== X86::AND32rm
|| N0Opc
== X86::AND64rm
) &&
1573 !And
->hasAnyUseOfValue(1)) {
1576 case X86::AND8rm
: NewOpc
= X86::TEST8mr
; break;
1577 case X86::AND16rm
: NewOpc
= X86::TEST16mr
; break;
1578 case X86::AND32rm
: NewOpc
= X86::TEST32mr
; break;
1579 case X86::AND64rm
: NewOpc
= X86::TEST64mr
; break;
1582 // Need to swap the memory and register operand.
1583 SDValue Ops
[] = { And
.getOperand(1),
1589 And
.getOperand(6) /* Chain */ };
1590 MachineSDNode
*Test
= CurDAG
->getMachineNode(NewOpc
, SDLoc(N
),
1591 MVT::i32
, MVT::Other
, Ops
);
1592 CurDAG
->setNodeMemRefs(
1593 Test
, cast
<MachineSDNode
>(And
.getNode())->memoperands());
1594 ReplaceUses(And
.getValue(2), SDValue(Test
, 1));
1595 ReplaceUses(SDValue(N
, 0), SDValue(Test
, 0));
1601 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1602 // used. We're doing this late so we can prefer to fold the AND into masked
1603 // comparisons. Doing that can be better for the live range of the mask
1605 if ((Opc
== X86::KORTESTBrr
|| Opc
== X86::KORTESTWrr
||
1606 Opc
== X86::KORTESTDrr
|| Opc
== X86::KORTESTQrr
) &&
1607 N
->getOperand(0) == N
->getOperand(1) &&
1608 N
->isOnlyUserOf(N
->getOperand(0).getNode()) &&
1609 N
->getOperand(0).isMachineOpcode() &&
1610 onlyUsesZeroFlag(SDValue(N
, 0))) {
1611 SDValue And
= N
->getOperand(0);
1612 unsigned N0Opc
= And
.getMachineOpcode();
1613 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1614 // KAND instructions and KTEST use the same ISA feature.
1615 if (N0Opc
== X86::KANDBrr
||
1616 (N0Opc
== X86::KANDWrr
&& Subtarget
->hasDQI()) ||
1617 N0Opc
== X86::KANDDrr
|| N0Opc
== X86::KANDQrr
) {
1620 default: llvm_unreachable("Unexpected opcode!");
1621 case X86::KORTESTBrr
: NewOpc
= X86::KTESTBrr
; break;
1622 case X86::KORTESTWrr
: NewOpc
= X86::KTESTWrr
; break;
1623 case X86::KORTESTDrr
: NewOpc
= X86::KTESTDrr
; break;
1624 case X86::KORTESTQrr
: NewOpc
= X86::KTESTQrr
; break;
1626 MachineSDNode
*KTest
= CurDAG
->getMachineNode(NewOpc
, SDLoc(N
),
1630 ReplaceUses(N
, KTest
);
1636 // Attempt to remove vectors moves that were inserted to zero upper bits.
1637 if (Opc
!= TargetOpcode::SUBREG_TO_REG
)
1640 unsigned SubRegIdx
= N
->getConstantOperandVal(2);
1641 if (SubRegIdx
!= X86::sub_xmm
&& SubRegIdx
!= X86::sub_ymm
)
1644 SDValue Move
= N
->getOperand(1);
1645 if (!Move
.isMachineOpcode())
1648 // Make sure its one of the move opcodes we recognize.
1649 switch (Move
.getMachineOpcode()) {
1652 case X86::VMOVAPDrr
: case X86::VMOVUPDrr
:
1653 case X86::VMOVAPSrr
: case X86::VMOVUPSrr
:
1654 case X86::VMOVDQArr
: case X86::VMOVDQUrr
:
1655 case X86::VMOVAPDYrr
: case X86::VMOVUPDYrr
:
1656 case X86::VMOVAPSYrr
: case X86::VMOVUPSYrr
:
1657 case X86::VMOVDQAYrr
: case X86::VMOVDQUYrr
:
1658 case X86::VMOVAPDZ128rr
: case X86::VMOVUPDZ128rr
:
1659 case X86::VMOVAPSZ128rr
: case X86::VMOVUPSZ128rr
:
1660 case X86::VMOVDQA32Z128rr
: case X86::VMOVDQU32Z128rr
:
1661 case X86::VMOVDQA64Z128rr
: case X86::VMOVDQU64Z128rr
:
1662 case X86::VMOVAPDZ256rr
: case X86::VMOVUPDZ256rr
:
1663 case X86::VMOVAPSZ256rr
: case X86::VMOVUPSZ256rr
:
1664 case X86::VMOVDQA32Z256rr
: case X86::VMOVDQU32Z256rr
:
1665 case X86::VMOVDQA64Z256rr
: case X86::VMOVDQU64Z256rr
:
1669 SDValue In
= Move
.getOperand(0);
1670 if (!In
.isMachineOpcode() ||
1671 In
.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END
)
1674 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1675 // the SHA instructions which use a legacy encoding.
1676 uint64_t TSFlags
= getInstrInfo()->get(In
.getMachineOpcode()).TSFlags
;
1677 if ((TSFlags
& X86II::EncodingMask
) != X86II::VEX
&&
1678 (TSFlags
& X86II::EncodingMask
) != X86II::EVEX
&&
1679 (TSFlags
& X86II::EncodingMask
) != X86II::XOP
)
1682 // Producing instruction is another vector instruction. We can drop the
1684 CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0), In
, N
->getOperand(2));
1689 CurDAG
->RemoveDeadNodes();
1693 /// Emit any code that needs to be executed only in the main function.
1694 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1695 if (Subtarget
->isTargetCygMing()) {
1696 TargetLowering::ArgListTy Args
;
1697 auto &DL
= CurDAG
->getDataLayout();
1699 TargetLowering::CallLoweringInfo
CLI(*CurDAG
);
1700 CLI
.setChain(CurDAG
->getRoot())
1701 .setCallee(CallingConv::C
, Type::getVoidTy(*CurDAG
->getContext()),
1702 CurDAG
->getExternalSymbol("__main", TLI
->getPointerTy(DL
)),
1704 const TargetLowering
&TLI
= CurDAG
->getTargetLoweringInfo();
1705 std::pair
<SDValue
, SDValue
> Result
= TLI
.LowerCallTo(CLI
);
1706 CurDAG
->setRoot(Result
.second
);
1710 void X86DAGToDAGISel::emitFunctionEntryCode() {
1711 // If this is main, emit special code for main.
1712 const Function
&F
= MF
->getFunction();
1713 if (F
.hasExternalLinkage() && F
.getName() == "main")
1714 emitSpecialCodeForMain();
1717 static bool isDispSafeForFrameIndex(int64_t Val
) {
1718 // On 64-bit platforms, we can run into an issue where a frame index
1719 // includes a displacement that, when added to the explicit displacement,
1720 // will overflow the displacement field. Assuming that the frame index
1721 // displacement fits into a 31-bit integer (which is only slightly more
1722 // aggressive than the current fundamental assumption that it fits into
1723 // a 32-bit integer), a 31-bit disp should always be safe.
1724 return isInt
<31>(Val
);
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // We may have already matched a displacement and the caller just added the
  // symbolic displacement. So we still need to do the checks even if Offset
  // is zero.

  int64_t Val = AM.Disp + Offset;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (Val != 0 && (AM.ES || AM.MCSym))
    return true;

  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (Val != 0 &&
        !X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
    // 64 bits. Instructions with 32-bit register addresses perform this zero
    // extension for us and we can safely ignore the high bits of Offset.
    // Instructions with only a 32-bit immediate address do not, though: they
    // sign extend instead. This means only the low 2GB of address space is
    // directly addressable; we need indirect addressing for the high 2GB of
    // address space.
    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
    // implicit zero extension of instructions would cover up any problem.
    // However, we have asserts elsewhere that get triggered if we do, so keep
    // the checks for now.
    // TODO: We would actually be able to accept these, as well as the same
    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
    // to get an address size override to be emitted. However, this
    // pseudo-register is not part of any register class and therefore causes
    // MIR verification to fail.
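    // For example, an ILP32 pointer value of 0x80000000 is zero-extended to
    // 0x0000000080000000 when used through a 32-bit base register, but as a
    // bare disp32 it would sign-extend to 0xFFFFFFFF80000000 - the wrong
    // address. Hence the isUInt<31> restriction below when there is no base
    // or index register.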
    if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
        !AM.hasBaseOrIndexReg())
      return true;
  }
  AM.Disp = Val;
  return false;
}
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
                                         bool AllowSegmentRegForX32) {
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is generally valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
  // with 32-bit registers, as we get in ILP32 mode, those registers are first
  // zero-extended to 64 bits and then added to the base address, which gives
  // unwanted results when the register holds a negative value.
  // For more information see http://people.redhat.com/drepper/tls.pdf
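  // For example, on i386-linux-gnu the thread pointer lives in %gs and the
  // first word of the TCB points to the TCB itself, so a load from address 0
  // in the GS address space selects directly to "movl %gs:0, %reg".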
  if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
      !IndirectTlsSegRefs &&
      (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetFuchsia())) {
    if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
      return true;
    switch (N->getPointerInfo().getAddrSpace()) {
    case X86AS::GS:
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
      return false;
    case X86AS::FS:
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
      return false;
    // Address space X86AS::SS is not handled here, because it is not used to
    // address TLS areas.
    }
  }

  return true;
}
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
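/// For example, (X86ISD::WrapperRIP (TargetGlobalAddress @g)) is folded into
/// an addressing mode with AM.GV set to @g and %rip as the base register.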
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Alignment = CP->getAlign();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  // Can't use an addressing mode with large globals.
  if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
      TM.isLargeGlobalValue(AM.GV)) {
    AM = Backup;
    return true;
  }

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Make a second attempt to fold a load, if we now know
  // that there will not be any other register. This is only performed for
  // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
  // any foldable load the first time.
  if (Subtarget->isTarget64BitILP32() &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
    SDValue Save_Base_Reg = AM.Base_Reg;
    if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
      AM.Base_Reg = SDValue();
      if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
        AM.Base_Reg = Save_Base_Reg;
    }
  }

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
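  // For example, "leal (,%eax,2), %ecx" needs a SIB byte with no base register
  // and therefore a mandatory 4-byte displacement, while the equivalent
  // "leal (%eax,%eax), %ecx" needs no displacement at all.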
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  if (TM.getCodeModel() != CodeModel::Large &&
      (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
      AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
      AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
  }

  return false;
}
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
                               Depth + 1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    return false;
  }
  N = Handle.getValue();
  return true;
}
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
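// For example, with C1 == 2 the pattern (X >> 6) & 0x3fc becomes
// ((X >> 8) & 0xff) << 2: the (X >> 8) & 0xff piece is an h-register extract
// and the << 2 folds into the addressing mode as a scale of 4.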
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
  SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, Ext);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = (1 << ScaleLog);
  AM.IndexReg = Ext;
  return false;
}
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
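// For example, (X << 2) & 0x3fc becomes (X & 0xff) << 2, and the << 2 can then
// be folded into the addressing mode as a scale of 4 with (X & 0xff) as the
// index register.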
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  SDValue X = Shift.getOperand(0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
// Implement some heroics to detect shifts of masked values where the mask can
// be replaced by extending the shift and undoing that in the addressing mode
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
// the addressing mode. This results in code such as:
//
//   int f(short *y, int *lookup_table) {
//     ...
//     return *y + lookup_table[*y >> 11];
//   }
//
// Turning into:
//   movzwl (%rdi), %eax
//   ...
//   addl (%rsi,%rcx,4), %eax
//
// Instead of:
//   movzwl (%rdi), %eax
//   ...
//   addl (%rsi,%rcx), %eax
//
// Note that this function assumes the mask is provided as a mask *after* the
// value is shifted. The input chain may or may not match that, but computing
// such a mask is trivial.
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
                                    uint64_t Mask,
                                    SDValue Shift, SDValue X,
                                    X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
    return true;
  unsigned MaskLZ = 64 - (MaskIdx + MaskLen);

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  // Scale the leading zero count down based on the actual size of the value.
  // Also scale it down based on the size of the shift.
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
  if (MaskLZ < ScaleDown)
    return true;
  MaskLZ -= ScaleDown;

  // The final check is to ensure that any masked out high bits of X are
  // already known to be zero. Otherwise, the mask has a semantic impact
  // other than masking out a couple of low bits. Unfortunately, because of
  // the mask, zero extensions will be removed from operands in some cases.
  // This code works extra hard to look through extensions because we can
  // replace them with zero extensions cheaply if necessary.
  bool ReplacingAnyExtend = false;
  if (X.getOpcode() == ISD::ANY_EXTEND) {
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
    // Assume that we'll replace the any-extend with a zero-extend, and
    // narrow the search to the extended value.
    X = X.getOperand(0);
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
    ReplacingAnyExtend = true;
  }
  APInt MaskedHighBits =
      APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
  if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
    return true;

  // We've identified a pattern that can be transformed into a single shift
  // and an addressing mode. Make it so.
  MVT VT = N.getSimpleValueType();
  if (ReplacingAnyExtend) {
    assert(X.getValueType() != VT);
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  MVT XVT = X.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
  SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewExt);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
// Transform "(X >> SHIFT) & (MASK << C1)" to
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
// matched to a BEXTR later. Returns false if the simplification is performed.
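// For example, with SHIFT == 4, MASK == 0xff and C1 == 2, the pattern
// (X >> 4) & 0x3fc becomes ((X >> 6) & 0xff) << 2: the srl+and pair is a
// BEXTR of 8 bits starting at bit 6, and the << 2 becomes a scale of 4.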
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
                                   uint64_t Mask,
                                   SDValue Shift, SDValue X,
                                   X86ISelAddressMode &AM,
                                   const X86Subtarget &Subtarget) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse() || !N.hasOneUse())
    return true;

  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
  if (!Subtarget.hasTBM() &&
      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
    return true;

  // We need to ensure that mask is a continuous run of bits.
  unsigned MaskIdx, MaskLen;
  if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
    return true;

  unsigned ShiftAmt = Shift.getConstantOperandVal(1);

  // The amount of shift we're trying to fit into the addressing mode is taken
  // from the shifted mask index (number of trailing zeros of the mask).
  unsigned AMShiftAmt = MaskIdx;

  // There is nothing we can do here unless the mask is removing some bits.
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
  if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;

  MVT XVT = X.getSimpleValueType();
  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
  SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewSRLAmt);
  insertDAGNode(DAG, N, NewSRL);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewExt);
  insertDAGNode(DAG, N, NewSHLAmt);
  insertDAGNode(DAG, N, NewSHL);
  DAG.ReplaceAllUsesWith(N, NewSHL);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << AMShiftAmt;
  AM.IndexReg = NewExt;
  return false;
}
// Attempt to peek further into a scaled index register, collecting additional
// extensions / offsets / etc. Returns \p N if we can't peek any further.
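// For example, starting with Scale == 1, an index of (add (add %x, %x), 5)
// folds the constant into AM.Disp and doubles the scale, leaving %x as the
// index with Scale == 2 and Disp == 5.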
SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
                                               X86ISelAddressMode &AM,
                                               unsigned Depth) {
  assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
  assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
         "Illegal index scale");

  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return N;

  EVT VT = N.getValueType();
  unsigned Opc = N.getOpcode();

  // index: add(x,c) -> index: x, disp + c
  if (CurDAG->isBaseWithConstantOffset(N)) {
    auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
    uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
    if (!foldOffsetIntoAddress(Offset, AM))
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
  }

  // index: add(x,x) -> index: x, scale * 2
  if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
    if (AM.Scale <= 4) {
      AM.Scale *= 2;
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
    }
  }

  // index: shl(x,i) -> index: x, scale * (1 << i)
  if (Opc == X86ISD::VSHLI) {
    uint64_t ShiftAmt = N.getConstantOperandVal(1);
    uint64_t ScaleAmt = 1ULL << ShiftAmt;
    if ((AM.Scale * ScaleAmt) <= 8) {
      AM.Scale *= ScaleAmt;
      return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
    }
  }

  // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(0);
    if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Src)) {
        SDValue AddSrc = Src.getOperand(0);
        auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
        uint64_t Offset = (uint64_t)AddVal->getSExtValue();
        if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
          SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
          insertDAGNode(*CurDAG, N, ExtSrc);
          insertDAGNode(*CurDAG, N, ExtVal);
          insertDAGNode(*CurDAG, N, ExtAdd);
          CurDAG->ReplaceAllUsesWith(N, ExtAdd);
          CurDAG->RemoveDeadNode(N.getNode());
          return ExtSrc;
        }
      }
    }
  }

  // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
  // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
  // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
  if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
    SDValue Src = N.getOperand(0);
    unsigned SrcOpc = Src.getOpcode();
    if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
         CurDAG->isADDLike(Src)) &&
        Src.hasOneUse()) {
      if (CurDAG->isBaseWithConstantOffset(Src)) {
        SDValue AddSrc = Src.getOperand(0);
        uint64_t Offset = Src.getConstantOperandVal(1);
        if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
          SDLoc DL(N);
          SDValue Res;
          // If we're also scaling, see if we can use that as well.
          if (AddSrc.getOpcode() == ISD::SHL &&
              isa<ConstantSDNode>(AddSrc.getOperand(1))) {
            SDValue ShVal = AddSrc.getOperand(0);
            uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
            APInt HiBits =
                APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
            uint64_t ScaleAmt = 1ULL << ShAmt;
            if ((AM.Scale * ScaleAmt) <= 8 &&
                (AddSrc->getFlags().hasNoUnsignedWrap() ||
                 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
              AM.Scale *= ScaleAmt;
              SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
              SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
                                                 AddSrc.getOperand(1));
              insertDAGNode(*CurDAG, N, ExtShVal);
              insertDAGNode(*CurDAG, N, ExtShift);
              AddSrc = ExtShift;
              Res = ExtShVal;
            }
          }
          SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
          SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
          SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
          insertDAGNode(*CurDAG, N, ExtSrc);
          insertDAGNode(*CurDAG, N, ExtVal);
          insertDAGNode(*CurDAG, N, ExtAdd);
          CurDAG->ReplaceAllUsesWith(N, ExtAdd);
          CurDAG->RemoveDeadNode(N.getNode());
          return Res ? Res : ExtSrc;
        }
      }
    }
  }

  // TODO: Handle extensions, shifted masks etc.
  return N;
}
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // If this is already a %rip relative address, we can only merge immediates
  // into it. Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements. It isn't very important, but this should be fixed for
    // consistency.
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
      return true;

    if (auto *Cst = dyn_cast<ConstantSDNode>(N))
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
        return false;
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::LOCAL_RECOVER: {
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
        // Use the symbol and don't prefix it.
        AM.MCSym = ESNode->getMCSymbol();
        return false;
      }
    break;
  }
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!matchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
      return false;
    break;

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        SDValue ShVal = N.getOperand(0);
        AM.Scale = 1 << Val;
        AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
        return false;
      }
    }
    break;

  case ISD::SRL: {
    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    SDValue And = N.getOperand(0);
    if (And.getOpcode() != ISD::AND) break;
    SDValue X = And.getOperand(0);

    // The mask used for the transform is expected to be post-shift, but we
    // found the shift first so just apply the shift to the mask before passing
    // it in.
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
        !isa<ConstantSDNode>(And.getOperand(1)))
      break;
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);

    // Try to fold the mask and shift into the scale, and return false if we
    // succeed.
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
      return false;
    break;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    [[fallthrough]];
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr) {
      if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now. However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getOperand(1))) {
            Reg = MulVal.getOperand(0);
            auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
            if (foldOffsetIntoAddress(Disp, AM))
              Reg = N.getOperand(0);
          } else {
            Reg = N.getOperand(0);
          }

          AM.IndexReg = AM.Base_Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
      N = Handle.getValue();
      AM = Backup;
      break;
    }
    N = Handle.getValue();
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }

    int Cost = 0;
    SDValue RHS = N.getOperand(1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
         !AM.Base_Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    // Negation will be emitted later to avoid creating dangling nodes if this
    // was an unprofitable LEA.
    AM.IndexReg = RHS;
    AM.NegateIndex = true;
    AM.Scale = 1;
    return false;
  }

  case ISD::OR:
  case ISD::XOR:
    // See if we can treat the OR/XOR node as an ADD node.
    if (!CurDAG->isADDLike(N))
      break;
    [[fallthrough]];
  case ISD::ADD:
    if (!matchAdd(N, AM, Depth))
      return false;
    break;

  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;

    // We only handle up to 64-bit values here as those are what matter for
    // addressing mode optimizations.
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
           "Unexpected value size!");

    if (!isa<ConstantSDNode>(N.getOperand(1)))
      break;

    if (N.getOperand(0).getOpcode() == ISD::SRL) {
      SDValue Shift = N.getOperand(0);
      SDValue X = Shift.getOperand(0);

      uint64_t Mask = N.getConstantOperandVal(1);

      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
        return false;
    }

    // Try to swap the mask and shift to place shifts which can be done as
    // a scale on the outside of the mask.
    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
      return false;
    break;
  }

  case ISD::ZERO_EXTEND: {
    // Try to widen a zexted shift left to the same size as its use, so we can
    // match the shift as a scale factor.
    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
      break;

    SDValue Src = N.getOperand(0);

    // See if we can match a zext(addlike(x,c)).
    // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
    if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
      if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
        if (Index != N) {
          AM.IndexReg = Index;
          return false;
        }

    // Peek through mask: zext(and(shl(x,c1),c2))
    APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
    if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
      if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
        Mask = MaskC->getAPIntValue();
        Src = Src.getOperand(0);
      }

    if (Src.getOpcode() == ISD::SHL && Src.hasOneUse()) {
      // Give up if the shift is not a valid scale factor [1,2,3].
      SDValue ShlSrc = Src.getOperand(0);
      SDValue ShlAmt = Src.getOperand(1);
      auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
      if (!ShAmtC)
        break;
      unsigned ShAmtV = ShAmtC->getZExtValue();
      if (ShAmtV > 3)
        break;

      // The narrow shift must only shift out zero bits (it must be 'nuw').
      // That makes it safe to widen to the destination type.
      APInt HighZeros =
          APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
      if (!Src->getFlags().hasNoUnsignedWrap() &&
          !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
        break;

      // zext (shl nuw i8 %x, C1) to i32
      // --> shl (zext i8 %x to i32), (zext C1)
      // zext (and (shl nuw i8 %x, C1), C2) to i32
      // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
      MVT SrcVT = ShlSrc.getSimpleValueType();
      MVT VT = N.getSimpleValueType();
      SDLoc DL(N);

      SDValue Res = ShlSrc;
      if (!Mask.isAllOnes()) {
        Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
        insertDAGNode(*CurDAG, N, Res);
        Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
        insertDAGNode(*CurDAG, N, Res);
      }
      SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
      insertDAGNode(*CurDAG, N, Zext);
      SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
      insertDAGNode(*CurDAG, N, NewShl);

      // Convert the shift to scale factor.
      AM.Scale = 1 << ShAmtV;
      AM.IndexReg = Zext;

      CurDAG->ReplaceAllUsesWith(N, NewShl);
      CurDAG->RemoveDeadNode(N.getNode());
      return false;
    }

    if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
      // Try to fold the mask and shift into an extract and scale.
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
                                     Src.getOperand(0), AM))
        return false;

      // Try to fold the mask and shift directly into the scale.
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
                                   Src.getOperand(0), AM))
        return false;

      // Try to fold the mask and shift into BEXTR and scale.
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
                                  Src.getOperand(0), AM, *Subtarget))
        return false;
    }
    break;
  }
  }

  return matchAddressBase(N, AM);
}
/// Helper for MatchAddress. Add the specified node to the
/// specified addressing mode without any further recursion.
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
  // Is the base register already occupied?
  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
    // If so, check to see if the scale index register is set.
    if (!AM.IndexReg.getNode()) {
      AM.IndexReg = N;
      AM.Scale = 1;
      return false;
    }

    // Otherwise, we cannot select it.
    return true;
  }

  // Default, generate it as a register.
  AM.BaseType = X86ISelAddressMode::RegBase;
  AM.Base_Reg = N;
  return false;
}
bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
                                                    X86ISelAddressMode &AM,
                                                    unsigned Depth) {
  LLVM_DEBUG({
    dbgs() << "MatchVectorAddress: ";
    AM.dump(CurDAG);
  });
  // Limit recursion.
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return matchAddressBase(N, AM);

  // TODO: Support other operations.
  switch (N.getOpcode()) {
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!foldOffsetIntoAddress(Val, AM))
      return false;
    break;
  }
  case X86ISD::Wrapper:
    if (!matchWrapper(N, AM))
      return false;
    break;
  case ISD::ADD: {
    // Add an artificial use to this node so that we can keep track of
    // it if it gets CSE'd with a different node.
    HandleSDNode Handle(N);

    X86ISelAddressMode Backup = AM;
    if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
        !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
                                       Depth + 1))
      return false;
    AM = Backup;

    // Try again after commuting the operands.
    if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
                                       Depth + 1) &&
        !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
                                       Depth + 1))
      return false;
    AM = Backup;

    N = Handle.getValue();
    break;
  }
  }

  return matchAddressBase(N, AM);
}
/// Helper for selectVectorAddr. Handles things that can be folded into a
/// gather/scatter address. The index register and scale should have already
/// been handled.
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
  return matchVectorAddressRecursively(N, AM, 0);
}
bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                       SDValue IndexOp, SDValue ScaleOp,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  X86ISelAddressMode AM;
  AM.Scale = ScaleOp->getAsZExtVal();

  // Attempt to match index patterns, as long as we're not relying on implicit
  // sign-extension, which is performed BEFORE scale.
  if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
    AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
  else
    AM.IndexReg = IndexOp;

  unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
  if (AddrSpace == X86AS::GS)
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
  if (AddrSpace == X86AS::FS)
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
  if (AddrSpace == X86AS::SS)
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);

  SDLoc DL(BasePtr);
  MVT VT = BasePtr.getSimpleValueType();

  // Try to match into the base and displacement fields.
  if (matchVectorAddress(BasePtr, AM))
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
2883 /// It returns the operands which make up the maximal addressing mode it can
2884 /// match by reference.
2886 /// Parent is the parent node of the addr operand that is being matched. It
2887 /// is always a load, store, atomic node, or null. It is only null when
2888 /// checking memory operands for inline asm nodes.
2889 bool X86DAGToDAGISel::selectAddr(SDNode
*Parent
, SDValue N
, SDValue
&Base
,
2890 SDValue
&Scale
, SDValue
&Index
,
2891 SDValue
&Disp
, SDValue
&Segment
) {
2892 X86ISelAddressMode AM
;
2895 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2896 // that are not a MemSDNode, and thus don't have proper addrspace info.
2897 Parent
->getOpcode() != ISD::INTRINSIC_W_CHAIN
&& // unaligned loads, fixme
2898 Parent
->getOpcode() != ISD::INTRINSIC_VOID
&& // nontemporal stores
2899 Parent
->getOpcode() != X86ISD::TLSCALL
&& // Fixme
2900 Parent
->getOpcode() != X86ISD::ENQCMD
&& // Fixme
2901 Parent
->getOpcode() != X86ISD::ENQCMDS
&& // Fixme
2902 Parent
->getOpcode() != X86ISD::EH_SJLJ_SETJMP
&& // setjmp
2903 Parent
->getOpcode() != X86ISD::EH_SJLJ_LONGJMP
) { // longjmp
2904 unsigned AddrSpace
=
2905 cast
<MemSDNode
>(Parent
)->getPointerInfo().getAddrSpace();
2906 if (AddrSpace
== X86AS::GS
)
2907 AM
.Segment
= CurDAG
->getRegister(X86::GS
, MVT::i16
);
2908 if (AddrSpace
== X86AS::FS
)
2909 AM
.Segment
= CurDAG
->getRegister(X86::FS
, MVT::i16
);
2910 if (AddrSpace
== X86AS::SS
)
2911 AM
.Segment
= CurDAG
->getRegister(X86::SS
, MVT::i16
);
2914 // Save the DL and VT before calling matchAddress, it can invalidate N.
2916 MVT VT
= N
.getSimpleValueType();
2918 if (matchAddress(N
, AM
))
2921 getAddressOperands(AM
, DL
, VT
, Base
, Scale
, Index
, Disp
, Segment
);
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
  // Cannot use 32 bit constants to reference objects in kernel code model.
  // Cannot use 32 bit constants to reference objects in large PIC mode since
  // GOTOFF is 64 bits.
  if (TM.getCodeModel() == CodeModel::Kernel ||
      (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent()))
    return false;

  // In static codegen with small code model, we can get the address of a label
  // into a register with 'movl'.
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  N = N.getOperand(0);

  // At least GNU as does not accept 'movl' for TPOFF relocations.
  // FIXME: We could use 'movl' when we know we are targeting MC.
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
    return false;

  Imm = N;

  // Small/medium code model can reference non-TargetGlobalAddress objects with
  // 32 bit constants.
  if (N->getOpcode() != ISD::TargetGlobalAddress) {
    return TM.getCodeModel() == CodeModel::Small ||
           TM.getCodeModel() == CodeModel::Medium;
  }

  const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
  if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
    return CR->getUnsignedMax().ult(1ull << 32);

  return !TM.isLargeGlobalValue(GV);
}
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
                                         SDValue &Scale, SDValue &Index,
                                         SDValue &Disp, SDValue &Segment) {
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
  SDLoc DL(N);

  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
    return false;

  auto *RN = dyn_cast<RegisterSDNode>(Base);
  if (RN && RN->getReg() == 0)
    Base = CurDAG->getRegister(0, MVT::i64);
  else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
    // Base could already be %rip, particularly in the x32 ABI.
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
                                         Base);
  }

  RN = dyn_cast<RegisterSDNode>(Index);
  if (RN && RN->getReg() == 0)
    Index = CurDAG->getRegister(0, MVT::i64);
  else {
    assert(Index.getValueType() == MVT::i32 &&
           "Expect to be extending 32-bit registers for use in LEA");
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
                                                     MVT::i64), 0);
    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
                                          Index);
  }

  return true;
}
/// Calls SelectAddr and determines if the maximal addressing
/// mode it matches can be cost effectively emitted as an LEA instruction.
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
                                    SDValue &Base, SDValue &Scale,
                                    SDValue &Index, SDValue &Disp,
                                    SDValue &Segment) {
  X86ISelAddressMode AM;

  // Save the DL and VT before calling matchAddress, it can invalidate N.
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();

  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
  // segments.
  SDValue Copy = AM.Segment;
  SDValue T = CurDAG->getRegister(0, MVT::i32);
  AM.Segment = T;
  if (matchAddress(N, AM))
    return false;
  assert(T == AM.Segment);
  AM.Segment = Copy;

  unsigned Complexity = 0;
  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
    Complexity = 1;
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
    Complexity = 4;

  if (AM.IndexReg.getNode())
    Complexity++;

  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
  // a simple shift.
  if (AM.Scale > 1)
    Complexity++;

  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
  // to a LEA. This is determined with some experimentation but is by no means
  // optimal (especially for code size consideration). LEA is nice because of
  // its three-address nature. Tweak the cost function again when we can run
  // convertToThreeAddress() at register allocation time.
  if (AM.hasSymbolicDisplacement()) {
    // For X86-64, always use LEA to materialize RIP-relative addresses.
    if (Subtarget->is64Bit())
      Complexity = 4;
    else
      Complexity += 2;
  }

  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
  // duplicating flag-producing instructions later in the pipeline.
  if (N.getOpcode() == ISD::ADD) {
    auto isMathWithFlags = [](SDValue V) {
      switch (V.getOpcode()) {
      case X86ISD::ADD:
      case X86ISD::SUB:
      case X86ISD::ADC:
      case X86ISD::SBB:
      case X86ISD::SMUL:
      case X86ISD::UMUL:
      /* TODO: These opcodes can be added safely, but we may want to justify
               their inclusion for different reasons (better for reg-alloc).
      case X86ISD::OR:
      case X86ISD::XOR:
      case X86ISD::AND:
      */
        // Value 1 is the flag output of the node - verify it's not dead.
        return !SDValue(V.getNode(), 1).use_empty();
      default:
        return false;
      }
    };
    // TODO: We might want to factor in whether there's a load folding
    // opportunity for the math op that disappears with LEA.
    if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
      Complexity++;
  }

  if (AM.Disp)
    Complexity++;

  // If it isn't worth using an LEA, reject it.
  if (Complexity <= 2)
    return false;

  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
  return true;
}
/// This is only run on TargetGlobalTLSAddress nodes.
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
                                        SDValue &Scale, SDValue &Index,
                                        SDValue &Disp, SDValue &Segment) {
  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
  auto *GA = cast<GlobalAddressSDNode>(N);

  X86ISelAddressMode AM;
  AM.GV = GA->getGlobal();
  AM.Disp += GA->getOffset();
  AM.SymbolFlags = GA->getTargetFlags();

  if (Subtarget->is32Bit()) {
    AM.Scale = 1;
    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
  }

  MVT VT = N.getSimpleValueType();
  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
  return true;
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
  // Keep track of the original value type and whether this value was
  // truncated. If we see a truncation from pointer type to VT that truncates
  // bits that are known to be zero, we can use a narrow reference.
  EVT VT = N.getValueType();
  bool WasTruncated = false;
  if (N.getOpcode() == ISD::TRUNCATE) {
    WasTruncated = true;
    N = N.getOperand(0);
  }

  if (N.getOpcode() != X86ISD::Wrapper)
    return false;

  // We can only use non-GlobalValues as immediates if they were not truncated,
  // as we do not have any range information. If we have a GlobalValue and the
  // address was not truncated, we can select it as an operand directly.
  unsigned Opc = N.getOperand(0)->getOpcode();
  if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
    Op = N.getOperand(0);
    // We can only select the operand directly if we didn't have to look past a
    // truncate.
    return !WasTruncated;
  }

  // Check that the global's range fits into VT.
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
  std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
  if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
    return false;

  // Okay, we can use a narrow reference.
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
                                      GA->getOffset(), GA->getTargetFlags());
  return true;
}
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                                  SDValue &Base, SDValue &Scale,
                                  SDValue &Index, SDValue &Disp,
                                  SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
                                       SDValue &Base, SDValue &Scale,
                                       SDValue &Index, SDValue &Disp,
                                       SDValue &Segment) {
  assert(Root && P && "Unknown root/parent nodes");
  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
      !IsProfitableToFold(N, P, Root) ||
      !IsLegalToFold(N, P, Root, OptLevel))
    return false;

  return selectAddr(N.getNode(),
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
  auto &DL = MF->getDataLayout();
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
  if (N->getOpcode() == ISD::TRUNCATE)
    N = N->getOperand(0).getNode();
  if (N->getOpcode() != X86ISD::Wrapper)
    return false;

  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
  if (!GA)
    return false;

  auto *GV = GA->getGlobal();
  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
  if (CR)
    return CR->getSignedMin().sge(-1ull << Width) &&
           CR->getSignedMax().slt(1ull << Width);
  // In the kernel code model, globals are in the negative 2GB of the address
  // space, so globals can be a sign extended 32-bit immediate.
  // In other code models, small globals are in the low 2GB of the address
  // space, so sign extending them is equivalent to zero extending them.
  return Width == 32 && !TM.isLargeGlobalValue(GV);
}
X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
  assert(N->isMachineOpcode() && "Unexpected node");
  unsigned Opc = N->getMachineOpcode();
  const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
  int CondNo = X86::getCondSrcNoFromDesc(MCID);
  if (CondNo < 0)
    return X86::COND_INVALID;

  return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
}
/// Test whether the given X86ISD::CMP node has any users that use a flag
/// other than ZF.
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (UI->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(*FlagUI);

      switch (CC) {
      // Comparisons which only use the zero flag.
      case X86::COND_E: case X86::COND_NE:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// flag to be accurate.
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (UI->getOpcode() != ISD::CopyToReg ||
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the condition code of the user.
      X86::CondCode CC = getCondFromNode(*FlagUI);

      switch (CC) {
      // Comparisons which don't examine the SF flag.
      case X86::COND_A: case X86::COND_AE:
      case X86::COND_B: case X86::COND_BE:
      case X86::COND_E: case X86::COND_NE:
      case X86::COND_O: case X86::COND_NO:
      case X86::COND_P: case X86::COND_NP:
        continue;
      // Anything else: assume conservatively.
      default:
        return false;
      }
    }
  }
  return true;
}
static bool mayUseCarryFlag(X86::CondCode CC) {
  switch (CC) {
  // Comparisons which don't examine the CF flag.
  case X86::COND_O: case X86::COND_NO:
  case X86::COND_E: case X86::COND_NE:
  case X86::COND_S: case X86::COND_NS:
  case X86::COND_P: case X86::COND_NP:
  case X86::COND_L: case X86::COND_GE:
  case X86::COND_G: case X86::COND_LE:
    return false;
  // Anything else: assume conservatively.
  default:
    return true;
  }
}
/// Test whether the given node which sets flags has any uses which require the
/// CF flag to be accurate.
bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
       UI != UE; ++UI) {
    // Only check things that use the flags.
    if (UI.getUse().getResNo() != Flags.getResNo())
      continue;

    unsigned UIOpc = UI->getOpcode();

    if (UIOpc == ISD::CopyToReg) {
      // Only examine CopyToReg uses that copy to EFLAGS.
      if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
        return false;
      // Examine each user of the CopyToReg use.
      for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
           FlagUI != FlagUE; ++FlagUI) {
        // Only examine the Flag result.
        if (FlagUI.getUse().getResNo() != 1)
          continue;
        // Anything unusual: assume conservatively.
        if (!FlagUI->isMachineOpcode())
          return false;
        // Examine the condition code of the user.
        X86::CondCode CC = getCondFromNode(*FlagUI);

        if (mayUseCarryFlag(CC))
          return false;
      }

      // This CopyToReg is ok. Move on to the next user.
      continue;
    }

    // This might be an unselected node. So look for the pre-isel opcodes that
    // use flags.
    unsigned CCOpNo;
    switch (UIOpc) {
    default:
      // Something unusual. Be conservative.
      return false;
    case X86ISD::SETCC:       CCOpNo = 0; break;
    case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
    case X86ISD::CMOV:        CCOpNo = 2; break;
    case X86ISD::BRCOND:      CCOpNo = 2; break;
    }

    X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
    if (mayUseCarryFlag(CC))
      return false;
  }
  return true;
}
/// Check whether or not the chain ending in StoreNode is suitable for doing
/// the {load; op; store} to modify transformation.
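/// For example, (store (add (load p), 5), p) can then be selected as a single
/// "addl $5, (p)", provided the load has no other users and no other chain
/// dependency would create a cycle.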
3364 static bool isFusableLoadOpStorePattern(StoreSDNode
*StoreNode
,
3365 SDValue StoredVal
, SelectionDAG
*CurDAG
,
3367 LoadSDNode
*&LoadNode
,
3368 SDValue
&InputChain
) {
3369 // Is the stored value result 0 of the operation?
3370 if (StoredVal
.getResNo() != 0) return false;
3372 // Are there other uses of the operation other than the store?
3373 if (!StoredVal
.getNode()->hasNUsesOfValue(1, 0)) return false;
3375 // Is the store non-extending and non-indexed?
3376 if (!ISD::isNormalStore(StoreNode
) || StoreNode
->isNonTemporal())
3379 SDValue Load
= StoredVal
->getOperand(LoadOpNo
);
3380 // Is the stored value a non-extending and non-indexed load?
3381 if (!ISD::isNormalLoad(Load
.getNode())) return false;
3383 // Return LoadNode by reference.
3384 LoadNode
= cast
<LoadSDNode
>(Load
);
3386 // Is store the only read of the loaded value?
3387 if (!Load
.hasOneUse())
3390 // Is the address of the store the same as the load?
3391 if (LoadNode
->getBasePtr() != StoreNode
->getBasePtr() ||
3392 LoadNode
->getOffset() != StoreNode
->getOffset())
3395 bool FoundLoad
= false;
3396 SmallVector
<SDValue
, 4> ChainOps
;
3397 SmallVector
<const SDNode
*, 4> LoopWorklist
;
3398 SmallPtrSet
<const SDNode
*, 16> Visited
;
3399 const unsigned int Max
= 1024;
3401 // Visualization of Load-Op-Store fusion:
3402 // -------------------------
3404 // *-lines = Chain operand dependencies.
3405 // |-lines = Normal operand dependencies.
3406 // Dependencies flow down and right. n-suffix references multiple nodes.
3414 // * * \ | => A--LD_OP_ST
3422 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3426 // Ensure the transform is safe by checking for the dual
3427 // dependencies to make sure we do not induce a loop.
3429 // As LD is a predecessor to both OP and ST we can do this by checking:
3430 // a). if LD is a predecessor to a member of Xn or Yn.
3431 // b). if a Zn is a predecessor to ST.
3433 // However, (b) can only occur through being a chain predecessor to
3434 // ST, which is the same as Zn being a member or predecessor of Xn,
3435 // which is a subset of LD being a predecessor of Xn. So it's
3436 // subsumed by check (a).
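  //
  // (Illustrative note: if some operand Yn of OP transitively depended on LD,
  // then after the merge LD_OP_ST would use Yn while Yn depends on LD_OP_ST
  // itself, i.e. a cycle. That is exactly what the hasPredecessorHelper()
  // walk below rejects.)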
  SDValue Chain = StoreNode->getChain();

  // Gather X elements in ChainOps.
  if (Chain == Load.getValue(1)) {
    FoundLoad = true;
    ChainOps.push_back(Load.getOperand(0));
  } else if (Chain.getOpcode() == ISD::TokenFactor) {
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
      SDValue Op = Chain.getOperand(i);
      if (Op == Load.getValue(1)) {
        FoundLoad = true;
        // Drop Load, but keep its chain. No cycle check necessary.
        ChainOps.push_back(Load.getOperand(0));
        continue;
      }
      LoopWorklist.push_back(Op.getNode());
      ChainOps.push_back(Op);
    }
  }

  if (!FoundLoad)
    return false;

  // Worklist is currently Xn. Add Yn to worklist.
  for (SDValue Op : StoredVal->ops())
    if (Op.getNode() != LoadNode)
      LoopWorklist.push_back(Op.getNode());

  // Check (a) if Load is a predecessor to Xn + Yn
  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
                                   true))
    return false;

  InputChain =
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
  return true;
}
// Change a chain of {load; op; store} of the same value into a simple op
// through memory of that value, if the uses of the modified value and its
// address are suitable.
//
// The tablegen memory operand pattern is currently not able to match the case
// where the EFLAGS on the original operation are used.
//
// To move this to tablegen, we'll need to improve tablegen to allow flags to
// be transferred from a node in the pattern to the result node, probably with
// a new keyword. For example, we have this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//                   [(store (add (loadi64 addr:$dst), -1), addr:$dst),
//                    (implicit EFLAGS)]>;
// but maybe need something like this
//   def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
//                   [(store (add (loadi64 addr:$dst), -1), addr:$dst),
//                    (transferrable EFLAGS)]>;
//
// Until then, we manually fold these and instruction select the operation
// here.
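//
// To illustrate (hand-written sketch, not taken from the original comment):
// the fold turns a pattern that would otherwise select as
//     movq (%rdi), %rax
//     decq %rax
//     movq %rax, (%rdi)
// into the single read-modify-write instruction
//     decq (%rdi)
// when the loaded value has no other users and the chain checks above hold.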
3496 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode
*Node
) {
3497 auto *StoreNode
= cast
<StoreSDNode
>(Node
);
3498 SDValue StoredVal
= StoreNode
->getOperand(1);
3499 unsigned Opc
= StoredVal
->getOpcode();
3501 // Before we try to select anything, make sure this is memory operand size
3502 // and opcode we can handle. Note that this must match the code below that
3503 // actually lowers the opcodes.
3504 EVT MemVT
= StoreNode
->getMemoryVT();
3505 if (MemVT
!= MVT::i64
&& MemVT
!= MVT::i32
&& MemVT
!= MVT::i16
&&
3509 bool IsCommutable
= false;
3510 bool IsNegate
= false;
3515 IsNegate
= isNullConstant(StoredVal
.getOperand(0));
3524 IsCommutable
= true;
3528 unsigned LoadOpNo
= IsNegate
? 1 : 0;
3529 LoadSDNode
*LoadNode
= nullptr;
3531 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
3532 LoadNode
, InputChain
)) {
3536 // This operation is commutable, try the other operand.
3538 if (!isFusableLoadOpStorePattern(StoreNode
, StoredVal
, CurDAG
, LoadOpNo
,
3539 LoadNode
, InputChain
))
3543 SDValue Base
, Scale
, Index
, Disp
, Segment
;
3544 if (!selectAddr(LoadNode
, LoadNode
->getBasePtr(), Base
, Scale
, Index
, Disp
,
3548 auto SelectOpcode
= [&](unsigned Opc64
, unsigned Opc32
, unsigned Opc16
,
3550 switch (MemVT
.getSimpleVT().SimpleTy
) {
3560 llvm_unreachable("Invalid size!");
3564 MachineSDNode
*Result
;
3569 unsigned NewOpc
= SelectOpcode(X86::NEG64m
, X86::NEG32m
, X86::NEG16m
,
3571 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
, Segment
, InputChain
};
3572 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
,
3578 // Try to match inc/dec.
3579 if (!Subtarget
->slowIncDec() || CurDAG
->shouldOptForSize()) {
3580 bool IsOne
= isOneConstant(StoredVal
.getOperand(1));
3581 bool IsNegOne
= isAllOnesConstant(StoredVal
.getOperand(1));
      // ADD/SUB with 1/-1 where the carry flag isn't used can use inc/dec.
3583 if ((IsOne
|| IsNegOne
) && hasNoCarryFlagUses(StoredVal
.getValue(1))) {
3585 ((Opc
== X86ISD::ADD
) == IsOne
)
3586 ? SelectOpcode(X86::INC64m
, X86::INC32m
, X86::INC16m
, X86::INC8m
)
3587 : SelectOpcode(X86::DEC64m
, X86::DEC32m
, X86::DEC16m
, X86::DEC8m
);
3588 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
, Segment
, InputChain
};
3589 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
,
3600 auto SelectRegOpcode
= [SelectOpcode
](unsigned Opc
) {
3603 return SelectOpcode(X86::ADD64mr
, X86::ADD32mr
, X86::ADD16mr
,
3606 return SelectOpcode(X86::ADC64mr
, X86::ADC32mr
, X86::ADC16mr
,
3609 return SelectOpcode(X86::SUB64mr
, X86::SUB32mr
, X86::SUB16mr
,
3612 return SelectOpcode(X86::SBB64mr
, X86::SBB32mr
, X86::SBB16mr
,
3615 return SelectOpcode(X86::AND64mr
, X86::AND32mr
, X86::AND16mr
,
3618 return SelectOpcode(X86::OR64mr
, X86::OR32mr
, X86::OR16mr
, X86::OR8mr
);
3620 return SelectOpcode(X86::XOR64mr
, X86::XOR32mr
, X86::XOR16mr
,
3623 llvm_unreachable("Invalid opcode!");
3626 auto SelectImmOpcode
= [SelectOpcode
](unsigned Opc
) {
3629 return SelectOpcode(X86::ADD64mi32
, X86::ADD32mi
, X86::ADD16mi
,
3632 return SelectOpcode(X86::ADC64mi32
, X86::ADC32mi
, X86::ADC16mi
,
3635 return SelectOpcode(X86::SUB64mi32
, X86::SUB32mi
, X86::SUB16mi
,
3638 return SelectOpcode(X86::SBB64mi32
, X86::SBB32mi
, X86::SBB16mi
,
3641 return SelectOpcode(X86::AND64mi32
, X86::AND32mi
, X86::AND16mi
,
3644 return SelectOpcode(X86::OR64mi32
, X86::OR32mi
, X86::OR16mi
,
3647 return SelectOpcode(X86::XOR64mi32
, X86::XOR32mi
, X86::XOR16mi
,
3650 llvm_unreachable("Invalid opcode!");
3654 unsigned NewOpc
= SelectRegOpcode(Opc
);
3655 SDValue Operand
= StoredVal
->getOperand(1-LoadOpNo
);
3657 // See if the operand is a constant that we can fold into an immediate
3659 if (auto *OperandC
= dyn_cast
<ConstantSDNode
>(Operand
)) {
3660 int64_t OperandV
= OperandC
->getSExtValue();
3662 // Check if we can shrink the operand enough to fit in an immediate (or
3663 // fit into a smaller immediate) by negating it and switching the
3665 if ((Opc
== X86ISD::ADD
|| Opc
== X86ISD::SUB
) &&
3666 ((MemVT
!= MVT::i8
&& !isInt
<8>(OperandV
) && isInt
<8>(-OperandV
)) ||
3667 (MemVT
== MVT::i64
&& !isInt
<32>(OperandV
) &&
3668 isInt
<32>(-OperandV
))) &&
3669 hasNoCarryFlagUses(StoredVal
.getValue(1))) {
3670 OperandV
= -OperandV
;
3671 Opc
= Opc
== X86ISD::ADD
? X86ISD::SUB
: X86ISD::ADD
;
3674 if (MemVT
!= MVT::i64
|| isInt
<32>(OperandV
)) {
3675 Operand
= CurDAG
->getTargetConstant(OperandV
, SDLoc(Node
), MemVT
);
3676 NewOpc
= SelectImmOpcode(Opc
);
3680 if (Opc
== X86ISD::ADC
|| Opc
== X86ISD::SBB
) {
3682 CurDAG
->getCopyToReg(InputChain
, SDLoc(Node
), X86::EFLAGS
,
3683 StoredVal
.getOperand(2), SDValue());
3685 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
3686 Segment
, Operand
, CopyTo
, CopyTo
.getValue(1)};
3687 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
3690 const SDValue Ops
[] = {Base
, Scale
, Index
, Disp
,
3691 Segment
, Operand
, InputChain
};
3692 Result
= CurDAG
->getMachineNode(NewOpc
, SDLoc(Node
), MVT::i32
, MVT::Other
,
3698 llvm_unreachable("Invalid opcode!");
3701 MachineMemOperand
*MemOps
[] = {StoreNode
->getMemOperand(),
3702 LoadNode
->getMemOperand()};
3703 CurDAG
->setNodeMemRefs(Result
, MemOps
);
3705 // Update Load Chain uses as well.
3706 ReplaceUses(SDValue(LoadNode
, 1), SDValue(Result
, 1));
3707 ReplaceUses(SDValue(StoreNode
, 0), SDValue(Result
, 1));
3708 ReplaceUses(SDValue(StoredVal
.getNode(), 1), SDValue(Result
, 0));
3709 CurDAG
->RemoveDeadNode(Node
);
3713 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3714 // Where Mask is one of the following patterns:
3715 // a) x & (1 << nbits) - 1
3716 // b) x & ~(-1 << nbits)
3717 // c) x & (-1 >> (32 - y))
3718 // d) x << (32 - y) >> (32 - y)
3719 // e) (1 << nbits) - 1
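//
// Illustrative sketch (not an exhaustive mapping): with BMI2, pattern (a),
// "x & ((1 << n) - 1)", selects to a single BZHI that keeps the low n bits
// of x; with only BMI1 the same extraction is expressed as BEXTR with a
// control word of (n << 8) | 0, assembled further down in this function.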
3720 bool X86DAGToDAGISel::matchBitExtract(SDNode
*Node
) {
3722 (Node
->getOpcode() == ISD::ADD
|| Node
->getOpcode() == ISD::AND
||
3723 Node
->getOpcode() == ISD::SRL
) &&
3724 "Should be either an and-mask, or right-shift after clearing high bits.");
3726 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3727 if (!Subtarget
->hasBMI() && !Subtarget
->hasBMI2())
3730 MVT NVT
= Node
->getSimpleValueType(0);
3732 // Only supported for 32 and 64 bits.
3733 if (NVT
!= MVT::i32
&& NVT
!= MVT::i64
)
  // If we have BMI2's BZHI, we are OK with multi-use patterns.
  // Else, if we only have BMI1's BEXTR, we require one-use.
3741 const bool AllowExtraUsesByDefault
= Subtarget
->hasBMI2();
3742 auto checkUses
= [AllowExtraUsesByDefault
](
3743 SDValue Op
, unsigned NUses
,
3744 std::optional
<bool> AllowExtraUses
) {
3745 return AllowExtraUses
.value_or(AllowExtraUsesByDefault
) ||
3746 Op
.getNode()->hasNUsesOfValue(NUses
, Op
.getResNo());
3748 auto checkOneUse
= [checkUses
](SDValue Op
,
3749 std::optional
<bool> AllowExtraUses
=
3751 return checkUses(Op
, 1, AllowExtraUses
);
3753 auto checkTwoUse
= [checkUses
](SDValue Op
,
3754 std::optional
<bool> AllowExtraUses
=
3756 return checkUses(Op
, 2, AllowExtraUses
);
3759 auto peekThroughOneUseTruncation
= [checkOneUse
](SDValue V
) {
3760 if (V
->getOpcode() == ISD::TRUNCATE
&& checkOneUse(V
)) {
3761 assert(V
.getSimpleValueType() == MVT::i32
&&
3762 V
.getOperand(0).getSimpleValueType() == MVT::i64
&&
3763 "Expected i64 -> i32 truncation");
3764 V
= V
.getOperand(0);
3769 // a) x & ((1 << nbits) + (-1))
3770 auto matchPatternA
= [checkOneUse
, peekThroughOneUseTruncation
, &NBits
,
3771 &NegateNBits
](SDValue Mask
) -> bool {
3772 // Match `add`. Must only have one use!
3773 if (Mask
->getOpcode() != ISD::ADD
|| !checkOneUse(Mask
))
3775 // We should be adding all-ones constant (i.e. subtracting one.)
3776 if (!isAllOnesConstant(Mask
->getOperand(1)))
3778 // Match `1 << nbits`. Might be truncated. Must only have one use!
3779 SDValue M0
= peekThroughOneUseTruncation(Mask
->getOperand(0));
3780 if (M0
->getOpcode() != ISD::SHL
|| !checkOneUse(M0
))
3782 if (!isOneConstant(M0
->getOperand(0)))
3784 NBits
= M0
->getOperand(1);
3785 NegateNBits
= false;
3789 auto isAllOnes
= [this, peekThroughOneUseTruncation
, NVT
](SDValue V
) {
3790 V
= peekThroughOneUseTruncation(V
);
3791 return CurDAG
->MaskedValueIsAllOnes(
3792 V
, APInt::getLowBitsSet(V
.getSimpleValueType().getSizeInBits(),
3793 NVT
.getSizeInBits()));
3796 // b) x & ~(-1 << nbits)
3797 auto matchPatternB
= [checkOneUse
, isAllOnes
, peekThroughOneUseTruncation
,
3798 &NBits
, &NegateNBits
](SDValue Mask
) -> bool {
3799 // Match `~()`. Must only have one use!
3800 if (Mask
.getOpcode() != ISD::XOR
|| !checkOneUse(Mask
))
3802 // The -1 only has to be all-ones for the final Node's NVT.
3803 if (!isAllOnes(Mask
->getOperand(1)))
3805 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3806 SDValue M0
= peekThroughOneUseTruncation(Mask
->getOperand(0));
3807 if (M0
->getOpcode() != ISD::SHL
|| !checkOneUse(M0
))
3809 // The -1 only has to be all-ones for the final Node's NVT.
3810 if (!isAllOnes(M0
->getOperand(0)))
3812 NBits
= M0
->getOperand(1);
3813 NegateNBits
= false;
3817 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3818 // or leave the shift amount as-is, but then we'll have to negate it.
3819 auto canonicalizeShiftAmt
= [&NBits
, &NegateNBits
](SDValue ShiftAmt
,
3820 unsigned Bitwidth
) {
3823 // Skip over a truncate of the shift amount, if any.
3824 if (NBits
.getOpcode() == ISD::TRUNCATE
)
3825 NBits
= NBits
.getOperand(0);
3826 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3827 // If it doesn't match, that's fine, we'll just negate it ourselves.
3828 if (NBits
.getOpcode() != ISD::SUB
)
3830 auto *V0
= dyn_cast
<ConstantSDNode
>(NBits
.getOperand(0));
3831 if (!V0
|| V0
->getZExtValue() != Bitwidth
)
3833 NBits
= NBits
.getOperand(1);
3834 NegateNBits
= false;
3837 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3839 // c) x & (-1 >> (32 - y))
3840 auto matchPatternC
= [checkOneUse
, peekThroughOneUseTruncation
, &NegateNBits
,
3841 canonicalizeShiftAmt
](SDValue Mask
) -> bool {
3842 // The mask itself may be truncated.
3843 Mask
= peekThroughOneUseTruncation(Mask
);
3844 unsigned Bitwidth
= Mask
.getSimpleValueType().getSizeInBits();
3845 // Match `l>>`. Must only have one use!
3846 if (Mask
.getOpcode() != ISD::SRL
|| !checkOneUse(Mask
))
3848 // We should be shifting truly all-ones constant.
3849 if (!isAllOnesConstant(Mask
.getOperand(0)))
3851 SDValue M1
= Mask
.getOperand(1);
3852 // The shift amount should not be used externally.
3853 if (!checkOneUse(M1
))
3855 canonicalizeShiftAmt(M1
, Bitwidth
);
3856 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3857 // is no extra use of the mask. Clearly, there was one since we are here.
3858 // But at the same time, if we need to negate the shift amount,
3859 // then we don't want the mask to stick around, else it's unprofitable.
3860 return !NegateNBits
;
3865 // d) x << z >> z but then we'll have to subtract z from bitwidth
3867 // d) x << (32 - y) >> (32 - y)
3868 auto matchPatternD
= [checkOneUse
, checkTwoUse
, canonicalizeShiftAmt
,
3869 AllowExtraUsesByDefault
, &NegateNBits
,
3870 &X
](SDNode
*Node
) -> bool {
3871 if (Node
->getOpcode() != ISD::SRL
)
3873 SDValue N0
= Node
->getOperand(0);
3874 if (N0
->getOpcode() != ISD::SHL
)
3876 unsigned Bitwidth
= N0
.getSimpleValueType().getSizeInBits();
3877 SDValue N1
= Node
->getOperand(1);
3878 SDValue N01
= N0
->getOperand(1);
3879 // Both of the shifts must be by the exact same value.
3882 canonicalizeShiftAmt(N1
, Bitwidth
);
3883 // There should not be any external uses of the inner shift / shift amount.
3884 // Note that while we are generally okay with external uses given BMI2,
3885 // iff we need to negate the shift amount, we are not okay with extra uses.
3886 const bool AllowExtraUses
= AllowExtraUsesByDefault
&& !NegateNBits
;
3887 if (!checkOneUse(N0
, AllowExtraUses
) || !checkTwoUse(N1
, AllowExtraUses
))
3889 X
= N0
->getOperand(0);
3893 auto matchLowBitMask
= [matchPatternA
, matchPatternB
,
3894 matchPatternC
](SDValue Mask
) -> bool {
3895 return matchPatternA(Mask
) || matchPatternB(Mask
) || matchPatternC(Mask
);
3898 if (Node
->getOpcode() == ISD::AND
) {
3899 X
= Node
->getOperand(0);
3900 SDValue Mask
= Node
->getOperand(1);
3902 if (matchLowBitMask(Mask
)) {
3906 if (!matchLowBitMask(Mask
))
3909 } else if (matchLowBitMask(SDValue(Node
, 0))) {
3910 X
= CurDAG
->getAllOnesConstant(SDLoc(Node
), NVT
);
3911 } else if (!matchPatternD(Node
))
3914 // If we need to negate the shift amount, require BMI2 BZHI support.
3915 // It's just too unprofitable for BMI1 BEXTR.
3916 if (NegateNBits
&& !Subtarget
->hasBMI2())
3921 // Truncate the shift amount.
3922 NBits
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, MVT::i8
, NBits
);
3923 insertDAGNode(*CurDAG
, SDValue(Node
, 0), NBits
);
3925 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3926 // All the other bits are undefined, we do not care about them.
3927 SDValue ImplDef
= SDValue(
3928 CurDAG
->getMachineNode(TargetOpcode::IMPLICIT_DEF
, DL
, MVT::i32
), 0);
3929 insertDAGNode(*CurDAG
, SDValue(Node
, 0), ImplDef
);
3931 SDValue SRIdxVal
= CurDAG
->getTargetConstant(X86::sub_8bit
, DL
, MVT::i32
);
3932 insertDAGNode(*CurDAG
, SDValue(Node
, 0), SRIdxVal
);
3933 NBits
= SDValue(CurDAG
->getMachineNode(TargetOpcode::INSERT_SUBREG
, DL
,
3934 MVT::i32
, ImplDef
, NBits
, SRIdxVal
),
3936 insertDAGNode(*CurDAG
, SDValue(Node
, 0), NBits
);
3938 // We might have matched the amount of high bits to be cleared,
3939 // but we want the amount of low bits to be kept, so negate it then.
3941 SDValue BitWidthC
= CurDAG
->getConstant(NVT
.getSizeInBits(), DL
, MVT::i32
);
3942 insertDAGNode(*CurDAG
, SDValue(Node
, 0), BitWidthC
);
3944 NBits
= CurDAG
->getNode(ISD::SUB
, DL
, MVT::i32
, BitWidthC
, NBits
);
3945 insertDAGNode(*CurDAG
, SDValue(Node
, 0), NBits
);
3948 if (Subtarget
->hasBMI2()) {
3949 // Great, just emit the BZHI..
3950 if (NVT
!= MVT::i32
) {
3951 // But have to place the bit count into the wide-enough register first.
3952 NBits
= CurDAG
->getNode(ISD::ANY_EXTEND
, DL
, NVT
, NBits
);
3953 insertDAGNode(*CurDAG
, SDValue(Node
, 0), NBits
);
3956 SDValue Extract
= CurDAG
->getNode(X86ISD::BZHI
, DL
, NVT
, X
, NBits
);
3957 ReplaceNode(Node
, Extract
.getNode());
3958 SelectCode(Extract
.getNode());
  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with a one-use trunc in between),
  // and if the truncation was the only use of the shift,
  // and if so look past the one-use truncation.
3967 SDValue RealX
= peekThroughOneUseTruncation(X
);
3968 // FIXME: only if the shift is one-use?
3969 if (RealX
!= X
&& RealX
.getOpcode() == ISD::SRL
)
3973 MVT XVT
= X
.getSimpleValueType();
3975 // Else, emitting BEXTR requires one more step.
3976 // The 'control' of BEXTR has the pattern of:
3977 // [15...8 bit][ 7...0 bit] location
3978 // [ bit count][ shift] name
3979 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
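  //
  // Worked example (illustrative): to keep 5 bits starting at bit 3, i.e.
  // (x >> 3) & 0x1f, the control word is (5 << 8) | 3 == 0x0503.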
3981 // Shift NBits left by 8 bits, thus producing 'control'.
3982 // This makes the low 8 bits to be zero.
3983 SDValue C8
= CurDAG
->getConstant(8, DL
, MVT::i8
);
3984 insertDAGNode(*CurDAG
, SDValue(Node
, 0), C8
);
3985 SDValue Control
= CurDAG
->getNode(ISD::SHL
, DL
, MVT::i32
, NBits
, C8
);
3986 insertDAGNode(*CurDAG
, SDValue(Node
, 0), Control
);
3988 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3989 // FIXME: only if the shift is one-use?
3990 if (X
.getOpcode() == ISD::SRL
) {
3991 SDValue ShiftAmt
= X
.getOperand(1);
3992 X
= X
.getOperand(0);
3994 assert(ShiftAmt
.getValueType() == MVT::i8
&&
3995 "Expected shift amount to be i8");
3997 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3998 // We could zext to i16 in some form, but we intentionally don't do that.
3999 SDValue OrigShiftAmt
= ShiftAmt
;
4000 ShiftAmt
= CurDAG
->getNode(ISD::ZERO_EXTEND
, DL
, MVT::i32
, ShiftAmt
);
4001 insertDAGNode(*CurDAG
, OrigShiftAmt
, ShiftAmt
);
4003 // And now 'or' these low 8 bits of shift amount into the 'control'.
4004 Control
= CurDAG
->getNode(ISD::OR
, DL
, MVT::i32
, Control
, ShiftAmt
);
4005 insertDAGNode(*CurDAG
, SDValue(Node
, 0), Control
);
4008 // But have to place the 'control' into the wide-enough register first.
4009 if (XVT
!= MVT::i32
) {
4010 Control
= CurDAG
->getNode(ISD::ANY_EXTEND
, DL
, XVT
, Control
);
4011 insertDAGNode(*CurDAG
, SDValue(Node
, 0), Control
);
4014 // And finally, form the BEXTR itself.
4015 SDValue Extract
= CurDAG
->getNode(X86ISD::BEXTR
, DL
, XVT
, X
, Control
);
4017 // The 'X' was originally truncated. Do that now.
4019 insertDAGNode(*CurDAG
, SDValue(Node
, 0), Extract
);
4020 Extract
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, NVT
, Extract
);
4023 ReplaceNode(Node
, Extract
.getNode());
4024 SelectCode(Extract
.getNode());
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);

  // If we have TBM we can use an immediate for the control. If we have BMI
  // we should only do this if the BEXTR instruction is implemented well.
  // Otherwise moving the control into a register makes this more costly.
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
  // hoisting the move immediate would make it worthwhile with a less optimal
  // BEXTR?
  bool PreferBEXTR =
      Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
  if (!PreferBEXTR && !Subtarget->hasBMI2())
    return nullptr;

  // Must have a shift right.
  if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
    return nullptr;

  // Shift can't have additional users.
  if (!N0->hasOneUse())
    return nullptr;

  // Only supported for 32 and 64 bits.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return nullptr;

  // Shift amount and RHS of and must be constant.
  auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
  auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
  if (!MaskCst || !ShiftCst)
    return nullptr;

  // And RHS must be a mask.
  uint64_t Mask = MaskCst->getZExtValue();
  if (!isMask_64(Mask))
    return nullptr;

  uint64_t Shift = ShiftCst->getZExtValue();
  uint64_t MaskSize = llvm::popcount(Mask);

  // Don't interfere with something that can be handled by extracting AH.
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
  if (Shift == 8 && MaskSize == 8)
    return nullptr;

  // Make sure we are only using bits that were in the original value, not
  // shifted in.
  if (Shift + MaskSize > NVT.getSizeInBits())
    return nullptr;

  // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
  // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
  // does not fit into 32 bits. Load folding is not a sufficient reason.
  if (!PreferBEXTR && MaskSize <= 32)
    return nullptr;

  SDValue Control;
  unsigned ROpc, MOpc;

#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
  if (!PreferBEXTR) {
    assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
    // If we can't make use of BEXTR then we can't fuse shift+mask stages.
    // Let's perform the mask first, and apply shift later. Note that we need to
    // widen the mask to account for the fact that we'll apply shift afterwards!
    Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
    ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
    MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
                           : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
    Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
  } else {
    // The 'control' of BEXTR has the pattern of:
    // [15...8 bit][ 7...0 bit] location
    // [ bit count][     shift] name
    // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
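    //
    // E.g. (illustrative only): for (x >> 4) & 0xfff we have Shift == 4 and
    // MaskSize == 12, so the control built below is 4 | (12 << 8) == 0x0c04.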
    Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
    if (Subtarget->hasTBM()) {
      ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
      MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
    } else {
      assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
      // BMI requires the immediate to be placed in a register.
      ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
      MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
                             : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = {
        Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
    unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
    NewNode =
        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
}
// Emit a PCMPISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}
// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
// need to emit a second instruction after this one. This is needed since we
// have two CopyToReg nodes glued before this and we need to continue that
// glue through.
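//
// (Orientation sketch, not from the original comment: the two glued CopyToReg
// nodes are the copies of the explicit length operands into EAX and EDX that
// PCMPESTRI/PCMPESTRM read implicitly; InGlue threads that glue into and back
// out of the machine node built here.)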
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InGlue) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InGlue };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InGlue = SDValue(CNode, 3);
    // Update the chain.
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N2, Imm, InGlue };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  InGlue = SDValue(CNode, 2);
  return CNode;
}
4219 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode
*N
) {
4220 EVT VT
= N
->getValueType(0);
4222 // Only handle scalar shifts.
4226 // Narrower shifts only mask to 5 bits in hardware.
4227 unsigned Size
= VT
== MVT::i64
? 64 : 32;
4229 SDValue OrigShiftAmt
= N
->getOperand(1);
4230 SDValue ShiftAmt
= OrigShiftAmt
;
4233 // Skip over a truncate of the shift amount.
4234 if (ShiftAmt
->getOpcode() == ISD::TRUNCATE
)
4235 ShiftAmt
= ShiftAmt
->getOperand(0);
4237 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4238 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4240 SDValue NewShiftAmt
;
4241 if (ShiftAmt
->getOpcode() == ISD::ADD
|| ShiftAmt
->getOpcode() == ISD::SUB
||
4242 ShiftAmt
->getOpcode() == ISD::XOR
) {
4243 SDValue Add0
= ShiftAmt
->getOperand(0);
4244 SDValue Add1
= ShiftAmt
->getOperand(1);
4245 auto *Add0C
= dyn_cast
<ConstantSDNode
>(Add0
);
4246 auto *Add1C
= dyn_cast
<ConstantSDNode
>(Add1
);
4247 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4248 // to avoid the ADD/SUB/XOR.
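    //
    // (Illustrative example: for a 64-bit shift, "x << (y + 64)" already
    // behaves like "x << (y & 63)", i.e. "x << y", because the hardware masks
    // the shift amount, so the ADD can simply be dropped.)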
4249 if (Add1C
&& Add1C
->getAPIntValue().urem(Size
) == 0) {
4252 } else if (ShiftAmt
->getOpcode() != ISD::ADD
&& ShiftAmt
.hasOneUse() &&
4253 ((Add0C
&& Add0C
->getAPIntValue().urem(Size
) == Size
- 1) ||
4254 (Add1C
&& Add1C
->getAPIntValue().urem(Size
) == Size
- 1))) {
4255 // If we are doing a NOT on just the lower bits with (Size*N-1) -/^ X
4256 // we can replace it with a NOT. In the XOR case it may save some code
4257 // size, in the SUB case it also may save a move.
4258 assert(Add0C
== nullptr || Add1C
== nullptr);
4260 // We can only do N-X, not X-N
4261 if (ShiftAmt
->getOpcode() == ISD::SUB
&& Add0C
== nullptr)
4264 EVT OpVT
= ShiftAmt
.getValueType();
4266 SDValue AllOnes
= CurDAG
->getAllOnesConstant(DL
, OpVT
);
4267 NewShiftAmt
= CurDAG
->getNode(ISD::XOR
, DL
, OpVT
,
4268 Add0C
== nullptr ? Add0
: Add1
, AllOnes
);
4269 insertDAGNode(*CurDAG
, OrigShiftAmt
, AllOnes
);
4270 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
4271 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4272 // -X to generate a NEG instead of a SUB of a constant.
4273 } else if (ShiftAmt
->getOpcode() == ISD::SUB
&& Add0C
&&
4274 Add0C
->getZExtValue() != 0) {
4275 EVT SubVT
= ShiftAmt
.getValueType();
4277 if (Add0C
->getZExtValue() % Size
== 0)
4279 else if (ShiftAmt
.hasOneUse() && Size
== 64 &&
4280 Add0C
->getZExtValue() % 32 == 0) {
4281 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4282 // This is mainly beneficial if we already compute (x+n*32).
4283 if (Add1
.getOpcode() == ISD::TRUNCATE
) {
4284 Add1
= Add1
.getOperand(0);
4285 SubVT
= Add1
.getValueType();
4287 if (Add0
.getValueType() != SubVT
) {
4288 Add0
= CurDAG
->getZExtOrTrunc(Add0
, DL
, SubVT
);
4289 insertDAGNode(*CurDAG
, OrigShiftAmt
, Add0
);
4292 X
= CurDAG
->getNode(ISD::ADD
, DL
, SubVT
, Add1
, Add0
);
4293 insertDAGNode(*CurDAG
, OrigShiftAmt
, X
);
4296 // Insert a negate op.
4297 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4298 // that uses it that's not a shift.
4299 SDValue Zero
= CurDAG
->getConstant(0, DL
, SubVT
);
4300 SDValue Neg
= CurDAG
->getNode(ISD::SUB
, DL
, SubVT
, Zero
, X
);
4303 // Insert these operands into a valid topological order so they can
4304 // get selected independently.
4305 insertDAGNode(*CurDAG
, OrigShiftAmt
, Zero
);
4306 insertDAGNode(*CurDAG
, OrigShiftAmt
, Neg
);
4312 if (NewShiftAmt
.getValueType() != MVT::i8
) {
4313 // Need to truncate the shift amount.
4314 NewShiftAmt
= CurDAG
->getNode(ISD::TRUNCATE
, DL
, MVT::i8
, NewShiftAmt
);
4315 // Add to a correct topological ordering.
4316 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
4319 // Insert a new mask to keep the shift amount legal. This should be removed
4320 // by isel patterns.
4321 NewShiftAmt
= CurDAG
->getNode(ISD::AND
, DL
, MVT::i8
, NewShiftAmt
,
4322 CurDAG
->getConstant(Size
- 1, DL
, MVT::i8
));
4323 // Place in a correct topological ordering.
4324 insertDAGNode(*CurDAG
, OrigShiftAmt
, NewShiftAmt
);
4326 SDNode
*UpdatedNode
= CurDAG
->UpdateNodeOperands(N
, N
->getOperand(0),
4328 if (UpdatedNode
!= N
) {
4329 // If we found an existing node, we should replace ourselves with that node
4330 // and wait for it to be selected after its other users.
4331 ReplaceNode(N
, UpdatedNode
);
4335 // If the original shift amount is now dead, delete it so that we don't run
4337 if (OrigShiftAmt
.getNode()->use_empty())
4338 CurDAG
->RemoveDeadNode(OrigShiftAmt
.getNode());
4340 // Now that we've optimized the shift amount, defer to normal isel to get
4341 // load folding and legacy vs BMI2 selection without repeating it here.
4346 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode
*N
) {
4347 MVT NVT
= N
->getSimpleValueType(0);
4348 unsigned Opcode
= N
->getOpcode();
4351 // For operations of the form (x << C1) op C2, check if we can use a smaller
4352 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
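  //
  // (Illustrative example: "(x << 8) & 0x7f00" needs a 32-bit AND immediate,
  // while the equivalent "(x & 0x7f) << 8" fits a sign-extended 8-bit
  // immediate, so the AND and SHL are reordered below when that is legal and
  // profitable.)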
4353 SDValue Shift
= N
->getOperand(0);
4354 SDValue N1
= N
->getOperand(1);
4356 auto *Cst
= dyn_cast
<ConstantSDNode
>(N1
);
4360 int64_t Val
= Cst
->getSExtValue();
4362 // If we have an any_extend feeding the AND, look through it to see if there
4363 // is a shift behind it. But only if the AND doesn't use the extended bits.
4364 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4365 bool FoundAnyExtend
= false;
4366 if (Shift
.getOpcode() == ISD::ANY_EXTEND
&& Shift
.hasOneUse() &&
4367 Shift
.getOperand(0).getSimpleValueType() == MVT::i32
&&
4369 FoundAnyExtend
= true;
4370 Shift
= Shift
.getOperand(0);
4373 if (Shift
.getOpcode() != ISD::SHL
|| !Shift
.hasOneUse())
4376 // i8 is unshrinkable, i16 should be promoted to i32.
4377 if (NVT
!= MVT::i32
&& NVT
!= MVT::i64
)
4380 auto *ShlCst
= dyn_cast
<ConstantSDNode
>(Shift
.getOperand(1));
4384 uint64_t ShAmt
= ShlCst
->getZExtValue();
4386 // Make sure that we don't change the operation by removing bits.
4387 // This only matters for OR and XOR, AND is unaffected.
4388 uint64_t RemovedBitsMask
= (1ULL << ShAmt
) - 1;
4389 if (Opcode
!= ISD::AND
&& (Val
& RemovedBitsMask
) != 0)
4392 // Check the minimum bitwidth for the new constant.
4393 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4394 auto CanShrinkImmediate
= [&](int64_t &ShiftedVal
) {
4395 if (Opcode
== ISD::AND
) {
4396 // AND32ri is the same as AND64ri32 with zext imm.
4397 // Try this before sign extended immediates below.
4398 ShiftedVal
= (uint64_t)Val
>> ShAmt
;
4399 if (NVT
== MVT::i64
&& !isUInt
<32>(Val
) && isUInt
<32>(ShiftedVal
))
4401 // Also swap order when the AND can become MOVZX.
4402 if (ShiftedVal
== UINT8_MAX
|| ShiftedVal
== UINT16_MAX
)
4405 ShiftedVal
= Val
>> ShAmt
;
4406 if ((!isInt
<8>(Val
) && isInt
<8>(ShiftedVal
)) ||
4407 (!isInt
<32>(Val
) && isInt
<32>(ShiftedVal
)))
4409 if (Opcode
!= ISD::AND
) {
4410 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4411 ShiftedVal
= (uint64_t)Val
>> ShAmt
;
4412 if (NVT
== MVT::i64
&& !isUInt
<32>(Val
) && isUInt
<32>(ShiftedVal
))
4419 if (!CanShrinkImmediate(ShiftedVal
))
4422 // Ok, we can reorder to get a smaller immediate.
  // But, it's possible the original immediate allowed an AND to become MOVZX.
  // Doing this late to delay the MaskedValueIsZero call as late as
4427 if (Opcode
== ISD::AND
) {
4428 // Find the smallest zext this could possibly be.
4429 unsigned ZExtWidth
= Cst
->getAPIntValue().getActiveBits();
4430 ZExtWidth
= llvm::bit_ceil(std::max(ZExtWidth
, 8U));
4432 // Figure out which bits need to be zero to achieve that mask.
4433 APInt NeededMask
= APInt::getLowBitsSet(NVT
.getSizeInBits(),
4435 NeededMask
&= ~Cst
->getAPIntValue();
4437 if (CurDAG
->MaskedValueIsZero(N
->getOperand(0), NeededMask
))
4441 SDValue X
= Shift
.getOperand(0);
4442 if (FoundAnyExtend
) {
4443 SDValue NewX
= CurDAG
->getNode(ISD::ANY_EXTEND
, dl
, NVT
, X
);
4444 insertDAGNode(*CurDAG
, SDValue(N
, 0), NewX
);
4448 SDValue NewCst
= CurDAG
->getConstant(ShiftedVal
, dl
, NVT
);
4449 insertDAGNode(*CurDAG
, SDValue(N
, 0), NewCst
);
4450 SDValue NewBinOp
= CurDAG
->getNode(Opcode
, dl
, NVT
, X
, NewCst
);
4451 insertDAGNode(*CurDAG
, SDValue(N
, 0), NewBinOp
);
4452 SDValue NewSHL
= CurDAG
->getNode(ISD::SHL
, dl
, NVT
, NewBinOp
,
4453 Shift
.getOperand(1));
4454 ReplaceNode(N
, NewSHL
.getNode());
4455 SelectCode(NewSHL
.getNode());
4459 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode
*Root
, SDNode
*ParentA
,
4460 SDNode
*ParentB
, SDNode
*ParentC
,
4461 SDValue A
, SDValue B
, SDValue C
,
4463 assert(A
.isOperandOf(ParentA
) && B
.isOperandOf(ParentB
) &&
4464 C
.isOperandOf(ParentC
) && "Incorrect parent node");
4466 auto tryFoldLoadOrBCast
=
4467 [this](SDNode
*Root
, SDNode
*P
, SDValue
&L
, SDValue
&Base
, SDValue
&Scale
,
4468 SDValue
&Index
, SDValue
&Disp
, SDValue
&Segment
) {
4469 if (tryFoldLoad(Root
, P
, L
, Base
, Scale
, Index
, Disp
, Segment
))
4472 // Not a load, check for broadcast which may be behind a bitcast.
4473 if (L
.getOpcode() == ISD::BITCAST
&& L
.hasOneUse()) {
4475 L
= L
.getOperand(0);
4478 if (L
.getOpcode() != X86ISD::VBROADCAST_LOAD
)
4481 // Only 32 and 64 bit broadcasts are supported.
4482 auto *MemIntr
= cast
<MemIntrinsicSDNode
>(L
);
4483 unsigned Size
= MemIntr
->getMemoryVT().getSizeInBits();
4484 if (Size
!= 32 && Size
!= 64)
4487 return tryFoldBroadcast(Root
, P
, L
, Base
, Scale
, Index
, Disp
, Segment
);
4490 bool FoldedLoad
= false;
4491 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
4492 if (tryFoldLoadOrBCast(Root
, ParentC
, C
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
)) {
4494 } else if (tryFoldLoadOrBCast(Root
, ParentA
, A
, Tmp0
, Tmp1
, Tmp2
, Tmp3
,
4498 // Swap bits 1/4 and 3/6.
4499 uint8_t OldImm
= Imm
;
4500 Imm
= OldImm
& 0xa5;
4501 if (OldImm
& 0x02) Imm
|= 0x10;
4502 if (OldImm
& 0x10) Imm
|= 0x02;
4503 if (OldImm
& 0x08) Imm
|= 0x40;
4504 if (OldImm
& 0x40) Imm
|= 0x08;
4505 } else if (tryFoldLoadOrBCast(Root
, ParentB
, B
, Tmp0
, Tmp1
, Tmp2
, Tmp3
,
4509 // Swap bits 1/2 and 5/6.
4510 uint8_t OldImm
= Imm
;
4511 Imm
= OldImm
& 0x99;
4512 if (OldImm
& 0x02) Imm
|= 0x04;
4513 if (OldImm
& 0x04) Imm
|= 0x02;
4514 if (OldImm
& 0x20) Imm
|= 0x40;
4515 if (OldImm
& 0x40) Imm
|= 0x20;
4520 SDValue TImm
= CurDAG
->getTargetConstant(Imm
, DL
, MVT::i8
);
4522 MVT NVT
= Root
->getSimpleValueType(0);
4524 MachineSDNode
*MNode
;
4526 SDVTList VTs
= CurDAG
->getVTList(NVT
, MVT::Other
);
4529 if (C
.getOpcode() == X86ISD::VBROADCAST_LOAD
) {
4530 auto *MemIntr
= cast
<MemIntrinsicSDNode
>(C
);
4531 unsigned EltSize
= MemIntr
->getMemoryVT().getSizeInBits();
4532 assert((EltSize
== 32 || EltSize
== 64) && "Unexpected broadcast size!");
4534 bool UseD
= EltSize
== 32;
4535 if (NVT
.is128BitVector())
4536 Opc
= UseD
? X86::VPTERNLOGDZ128rmbi
: X86::VPTERNLOGQZ128rmbi
;
4537 else if (NVT
.is256BitVector())
4538 Opc
= UseD
? X86::VPTERNLOGDZ256rmbi
: X86::VPTERNLOGQZ256rmbi
;
4539 else if (NVT
.is512BitVector())
4540 Opc
= UseD
? X86::VPTERNLOGDZrmbi
: X86::VPTERNLOGQZrmbi
;
4542 llvm_unreachable("Unexpected vector size!");
4544 bool UseD
= NVT
.getVectorElementType() == MVT::i32
;
4545 if (NVT
.is128BitVector())
4546 Opc
= UseD
? X86::VPTERNLOGDZ128rmi
: X86::VPTERNLOGQZ128rmi
;
4547 else if (NVT
.is256BitVector())
4548 Opc
= UseD
? X86::VPTERNLOGDZ256rmi
: X86::VPTERNLOGQZ256rmi
;
4549 else if (NVT
.is512BitVector())
4550 Opc
= UseD
? X86::VPTERNLOGDZrmi
: X86::VPTERNLOGQZrmi
;
4552 llvm_unreachable("Unexpected vector size!");
4555 SDValue Ops
[] = {A
, B
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
, TImm
, C
.getOperand(0)};
4556 MNode
= CurDAG
->getMachineNode(Opc
, DL
, VTs
, Ops
);
4558 // Update the chain.
4559 ReplaceUses(C
.getValue(1), SDValue(MNode
, 1));
4560 // Record the mem-refs
4561 CurDAG
->setNodeMemRefs(MNode
, {cast
<MemSDNode
>(C
)->getMemOperand()});
4563 bool UseD
= NVT
.getVectorElementType() == MVT::i32
;
4565 if (NVT
.is128BitVector())
4566 Opc
= UseD
? X86::VPTERNLOGDZ128rri
: X86::VPTERNLOGQZ128rri
;
4567 else if (NVT
.is256BitVector())
4568 Opc
= UseD
? X86::VPTERNLOGDZ256rri
: X86::VPTERNLOGQZ256rri
;
4569 else if (NVT
.is512BitVector())
4570 Opc
= UseD
? X86::VPTERNLOGDZrri
: X86::VPTERNLOGQZrri
;
4572 llvm_unreachable("Unexpected vector size!");
4574 MNode
= CurDAG
->getMachineNode(Opc
, DL
, NVT
, {A
, B
, C
, TImm
});
4577 ReplaceUses(SDValue(Root
, 0), SDValue(MNode
, 0));
4578 CurDAG
->RemoveDeadNode(Root
);
// Try to match two logic ops to a VPTERNLOG.
// FIXME: Handle more complex patterns that use an operand more than once?
bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
      NVT.getVectorElementType() == MVT::i1)
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  auto getFoldableLogicOp = [](SDValue Op) {
    // Peek through single use bitcast.
    if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
      Op = Op.getOperand(0);

    if (!Op.hasOneUse())
      return SDValue();

    unsigned Opc = Op.getOpcode();
    if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
        Opc == X86ISD::ANDNP)
      return Op;

    return SDValue();
  };

  SDValue A, FoldableOp;
  if ((FoldableOp = getFoldableLogicOp(N1))) {
    A = N0;
  } else if ((FoldableOp = getFoldableLogicOp(N0))) {
    A = N1;
  } else
    return false;

  SDValue B = FoldableOp.getOperand(0);
  SDValue C = FoldableOp.getOperand(1);
  SDNode *ParentA = N;
  SDNode *ParentB = FoldableOp.getNode();
  SDNode *ParentC = FoldableOp.getNode();

  // We can build the appropriate control immediate by performing the logic
  // operation we're matching using these constants for A, B, and C.
  uint8_t TernlogMagicA = 0xf0;
  uint8_t TernlogMagicB = 0xcc;
  uint8_t TernlogMagicC = 0xaa;
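  //
  // Worked example (illustrative): for N == (A | (B & C)), the inner AND
  // gives Imm = 0xcc & 0xaa = 0x88, and the outer OR then ORs in 0xf0,
  // giving 0xf8, which is exactly the truth table of A | (B & C) over these
  // magic constants.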
  // Some of the inputs may be inverted, peek through them and invert the
  // magic values accordingly.
  // TODO: There may be a bitcast before the xor that we should peek through.
  auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
    if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
        ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
      Magic = ~Magic;
      Parent = Op.getNode();
      Op = Op.getOperand(0);
    }
  };

  PeekThroughNot(A, ParentA, TernlogMagicA);
  PeekThroughNot(B, ParentB, TernlogMagicB);
  PeekThroughNot(C, ParentC, TernlogMagicC);

  uint8_t Imm;
  switch (FoldableOp.getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case ISD::AND:      Imm = TernlogMagicB & TernlogMagicC; break;
  case ISD::OR:       Imm = TernlogMagicB | TernlogMagicC; break;
  case ISD::XOR:      Imm = TernlogMagicB ^ TernlogMagicC; break;
  case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
  }

  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::ANDNP:
    if (A == N0)
      Imm &= ~TernlogMagicA;
    else
      Imm = ~(Imm) & TernlogMagicA;
    break;
  case ISD::AND: Imm &= TernlogMagicA; break;
  case ISD::OR:  Imm |= TernlogMagicA; break;
  case ISD::XOR: Imm ^= TernlogMagicA; break;
  }

  return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
}
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
/// positive one. This reverses a transform in SimplifyDemandedBits that
/// shrinks mask constants by clearing bits. There is also a possibility that
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
/// case, just replace the 'and'. Return 'true' if the node is replaced.
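///
/// For example (illustrative only): "x & 0x0ffffff0" on i32 needs a 4-byte
/// immediate, but if the top 4 bits of x are already known to be zero the
/// mask can be widened to 0xfffffff0 == -16, which encodes as a
/// sign-extended 8-bit immediate.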
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
  // have immediate operands.
  MVT VT = And->getSimpleValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
  if (!And1C)
    return false;

  // Bail out if the mask constant is already negative. It can't shrink more.
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
  // are negative too.
  APInt MaskVal = And1C->getAPIntValue();
  unsigned MaskLZ = MaskVal.countl_zero();
  if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
    return false;

  // Don't extend into the upper 32 bits of a 64 bit mask.
  if (VT == MVT::i64 && MaskLZ >= 32) {
    MaskLZ -= 32;
    MaskVal = MaskVal.trunc(32);
  }

  SDValue And0 = And->getOperand(0);
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
  APInt NegMaskVal = MaskVal | HighZeros;

  // If a negative constant would not allow a smaller encoding, there's no need
  // to continue. Only change the constant when we know it's a win.
  unsigned MinWidth = NegMaskVal.getSignificantBits();
  if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
    return false;

  // Extend masks if we truncated above.
  if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
    NegMaskVal = NegMaskVal.zext(64);
    HighZeros = HighZeros.zext(64);
  }

  // The variable operand must be all zeros in the top bits to allow using the
  // new, negative constant as the mask.
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
    return false;

  // Check if the mask is -1. In that case, this is an unnecessary instruction
  // that escaped earlier analysis.
  if (NegMaskVal.isAllOnes()) {
    ReplaceNode(And, And0.getNode());
    return true;
  }

  // A negative mask allows a smaller encoding. Create a new 'and' node.
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
  insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
  ReplaceNode(And, NewAnd.getNode());
  SelectCode(NewAnd.getNode());
  return true;
}
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
                              bool FoldedBCast, bool Masked) {
#define VPTESTM_CASE(VT, SUFFIX) \
case MVT::VT: \
  if (Masked) \
    return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
  return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;

#define VPTESTM_BROADCAST_CASES(SUFFIX) \
default: llvm_unreachable("Unexpected VT!"); \
VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
VPTESTM_CASE(v16i32, DZ##SUFFIX) \
VPTESTM_CASE(v8i64, QZ##SUFFIX)

#define VPTESTM_FULL_CASES(SUFFIX) \
VPTESTM_BROADCAST_CASES(SUFFIX) \
VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
VPTESTM_CASE(v64i8, BZ##SUFFIX) \
VPTESTM_CASE(v32i16, WZ##SUFFIX)

  if (FoldedBCast) {
    switch (TestVT.SimpleTy) {
    VPTESTM_BROADCAST_CASES(rmb)
    }
  }

  if (FoldedLoad) {
    switch (TestVT.SimpleTy) {
    VPTESTM_FULL_CASES(rm)
    }
  }

  switch (TestVT.SimpleTy) {
  VPTESTM_FULL_CASES(rr)
  }

#undef VPTESTM_FULL_CASES
#undef VPTESTM_BROADCAST_CASES
#undef VPTESTM_CASE
}
4795 // Try to create VPTESTM instruction. If InMask is not null, it will be used
4796 // to form a masked operation.
4797 bool X86DAGToDAGISel::tryVPTESTM(SDNode
*Root
, SDValue Setcc
,
4799 assert(Subtarget
->hasAVX512() && "Expected AVX512!");
4800 assert(Setcc
.getSimpleValueType().getVectorElementType() == MVT::i1
&&
4803 // Look for equal and not equal compares.
4804 ISD::CondCode CC
= cast
<CondCodeSDNode
>(Setcc
.getOperand(2))->get();
4805 if (CC
!= ISD::SETEQ
&& CC
!= ISD::SETNE
)
4808 SDValue SetccOp0
= Setcc
.getOperand(0);
4809 SDValue SetccOp1
= Setcc
.getOperand(1);
4811 // Canonicalize the all zero vector to the RHS.
4812 if (ISD::isBuildVectorAllZeros(SetccOp0
.getNode()))
4813 std::swap(SetccOp0
, SetccOp1
);
4815 // See if we're comparing against zero.
4816 if (!ISD::isBuildVectorAllZeros(SetccOp1
.getNode()))
4819 SDValue N0
= SetccOp0
;
4821 MVT CmpVT
= N0
.getSimpleValueType();
4822 MVT CmpSVT
= CmpVT
.getVectorElementType();
4824 // Start with both operands the same. We'll try to refine this.
4829 // Look through single use bitcasts.
4830 SDValue N0Temp
= N0
;
4831 if (N0Temp
.getOpcode() == ISD::BITCAST
&& N0Temp
.hasOneUse())
4832 N0Temp
= N0
.getOperand(0);
4834 // Look for single use AND.
4835 if (N0Temp
.getOpcode() == ISD::AND
&& N0Temp
.hasOneUse()) {
4836 Src0
= N0Temp
.getOperand(0);
4837 Src1
= N0Temp
.getOperand(1);
4841 // Without VLX we need to widen the operation.
4842 bool Widen
= !Subtarget
->hasVLX() && !CmpVT
.is512BitVector();
4844 auto tryFoldLoadOrBCast
= [&](SDNode
*Root
, SDNode
*P
, SDValue
&L
,
4845 SDValue
&Base
, SDValue
&Scale
, SDValue
&Index
,
4846 SDValue
&Disp
, SDValue
&Segment
) {
4847 // If we need to widen, we can't fold the load.
4849 if (tryFoldLoad(Root
, P
, L
, Base
, Scale
, Index
, Disp
, Segment
))
4852 // If we didn't fold a load, try to match broadcast. No widening limitation
4853 // for this. But only 32 and 64 bit types are supported.
4854 if (CmpSVT
!= MVT::i32
&& CmpSVT
!= MVT::i64
)
4857 // Look through single use bitcasts.
4858 if (L
.getOpcode() == ISD::BITCAST
&& L
.hasOneUse()) {
4860 L
= L
.getOperand(0);
4863 if (L
.getOpcode() != X86ISD::VBROADCAST_LOAD
)
4866 auto *MemIntr
= cast
<MemIntrinsicSDNode
>(L
);
4867 if (MemIntr
->getMemoryVT().getSizeInBits() != CmpSVT
.getSizeInBits())
4870 return tryFoldBroadcast(Root
, P
, L
, Base
, Scale
, Index
, Disp
, Segment
);
4873 // We can only fold loads if the sources are unique.
4874 bool CanFoldLoads
= Src0
!= Src1
;
4876 bool FoldedLoad
= false;
4877 SDValue Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
;
4879 FoldedLoad
= tryFoldLoadOrBCast(Root
, N0
.getNode(), Src1
, Tmp0
, Tmp1
, Tmp2
,
4882 // And is commutative.
4883 FoldedLoad
= tryFoldLoadOrBCast(Root
, N0
.getNode(), Src0
, Tmp0
, Tmp1
,
4886 std::swap(Src0
, Src1
);
4890 bool FoldedBCast
= FoldedLoad
&& Src1
.getOpcode() == X86ISD::VBROADCAST_LOAD
;
4892 bool IsMasked
= InMask
.getNode() != nullptr;
4896 MVT ResVT
= Setcc
.getSimpleValueType();
4899 // Widen the inputs using insert_subreg or copy_to_regclass.
4900 unsigned Scale
= CmpVT
.is128BitVector() ? 4 : 2;
4901 unsigned SubReg
= CmpVT
.is128BitVector() ? X86::sub_xmm
: X86::sub_ymm
;
4902 unsigned NumElts
= CmpVT
.getVectorNumElements() * Scale
;
4903 CmpVT
= MVT::getVectorVT(CmpSVT
, NumElts
);
4904 MaskVT
= MVT::getVectorVT(MVT::i1
, NumElts
);
4905 SDValue ImplDef
= SDValue(CurDAG
->getMachineNode(X86::IMPLICIT_DEF
, dl
,
4907 Src0
= CurDAG
->getTargetInsertSubreg(SubReg
, dl
, CmpVT
, ImplDef
, Src0
);
4910 Src1
= CurDAG
->getTargetInsertSubreg(SubReg
, dl
, CmpVT
, ImplDef
, Src1
);
4914 unsigned RegClass
= TLI
->getRegClassFor(MaskVT
)->getID();
4915 SDValue RC
= CurDAG
->getTargetConstant(RegClass
, dl
, MVT::i32
);
4916 InMask
= SDValue(CurDAG
->getMachineNode(TargetOpcode::COPY_TO_REGCLASS
,
4917 dl
, MaskVT
, InMask
, RC
), 0);
4921 bool IsTestN
= CC
== ISD::SETEQ
;
4922 unsigned Opc
= getVPTESTMOpc(CmpVT
, IsTestN
, FoldedLoad
, FoldedBCast
,
4925 MachineSDNode
*CNode
;
4927 SDVTList VTs
= CurDAG
->getVTList(MaskVT
, MVT::Other
);
4930 SDValue Ops
[] = { InMask
, Src0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
,
4931 Src1
.getOperand(0) };
4932 CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
4934 SDValue Ops
[] = { Src0
, Tmp0
, Tmp1
, Tmp2
, Tmp3
, Tmp4
,
4935 Src1
.getOperand(0) };
4936 CNode
= CurDAG
->getMachineNode(Opc
, dl
, VTs
, Ops
);
4939 // Update the chain.
4940 ReplaceUses(Src1
.getValue(1), SDValue(CNode
, 1));
4941 // Record the mem-refs
4942 CurDAG
->setNodeMemRefs(CNode
, {cast
<MemSDNode
>(Src1
)->getMemOperand()});
4945 CNode
= CurDAG
->getMachineNode(Opc
, dl
, MaskVT
, InMask
, Src0
, Src1
);
4947 CNode
= CurDAG
->getMachineNode(Opc
, dl
, MaskVT
, Src0
, Src1
);
4950 // If we widened, we need to shrink the mask VT.
4952 unsigned RegClass
= TLI
->getRegClassFor(ResVT
)->getID();
4953 SDValue RC
= CurDAG
->getTargetConstant(RegClass
, dl
, MVT::i32
);
4954 CNode
= CurDAG
->getMachineNode(TargetOpcode::COPY_TO_REGCLASS
,
4955 dl
, ResVT
, SDValue(CNode
, 0), RC
);
4958 ReplaceUses(SDValue(Root
, 0), SDValue(CNode
, 0));
4959 CurDAG
->RemoveDeadNode(Root
);
// Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
// into vpternlog.
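//
// (Illustrative derivation: with the canonical VPTERNLOG operand constants
// A = 0xf0, B = 0xcc, C = 0xaa, the bit-select (A & B) | (~A & C) evaluates to
// (0xf0 & 0xcc) | (0x0f & 0xaa) = 0xc0 | 0x0a = 0xca, which is the 0xCA
// immediate used below.)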
bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
  assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");

  MVT NVT = N->getSimpleValueType(0);

  // Make sure we support VPTERNLOG.
  if (!NVT.isVector() || !Subtarget->hasAVX512())
    return false;

  // We need VLX for 128/256-bit.
  if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  if (N0.getOpcode() != ISD::AND ||
      N1.getOpcode() != X86ISD::ANDNP ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return false;

  // ANDN is not commutable, use it to pick down A and C.
  SDValue A = N1.getOperand(0);
  SDValue C = N1.getOperand(1);

  // AND is commutable, if one operand matches A, the other operand is B.
  // Otherwise this isn't a match.
  SDValue B;
  if (N0.getOperand(0) == A)
    B = N0.getOperand(1);
  else if (N0.getOperand(1) == A)
    B = N0.getOperand(0);
  else
    return false;

  SDLoc dl(N);
  SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
  SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
  ReplaceNode(N, Ternlog.getNode());

  return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
                        Ternlog.getNode(), A, B, C, 0xCA);
}
5013 void X86DAGToDAGISel::Select(SDNode
*Node
) {
5014 MVT NVT
= Node
->getSimpleValueType(0);
5015 unsigned Opcode
= Node
->getOpcode();
5018 if (Node
->isMachineOpcode()) {
5019 LLVM_DEBUG(dbgs() << "== "; Node
->dump(CurDAG
); dbgs() << '\n');
5020 Node
->setNodeId(-1);
5021 return; // Already selected.
5026 case ISD::INTRINSIC_W_CHAIN
: {
5027 unsigned IntNo
= Node
->getConstantOperandVal(1);
5030 case Intrinsic::x86_encodekey128
:
5031 case Intrinsic::x86_encodekey256
: {
5032 if (!Subtarget
->hasKL())
5037 default: llvm_unreachable("Impossible intrinsic");
5038 case Intrinsic::x86_encodekey128
: Opcode
= X86::ENCODEKEY128
; break;
5039 case Intrinsic::x86_encodekey256
: Opcode
= X86::ENCODEKEY256
; break;
5042 SDValue Chain
= Node
->getOperand(0);
5043 Chain
= CurDAG
->getCopyToReg(Chain
, dl
, X86::XMM0
, Node
->getOperand(3),
5045 if (Opcode
== X86::ENCODEKEY256
)
5046 Chain
= CurDAG
->getCopyToReg(Chain
, dl
, X86::XMM1
, Node
->getOperand(4),
5049 MachineSDNode
*Res
= CurDAG
->getMachineNode(
5050 Opcode
, dl
, Node
->getVTList(),
5051 {Node
->getOperand(2), Chain
, Chain
.getValue(1)});
5052 ReplaceNode(Node
, Res
);
5055 case Intrinsic::x86_tileloadd64_internal
:
5056 case Intrinsic::x86_tileloaddt164_internal
: {
5057 if (!Subtarget
->hasAMXTILE())
5059 unsigned Opc
= IntNo
== Intrinsic::x86_tileloadd64_internal
5061 : X86::PTILELOADDT1V
;
5062 // _tile_loadd_internal(row, col, buf, STRIDE)
5063 SDValue Base
= Node
->getOperand(4);
5064 SDValue Scale
= getI8Imm(1, dl
);
5065 SDValue Index
= Node
->getOperand(5);
5066 SDValue Disp
= CurDAG
->getTargetConstant(0, dl
, MVT::i32
);
5067 SDValue Segment
= CurDAG
->getRegister(0, MVT::i16
);
5068 SDValue Chain
= Node
->getOperand(0);
5069 MachineSDNode
*CNode
;
5070 SDValue Ops
[] = {Node
->getOperand(2),
5071 Node
->getOperand(3),
5078 CNode
= CurDAG
->getMachineNode(Opc
, dl
, {MVT::x86amx
, MVT::Other
}, Ops
);
5079 ReplaceNode(Node
, CNode
);
5085 case ISD::INTRINSIC_VOID
: {
5086 unsigned IntNo
= Node
->getConstantOperandVal(1);
5089 case Intrinsic::x86_sse3_monitor
:
5090 case Intrinsic::x86_monitorx
:
5091 case Intrinsic::x86_clzero
: {
5092 bool Use64BitPtr
= Node
->getOperand(2).getValueType() == MVT::i64
;
5096 default: llvm_unreachable("Unexpected intrinsic!");
5097 case Intrinsic::x86_sse3_monitor
:
5098 if (!Subtarget
->hasSSE3())
5100 Opc
= Use64BitPtr
? X86::MONITOR64rrr
: X86::MONITOR32rrr
;
5102 case Intrinsic::x86_monitorx
:
5103 if (!Subtarget
->hasMWAITX())
5105 Opc
= Use64BitPtr
? X86::MONITORX64rrr
: X86::MONITORX32rrr
;
5107 case Intrinsic::x86_clzero
:
5108 if (!Subtarget
->hasCLZERO())
5110 Opc
= Use64BitPtr
? X86::CLZERO64r
: X86::CLZERO32r
;
5115 unsigned PtrReg
= Use64BitPtr
? X86::RAX
: X86::EAX
;
5116 SDValue Chain
= CurDAG
->getCopyToReg(Node
->getOperand(0), dl
, PtrReg
,
5117 Node
->getOperand(2), SDValue());
5118 SDValue InGlue
= Chain
.getValue(1);
5120 if (IntNo
== Intrinsic::x86_sse3_monitor
||
5121 IntNo
== Intrinsic::x86_monitorx
) {
5122 // Copy the other two operands to ECX and EDX.
5123 Chain
= CurDAG
->getCopyToReg(Chain
, dl
, X86::ECX
, Node
->getOperand(3),
5125 InGlue
= Chain
.getValue(1);
5126 Chain
= CurDAG
->getCopyToReg(Chain
, dl
, X86::EDX
, Node
->getOperand(4),
5128 InGlue
= Chain
.getValue(1);
5131 MachineSDNode
*CNode
= CurDAG
->getMachineNode(Opc
, dl
, MVT::Other
,
5133 ReplaceNode(Node
, CNode
);
5139 case Intrinsic::x86_tilestored64_internal
: {
5140 unsigned Opc
= X86::PTILESTOREDV
;
5141 // _tile_stored_internal(row, col, buf, STRIDE, c)
5142 SDValue Base
= Node
->getOperand(4);
5143 SDValue Scale
= getI8Imm(1, dl
);
5144 SDValue Index
= Node
->getOperand(5);
5145 SDValue Disp
= CurDAG
->getTargetConstant(0, dl
, MVT::i32
);
5146 SDValue Segment
= CurDAG
->getRegister(0, MVT::i16
);
5147 SDValue Chain
= Node
->getOperand(0);
5148 MachineSDNode
*CNode
;
5149 SDValue Ops
[] = {Node
->getOperand(2),
5150 Node
->getOperand(3),
5156 Node
->getOperand(6),
5158 CNode
= CurDAG
->getMachineNode(Opc
, dl
, MVT::Other
, Ops
);
5159 ReplaceNode(Node
, CNode
);
    case Intrinsic::x86_tileloadd64:
    case Intrinsic::x86_tileloaddt164:
    case Intrinsic::x86_tilestored64: {
      if (!Subtarget->hasAMXTILE())
        break;
      unsigned Opc;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
      case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
      }
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      if (Opc == X86::PTILESTORED) {
        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      } else {
        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      }
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case X86ISD::NT_BRIND: {
    if (Subtarget->isTargetNaCl())
      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
      // leave the instruction alone.
      break;
    if (Subtarget->isTarget64BitILP32()) {
      // Converts a 32-bit register to a 64-bit, zero-extended version of
      // it. This is needed because x86-64 can do many things, but jmp %r32
      // ain't one of them.
      SDValue Target = Node->getOperand(1);
      assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
      SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
                                      Node->getOperand(0), ZextTarget);
      ReplaceNode(Node, Brind.getNode());
      SelectCode(ZextTarget.getNode());
      SelectCode(Brind.getNode());
      return;
    }
    break;
  }
  case X86ISD::GlobalBaseReg:
    ReplaceNode(Node, getGlobalBaseReg());
    return;

  case ISD::BITCAST:
    // Just drop all 128/256/512-bit bitcasts.
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case X86ISD::VPTERNLOG: {
    uint8_t Imm = Node->getConstantOperandVal(3);
    if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
                       Node->getOperand(1), Node->getOperand(2), Imm))
      return;
    break;
  }

  case X86ISD::ANDNP:
    if (tryVPTERNLOG(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    [[fallthrough]];
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;
    if (tryVPTERNLOG(Node))
      return;

    [[fallthrough]];
  case ISD::ADD:
    if (Opcode == ISD::ADD && matchBitExtract(Node))
      return;
    [[fallthrough]];
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the patterns on the add/sub/and/or/xor with immediate patterns in the
    // tablegen files to check immediate use count without making the patterns
    // unavailable to the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 &&
        NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    auto *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
      case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
      case ISD::OR:  ROpc = X86::OR8rr;  MOpc = X86::OR8rm;  break;
      case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
      case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
      case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
      case ISD::OR:  ROpc = X86::OR32rr;  MOpc = X86::OR32rm;  break;
      case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
      case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
      case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
      case ISD::OR:  ROpc = X86::OR64rr;  MOpc = X86::OR64rm;  break;
      case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
      }
      break;
    }

    // Ok this is an AND/OR/XOR/ADD/SUB with constant.

    // If this is not a subtract, we can still try to fold a load.
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }

  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    [[fallthrough]];
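  // SMUL only reaches here for i8. MUL/IMUL read one operand implicitly from
  // AL/AX/EAX/RAX, so the LHS is copied into that register below, and a load
  // on the other operand may be folded into the memory form.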
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result
      // even though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result
      // even though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    unsigned LoReg, HiReg;
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
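    // With BMI2, MULX takes one operand implicitly in EDX/RDX, does not
    // clobber EFLAGS, and writes both halves to explicit registers; the
    // MULX*Hrr/Hrm forms are used when only the high half of the product is
    // needed.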
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc = UseMULXHi    ? X86::MULX32Hrr
            : UseMULX    ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
            : IsSigned   ? X86::IMUL32r
                         : X86::MUL32r;
      MOpc = UseMULXHi   ? X86::MULX32Hrm
             : UseMULX   ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
             : IsSigned  ? X86::IMUL32m
                         : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc = UseMULXHi    ? X86::MULX64Hrr
            : UseMULX    ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
            : IsSigned   ? X86::IMUL64r
                         : X86::MUL64r;
      MOpc = UseMULXHi   ? X86::MULX64Hrm
             : UseMULX   ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
             : IsSigned  ? X86::IMUL64m
                         : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }
#undef GET_EGPR_IF_ENABLED

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InGlue = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InGlue };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        InGlue = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                       NVT, InGlue);
        InGlue = ResLo.getValue(2);
      }
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                       NVT, InGlue);
        InGlue = ResHi.getValue(2);
      }
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }
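    // DIV/IDIV take the dividend implicitly in AX (for i8) or in the
    // HiReg:LoReg pair (DX:AX, EDX:EAX, RDX:RAX) and return the quotient in
    // the low register and the remainder in the high register. Pick the
    // register pair and the sign-extension opcode used for signed division.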
    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InGlue;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                   SDValue());
      InGlue = Chain.getValue(1);
    } else {
      InGlue =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                               LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InGlue =
            SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),
                    0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode = SDValue(
            CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, std::nullopt), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InGlue).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InGlue };
      MachineSDNode *CNode =
          CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InGlue = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InGlue =
          SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InGlue);
      SDValue Result(RNode, 0);
      InGlue = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              LoReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InGlue);
      InGlue = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->canUseCMOV())
      break;

    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->canUseLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!MaskC)
        break;

      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      uint64_t Mask = MaskC->getZExtValue();
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM{32,64} with a shift. This is possible
      // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
      // zero flag.
      if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        unsigned ShiftOpcode = ISD::DELETED_NODE;
        unsigned ShiftAmt;
        unsigned SubRegIdx;
        MVT SubRegVT;
        unsigned TestOpcode;
        unsigned LeadingZeros = llvm::countl_zero(Mask);
        unsigned TrailingZeros = llvm::countr_zero(Mask);

        // With leading/trailing zeros, the transform is profitable if we can
        // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
        // incurring any extra register moves.
        bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
        if (LeadingZeros == 0 && SavesBytes) {
          // If the mask covers the most significant bit, then we can replace
          // TEST+AND with a SHR and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = X86::SHR64ri;
          ShiftAmt = TrailingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (TrailingZeros == 0 && SavesBytes) {
          // If the mask covers the least significant bit, then we can replace
          // TEST+AND with a SHL and check eflags.
          // This emits a redundant TEST which is subsequently eliminated.
          ShiftOpcode = X86::SHL64ri;
          ShiftAmt = LeadingZeros;
          SubRegIdx = 0;
          TestOpcode = X86::TEST64rr;
        } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
          // If the shifted mask extends into the high half and is 8/16/32 bits
          // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
          unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
          if (PopCount == 8) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_8bit;
            SubRegVT = MVT::i8;
            TestOpcode = X86::TEST8rr;
          } else if (PopCount == 16) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_16bit;
            SubRegVT = MVT::i16;
            TestOpcode = X86::TEST16rr;
          } else if (PopCount == 32) {
            ShiftOpcode = X86::SHR64ri;
            ShiftAmt = TrailingZeros;
            SubRegIdx = X86::sub_32bit;
            SubRegVT = MVT::i32;
            TestOpcode = X86::TEST32rr;
          }
        }
        if (ShiftOpcode != ISD::DELETED_NODE) {
          SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
          SDValue Shift = SDValue(
              CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
                                     N0.getOperand(0), ShiftC),
              0);
          if (SubRegIdx != 0) {
            Shift =
                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
          }
          MachineSDNode *Test =
              CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two conditions,
      // either the sign bit in the shrunken mask is zero or the final test
      // size is equal to the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

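    // The PCMPISTR node produces an index, a mask and EFLAGS. Emit PCMPISTRM
    // and/or PCMPISTRI depending on which results actually have uses; both
    // instructions set the same flags.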
    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InGlue).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InGlue);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

  case X86ISD::SETCC_CARRY: {
    MVT VT = Node->getSimpleValueType(0);
    SDValue Result;
    if (Subtarget->hasSBBDepBreaking()) {
      // We have to do this manually because tblgen will put the eflags copy in
      // the wrong place if we use an extract_subreg in the pattern.
      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(1), SDValue());

      // Create a 64-bit instruction if the result is 64-bits otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
      MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      Result = SDValue(
          CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
          0);
    } else {
      // The target does not recognize sbb with the same reg operand as a
      // no-source idiom, so we explicitly zero the input values.
      Result = getSBBZero(Node);
    }

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      SDValue Result = getSBBZero(Node);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        MVT VT = Node->getSimpleValueType(0);
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking in here based on what a
    // type constraint would say, just like table based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
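    // Select the gather opcode from the index type, the number of destination
    // elements and the element size. A vXi1 mask selects the AVX-512 (EVEX)
    // forms; otherwise the AVX2 (VEX) forms, which take the mask as a vector
    // register operand, are used.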
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base, Scale,
                       Index, Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base, Scale, Index,
                       Disp, Segment, Mask, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow.
    // We're otherwise only doing loose type checking in here based on what a
    // type constraint would say, just like table based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
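    // Scatters are AVX-512 only, so the opcode is selected purely from the
    // index type, the element count and the element size.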
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output not in the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
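  // The preallocated nodes carry the call site as an IR value; look up the
  // id assigned to that call site in X86MachineFunctionInfo and emit the
  // PREALLOCATED_SETUP/PREALLOCATED_ARG pseudos with that id.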
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;

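    // The wide Key Locker instructions process eight blocks that are passed
    // implicitly in XMM0-XMM7, so copy operands 2-9 into those registers and
    // glue the copies to the instruction.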
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}

bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1, Op2, Op3, Op4;
  switch (ConstraintID) {
  default:
    llvm_unreachable("Unexpected asm memory constraint");
  case InlineAsm::ConstraintCode::o: // offsetable        ??
  case InlineAsm::ConstraintCode::v: // not offsetable    ??
  case InlineAsm::ConstraintCode::m: // memory
  case InlineAsm::ConstraintCode::X:
  case InlineAsm::ConstraintCode::p: // address
    if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
      return true;
    break;
  }

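  // All memory constraints lower to the five standard X86 address operands
  // (base, scale, index, displacement, segment) produced by selectAddr above.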
  OutOps.push_back(Op0);
  OutOps.push_back(Op1);
  OutOps.push_back(Op2);
  OutOps.push_back(Op3);
  OutOps.push_back(Op4);
  return false;
}

/// This pass converts a legalized DAG into an X86-specific DAG,
/// ready for instruction scheduling.
FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
                                     CodeGenOptLevel OptLevel) {
  return new X86DAGToDAGISel(TM, OptLevel);
}