1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to an X86 dag.
12 //===----------------------------------------------------------------------===//
14 #include "X86.h"
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/SelectionDAGISel.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Support/MathExtras.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Target/TargetMachine.h"
35 #include "llvm/Target/TargetOptions.h"
36 #include <stdint.h>
37 using namespace llvm;
39 #define DEBUG_TYPE "x86-isel"
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
47 //===----------------------------------------------------------------------===//
48 // Pattern Matcher Implementation
49 //===----------------------------------------------------------------------===//
51 namespace {
52 /// This corresponds to X86AddressMode, but uses SDValues instead of register
53 /// numbers for the leaves of the matched tree.
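/// The matched form corresponds to the general x86 memory operand
/// Segment:[Base + Index*Scale + Disp], where any of the pieces may be
/// absent; e.g. a plain (add %reg, 8) match fills in only Base_Reg and Disp.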
54 struct X86ISelAddressMode {
55 enum {
56 RegBase,
57 FrameIndexBase
58 } BaseType;
60 // This is really a union, discriminated by BaseType!
61 SDValue Base_Reg;
62 int Base_FrameIndex;
64 unsigned Scale;
65 SDValue IndexReg;
66 int32_t Disp;
67 SDValue Segment;
68 const GlobalValue *GV;
69 const Constant *CP;
70 const BlockAddress *BlockAddr;
71 const char *ES;
72 MCSymbol *MCSym;
73 int JT;
74 unsigned Align; // CP alignment.
75 unsigned char SymbolFlags; // X86II::MO_*
76 bool NegateIndex = false;
78 X86ISelAddressMode()
79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81 MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
83 bool hasSymbolicDisplacement() const {
84 return GV != nullptr || CP != nullptr || ES != nullptr ||
85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 bool hasBaseOrIndexReg() const {
89 return BaseType == FrameIndexBase ||
90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 /// Return true if this addressing mode is already RIP-relative.
94 bool isRIPRelative() const {
95 if (BaseType != RegBase) return false;
96 if (RegisterSDNode *RegNode =
97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98 return RegNode->getReg() == X86::RIP;
99 return false;
102 void setBaseReg(SDValue Reg) {
103 BaseType = RegBase;
104 Base_Reg = Reg;
107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108 void dump(SelectionDAG *DAG = nullptr) {
109 dbgs() << "X86ISelAddressMode " << this << '\n';
110 dbgs() << "Base_Reg ";
111 if (Base_Reg.getNode())
112 Base_Reg.getNode()->dump(DAG);
113 else
114 dbgs() << "nul\n";
115 if (BaseType == FrameIndexBase)
116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117 dbgs() << " Scale " << Scale << '\n'
118 << "IndexReg ";
119 if (NegateIndex)
120 dbgs() << "negate ";
121 if (IndexReg.getNode())
122 IndexReg.getNode()->dump(DAG);
123 else
124 dbgs() << "nul\n";
125 dbgs() << " Disp " << Disp << '\n'
126 << "GV ";
127 if (GV)
128 GV->dump();
129 else
130 dbgs() << "nul";
131 dbgs() << " CP ";
132 if (CP)
133 CP->dump();
134 else
135 dbgs() << "nul";
136 dbgs() << '\n'
137 << "ES ";
138 if (ES)
139 dbgs() << ES;
140 else
141 dbgs() << "nul";
142 dbgs() << " MCSym ";
143 if (MCSym)
144 dbgs() << MCSym;
145 else
146 dbgs() << "nul";
147 dbgs() << " JT" << JT << " Align" << Align << '\n';
149 #endif
153 namespace {
154 //===--------------------------------------------------------------------===//
155 /// ISel - X86-specific code to select X86 machine instructions for
156 /// SelectionDAG operations.
158 class X86DAGToDAGISel final : public SelectionDAGISel {
159 /// Keep a pointer to the X86Subtarget around so that we can
160 /// make the right decision when generating code for different targets.
161 const X86Subtarget *Subtarget;
163 /// If true, selector should try to optimize for code size instead of
164 /// performance.
165 bool OptForSize;
167 /// If true, selector should try to optimize for minimum code size.
168 bool OptForMinSize;
170 /// Disable direct TLS access through segment registers.
171 bool IndirectTlsSegRefs;
173 public:
174 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
175 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
176 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178 StringRef getPassName() const override {
179 return "X86 DAG->DAG Instruction Selection";
182 bool runOnMachineFunction(MachineFunction &MF) override {
183 // Reset the subtarget each time through.
184 Subtarget = &MF.getSubtarget<X86Subtarget>();
185 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
186 "indirect-tls-seg-refs");
188 // OptFor[Min]Size are used in pattern predicates that isel is matching.
189 OptForSize = MF.getFunction().hasOptSize();
190 OptForMinSize = MF.getFunction().hasMinSize();
191 assert((!OptForMinSize || OptForSize) &&
192 "OptForMinSize implies OptForSize");
194 SelectionDAGISel::runOnMachineFunction(MF);
195 return true;
198 void EmitFunctionEntryCode() override;
200 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
202 void PreprocessISelDAG() override;
203 void PostprocessISelDAG() override;
205 // Include the pieces autogenerated from the target description.
206 #include "X86GenDAGISel.inc"
208 private:
209 void Select(SDNode *N) override;
211 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
212 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
213 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
214 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
215 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
216 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221 SDValue &Scale, SDValue &Index, SDValue &Disp,
222 SDValue &Segment);
223 bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
224 SDValue &Scale, SDValue &Index, SDValue &Disp,
225 SDValue &Segment);
226 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227 bool selectLEAAddr(SDValue N, SDValue &Base,
228 SDValue &Scale, SDValue &Index, SDValue &Disp,
229 SDValue &Segment);
230 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231 SDValue &Scale, SDValue &Index, SDValue &Disp,
232 SDValue &Segment);
233 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234 SDValue &Scale, SDValue &Index, SDValue &Disp,
235 SDValue &Segment);
236 bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
237 SDValue &Base, SDValue &Scale,
238 SDValue &Index, SDValue &Disp,
239 SDValue &Segment,
240 SDValue &NodeWithChain);
241 bool selectRelocImm(SDValue N, SDValue &Op);
243 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment);
248 // Convenience method where P is also root.
249 bool tryFoldLoad(SDNode *P, SDValue N,
250 SDValue &Base, SDValue &Scale,
251 SDValue &Index, SDValue &Disp,
252 SDValue &Segment) {
253 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
256 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
257 SDValue &Base, SDValue &Scale,
258 SDValue &Index, SDValue &Disp,
259 SDValue &Segment);
261 /// Implement addressing mode selection for inline asm expressions.
262 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
263 unsigned ConstraintID,
264 std::vector<SDValue> &OutOps) override;
266 void emitSpecialCodeForMain();
268 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
269 MVT VT, SDValue &Base, SDValue &Scale,
270 SDValue &Index, SDValue &Disp,
271 SDValue &Segment) {
272 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
273 Base = CurDAG->getTargetFrameIndex(
274 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
275 else if (AM.Base_Reg.getNode())
276 Base = AM.Base_Reg;
277 else
278 Base = CurDAG->getRegister(0, VT);
280 Scale = getI8Imm(AM.Scale, DL);
282 // Negate the index if needed.
283 if (AM.NegateIndex) {
284 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
285 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
286 AM.IndexReg), 0);
287 AM.IndexReg = Neg;
290 if (AM.IndexReg.getNode())
291 Index = AM.IndexReg;
292 else
293 Index = CurDAG->getRegister(0, VT);
295 // These are 32-bit even in 64-bit mode since RIP-relative offset
296 // is 32-bit.
297 if (AM.GV)
298 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
299 MVT::i32, AM.Disp,
300 AM.SymbolFlags);
301 else if (AM.CP)
302 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
303 AM.Align, AM.Disp, AM.SymbolFlags);
304 else if (AM.ES) {
305 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
306 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
307 } else if (AM.MCSym) {
308 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
309 assert(AM.SymbolFlags == 0 && "oo");
310 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
311 } else if (AM.JT != -1) {
312 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
313 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
314 } else if (AM.BlockAddr)
315 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
316 AM.SymbolFlags);
317 else
318 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
320 if (AM.Segment.getNode())
321 Segment = AM.Segment;
322 else
323 Segment = CurDAG->getRegister(0, MVT::i16);
326 // Utility function to determine whether we should avoid selecting
327 // immediate forms of instructions for better code size or not.
328 // At a high level, we'd like to avoid such instructions when
329 // we have similar constants used within the same basic block
330 // that can be kept in a register.
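// For example, with OptForSize, two uses of the same 32-bit constant such as
//   andl $0x12345678, %edx
//   andl $0x12345678, %esi
// are smaller if the constant is hoisted into a register once and reused,
// rather than repeating the 4-byte immediate in every instruction:
//   movl $0x12345678, %ecx
//   andl %ecx, %edx
//   andl %ecx, %esi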
332 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
333 uint32_t UseCount = 0;
335 // Do not want to hoist if we're not optimizing for size.
336 // TODO: We'd like to remove this restriction.
337 // See the comment in X86InstrInfo.td for more info.
338 if (!OptForSize)
339 return false;
341 // Walk all the users of the immediate.
342 for (SDNode::use_iterator UI = N->use_begin(),
343 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
345 SDNode *User = *UI;
347 // This user is already selected. Count it as a legitimate use and
348 // move on.
349 if (User->isMachineOpcode()) {
350 UseCount++;
351 continue;
354 // We want to count stores of immediates as real uses.
355 if (User->getOpcode() == ISD::STORE &&
356 User->getOperand(1).getNode() == N) {
357 UseCount++;
358 continue;
361 // We don't currently match users that have > 2 operands (except
362 // for stores, which are handled above)
363 // Those instructions won't match in ISEL, for now, and would
364 // be counted incorrectly.
365 // This may change in the future as we add additional instruction
366 // types.
367 if (User->getNumOperands() != 2)
368 continue;
370 // If this can match to INC/DEC, don't count it as a use.
371 if (User->getOpcode() == ISD::ADD &&
372 (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
373 continue;
375 // Immediates that are used for offsets as part of stack
376 // manipulation should be left alone. These are typically
377 // used to indicate SP offsets for argument passing and
378 // will get pulled into stores/pushes (implicitly).
379 if (User->getOpcode() == X86ISD::ADD ||
380 User->getOpcode() == ISD::ADD ||
381 User->getOpcode() == X86ISD::SUB ||
382 User->getOpcode() == ISD::SUB) {
384 // Find the other operand of the add/sub.
385 SDValue OtherOp = User->getOperand(0);
386 if (OtherOp.getNode() == N)
387 OtherOp = User->getOperand(1);
389 // Don't count if the other operand is SP.
390 RegisterSDNode *RegNode;
391 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
392 (RegNode = dyn_cast_or_null<RegisterSDNode>(
393 OtherOp->getOperand(1).getNode())))
394 if ((RegNode->getReg() == X86::ESP) ||
395 (RegNode->getReg() == X86::RSP))
396 continue;
399 // ... otherwise, count this and move on.
400 UseCount++;
403 // If we have more than 1 use, then recommend for hoisting.
404 return (UseCount > 1);
407 /// Return a target constant with the specified value of type i8.
408 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
409 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
412 /// Return a target constant with the specified value, of type i32.
413 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
414 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
417 /// Return a target constant with the specified value, of type i64.
418 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
419 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
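// The VEXTRACT*/VINSERT* immediates computed below count in units of the
// sub-vector width rather than in elements; e.g. extracting element index 4
// of a v8i32 with VecWidth == 128 yields imm8 = 4 * 32 / 128 = 1 (the upper
// 128-bit half).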
422 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
423 const SDLoc &DL) {
424 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
425 uint64_t Index = N->getConstantOperandVal(1);
426 MVT VecVT = N->getOperand(0).getSimpleValueType();
427 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
430 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
431 const SDLoc &DL) {
432 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
433 uint64_t Index = N->getConstantOperandVal(2);
434 MVT VecVT = N->getSimpleValueType(0);
435 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
438 // Helper to detect unneeded AND instructions on shift amounts. Called
439 // from PatFrags in tablegen.
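// e.g. with Width == 5 (a 32-bit shift), (and Y, 31) or (and Y, 255) is
// unneeded because hardware only consumes the low 5 bits of the amount; the
// second check below also catches masks whose missing low bits are already
// known to be zero in the shift amount.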
440 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
441 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
442 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
444 if (Val.countTrailingOnes() >= Width)
445 return true;
447 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
448 return Mask.countTrailingOnes() >= Width;
451 /// Return an SDNode that returns the value of the global base register.
452 /// Output instructions required to initialize the global base register,
453 /// if necessary.
454 SDNode *getGlobalBaseReg();
456 /// Return a reference to the TargetMachine, casted to the target-specific
457 /// type.
458 const X86TargetMachine &getTargetMachine() const {
459 return static_cast<const X86TargetMachine &>(TM);
462 /// Return a reference to the TargetInstrInfo, casted to the target-specific
463 /// type.
464 const X86InstrInfo *getInstrInfo() const {
465 return Subtarget->getInstrInfo();
468 /// Address-mode matching performs shift-of-and to and-of-shift
469 /// reassociation in order to expose more scaled addressing
470 /// opportunities.
471 bool ComplexPatternFuncMutatesDAG() const override {
472 return true;
475 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
477 /// Returns whether this is a relocatable immediate in the range
478 /// [-2^Width .. 2^Width-1].
479 template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
480 if (auto *CN = dyn_cast<ConstantSDNode>(N))
481 return isInt<Width>(CN->getSExtValue());
482 return isSExtAbsoluteSymbolRef(Width, N);
485 // Indicates we should prefer to use a non-temporal load for this load.
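// (Non-temporal vector loads map to MOVNTDQA/VMOVNTDQA, which require an
// aligned 16/32/64-byte access and SSE4.1/AVX2/AVX512 respectively; there is
// no non-temporal form for plain 4/8-byte scalar loads.)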
486 bool useNonTemporalLoad(LoadSDNode *N) const {
487 if (!N->isNonTemporal())
488 return false;
490 unsigned StoreSize = N->getMemoryVT().getStoreSize();
492 if (N->getAlignment() < StoreSize)
493 return false;
495 switch (StoreSize) {
496 default: llvm_unreachable("Unsupported store size");
497 case 4:
498 case 8:
499 return false;
500 case 16:
501 return Subtarget->hasSSE41();
502 case 32:
503 return Subtarget->hasAVX2();
504 case 64:
505 return Subtarget->hasAVX512();
509 bool foldLoadStoreIntoMemOperand(SDNode *Node);
510 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
511 bool matchBitExtract(SDNode *Node);
512 bool shrinkAndImmediate(SDNode *N);
513 bool isMaskZeroExtended(SDNode *N) const;
514 bool tryShiftAmountMod(SDNode *N);
515 bool combineIncDecVector(SDNode *Node);
516 bool tryShrinkShlLogicImm(SDNode *N);
517 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
518 bool tryMatchBitSelect(SDNode *N);
520 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
521 const SDLoc &dl, MVT VT, SDNode *Node);
522 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
523 const SDLoc &dl, MVT VT, SDNode *Node,
524 SDValue &InFlag);
526 bool tryOptimizeRem8Extend(SDNode *N);
528 bool onlyUsesZeroFlag(SDValue Flags) const;
529 bool hasNoSignFlagUses(SDValue Flags) const;
530 bool hasNoCarryFlagUses(SDValue Flags) const;
535 // Returns true if this masked compare can be implemented legally with this
536 // type.
537 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
538 unsigned Opcode = N->getOpcode();
539 if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
540 Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
541 // We can get 256-bit 8 element types here without VLX being enabled. When
542 // this happens we will use 512-bit operations and the mask will not be
543 // zero extended.
544 EVT OpVT = N->getOperand(0).getValueType();
545 if (OpVT.is256BitVector() || OpVT.is128BitVector())
546 return Subtarget->hasVLX();
548 return true;
550 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
551 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
552 Opcode == X86ISD::FSETCCM_SAE)
553 return true;
555 return false;
558 // Returns true if we can assume the writer of the mask has zero extended it
559 // for us.
560 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
561 // If this is an AND, check if we have a compare on either side. As long as
562 // one side guarantees the mask is zero extended, the AND will preserve those
563 // zeros.
564 if (N->getOpcode() == ISD::AND)
565 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
566 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
568 return isLegalMaskCompare(N, Subtarget);
571 bool
572 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
573 if (OptLevel == CodeGenOpt::None) return false;
575 if (!N.hasOneUse())
576 return false;
578 if (N.getOpcode() != ISD::LOAD)
579 return true;
581 // Don't fold non-temporal loads if we have an instruction for them.
582 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
583 return false;
585 // If N is a load, do additional profitability checks.
586 if (U == Root) {
587 switch (U->getOpcode()) {
588 default: break;
589 case X86ISD::ADD:
590 case X86ISD::ADC:
591 case X86ISD::SUB:
592 case X86ISD::SBB:
593 case X86ISD::AND:
594 case X86ISD::XOR:
595 case X86ISD::OR:
596 case ISD::ADD:
597 case ISD::ADDCARRY:
598 case ISD::AND:
599 case ISD::OR:
600 case ISD::XOR: {
601 SDValue Op1 = U->getOperand(1);
603 // If the other operand is an 8-bit immediate we should fold the immediate
604 // instead. This reduces code size.
605 // e.g.
606 // movl 4(%esp), %eax
607 // addl $4, %eax
608 // vs.
609 // movl $4, %eax
610 // addl 4(%esp), %eax
611 // The former is 2 bytes shorter. In the case where the increment is 1, the
612 // saving can be 4 bytes (by using incl %eax).
613 if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
614 if (Imm->getAPIntValue().isSignedIntN(8))
615 return false;
617 // If this is a 64-bit AND with an immediate that fits in 32-bits,
618 // prefer using the smaller and over folding the load. This is needed to
619 // make sure immediates created by shrinkAndImmediate are always folded.
620 // Ideally we would narrow the load during DAG combine and get the
621 // best of both worlds.
622 if (U->getOpcode() == ISD::AND &&
623 Imm->getAPIntValue().getBitWidth() == 64 &&
624 Imm->getAPIntValue().isIntN(32))
625 return false;
627 // If this is really a zext_inreg that can be represented with a movzx
628 // instruction, prefer that.
629 // TODO: We could shrink the load and fold if it is non-volatile.
630 if (U->getOpcode() == ISD::AND &&
631 (Imm->getAPIntValue() == UINT8_MAX ||
632 Imm->getAPIntValue() == UINT16_MAX ||
633 Imm->getAPIntValue() == UINT32_MAX))
634 return false;
636 // ADD/SUB can negate the immediate and use the opposite operation
637 // to fit 128 into a sign-extended 8-bit immediate.
638 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
639 (-Imm->getAPIntValue()).isSignedIntN(8))
640 return false;
643 // If the other operand is a TLS address, we should fold it instead.
644 // This produces
645 // movl %gs:0, %eax
646 // leal i@NTPOFF(%eax), %eax
647 // instead of
648 // movl $i@NTPOFF, %eax
649 // addl %gs:0, %eax
650 // if the block also has an access to a second TLS address this will save
651 // a load.
652 // FIXME: This is probably also true for non-TLS addresses.
653 if (Op1.getOpcode() == X86ISD::Wrapper) {
654 SDValue Val = Op1.getOperand(0);
655 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
656 return false;
659 // Don't fold load if this matches the BTS/BTR/BTC patterns.
660 // BTS: (or X, (shl 1, n))
661 // BTR: (and X, (rotl -2, n))
662 // BTC: (xor X, (shl 1, n))
663 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
664 if (U->getOperand(0).getOpcode() == ISD::SHL &&
665 isOneConstant(U->getOperand(0).getOperand(0)))
666 return false;
668 if (U->getOperand(1).getOpcode() == ISD::SHL &&
669 isOneConstant(U->getOperand(1).getOperand(0)))
670 return false;
672 if (U->getOpcode() == ISD::AND) {
673 SDValue U0 = U->getOperand(0);
674 SDValue U1 = U->getOperand(1);
675 if (U0.getOpcode() == ISD::ROTL) {
676 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
677 if (C && C->getSExtValue() == -2)
678 return false;
681 if (U1.getOpcode() == ISD::ROTL) {
682 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
683 if (C && C->getSExtValue() == -2)
684 return false;
688 break;
690 case ISD::SHL:
691 case ISD::SRA:
692 case ISD::SRL:
693 // Don't fold a load into a shift by immediate. The BMI2 instructions
694 // support folding a load, but not an immediate. The legacy instructions
695 // support folding an immediate, but can't fold a load. Folding an
696 // immediate is preferable to folding a load.
697 if (isa<ConstantSDNode>(U->getOperand(1)))
698 return false;
700 break;
704 // Prevent folding a load if this can be implemented with an insert_subreg or
705 // a move that implicitly zeroes.
706 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
707 isNullConstant(Root->getOperand(2)) &&
708 (Root->getOperand(0).isUndef() ||
709 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
710 return false;
712 return true;
715 /// Replace the original chain operand of the call with
716 /// load's chain operand and move load below the call's chain operand.
717 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
718 SDValue Call, SDValue OrigChain) {
719 SmallVector<SDValue, 8> Ops;
720 SDValue Chain = OrigChain.getOperand(0);
721 if (Chain.getNode() == Load.getNode())
722 Ops.push_back(Load.getOperand(0));
723 else {
724 assert(Chain.getOpcode() == ISD::TokenFactor &&
725 "Unexpected chain operand");
726 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
727 if (Chain.getOperand(i).getNode() == Load.getNode())
728 Ops.push_back(Load.getOperand(0));
729 else
730 Ops.push_back(Chain.getOperand(i));
731 SDValue NewChain =
732 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
733 Ops.clear();
734 Ops.push_back(NewChain);
736 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
737 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
738 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
739 Load.getOperand(1), Load.getOperand(2));
741 Ops.clear();
742 Ops.push_back(SDValue(Load.getNode(), 1));
743 Ops.append(Call->op_begin() + 1, Call->op_end());
744 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
747 /// Return true if call address is a load and it can be
748 /// moved below CALLSEQ_START and the chains leading up to the call.
749 /// Return the CALLSEQ_START by reference as a second output.
750 /// In the case of a tail call, there isn't a callseq node between the call
751 /// chain and the load.
752 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
753 // The transformation is somewhat dangerous if the call's chain was glued to
754 // the call. After MoveBelowOrigChain the load is moved between the call and
755 // the chain, this can create a cycle if the load is not folded. So it is
756 // *really* important that we are sure the load will be folded.
757 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
758 return false;
759 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
760 if (!LD ||
761 !LD->isSimple() ||
762 LD->getAddressingMode() != ISD::UNINDEXED ||
763 LD->getExtensionType() != ISD::NON_EXTLOAD)
764 return false;
766 // Now let's find the callseq_start.
767 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
768 if (!Chain.hasOneUse())
769 return false;
770 Chain = Chain.getOperand(0);
773 if (!Chain.getNumOperands())
774 return false;
775 // Since we are not checking for AA here, conservatively abort if the chain
776 // writes to memory. It's not safe to move the callee (a load) across a store.
777 if (isa<MemSDNode>(Chain.getNode()) &&
778 cast<MemSDNode>(Chain.getNode())->writeMem())
779 return false;
780 if (Chain.getOperand(0).getNode() == Callee.getNode())
781 return true;
782 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
783 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
784 Callee.getValue(1).hasOneUse())
785 return true;
786 return false;
789 void X86DAGToDAGISel::PreprocessISelDAG() {
790 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
791 E = CurDAG->allnodes_end(); I != E; ) {
792 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
794 // If this is a target specific AND node with no flag usages, turn it back
795 // into ISD::AND to enable test instruction matching.
796 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
797 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
798 N->getOperand(0), N->getOperand(1));
799 --I;
800 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
801 ++I;
802 CurDAG->DeleteNode(N);
803 continue;
806 switch (N->getOpcode()) {
807 case ISD::FP_TO_SINT:
808 case ISD::FP_TO_UINT: {
809 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
810 // don't need 2 sets of patterns.
811 if (!N->getSimpleValueType(0).isVector())
812 break;
814 unsigned NewOpc;
815 switch (N->getOpcode()) {
816 default: llvm_unreachable("Unexpected opcode!");
817 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
818 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
820 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
821 N->getOperand(0));
822 --I;
823 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
824 ++I;
825 CurDAG->DeleteNode(N);
826 continue;
828 case ISD::SHL:
829 case ISD::SRA:
830 case ISD::SRL: {
831 // Replace vector shifts with their X86 specific equivalent so we don't
832 // need 2 sets of patterns.
833 if (!N->getValueType(0).isVector())
834 break;
836 unsigned NewOpc;
837 switch (N->getOpcode()) {
838 default: llvm_unreachable("Unexpected opcode!");
839 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
840 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
841 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
843 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
844 N->getOperand(0), N->getOperand(1));
845 --I;
846 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
847 ++I;
848 CurDAG->DeleteNode(N);
849 continue;
851 case ISD::ANY_EXTEND:
852 case ISD::ANY_EXTEND_VECTOR_INREG: {
853 // Replace vector any extend with the zero extend equivalents so we don't
854 // need 2 sets of patterns. Ignore vXi1 extensions.
855 if (!N->getValueType(0).isVector() ||
856 N->getOperand(0).getScalarValueSizeInBits() == 1)
857 break;
859 unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
860 ? ISD::ZERO_EXTEND
861 : ISD::ZERO_EXTEND_VECTOR_INREG;
863 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
864 N->getOperand(0));
865 --I;
866 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
867 ++I;
868 CurDAG->DeleteNode(N);
869 continue;
871 case ISD::FCEIL:
872 case ISD::FFLOOR:
873 case ISD::FTRUNC:
874 case ISD::FNEARBYINT:
875 case ISD::FRINT: {
876 // Replace fp rounding with their X86 specific equivalent so we don't
877 // need 2 sets of patterns.
878 unsigned Imm;
879 switch (N->getOpcode()) {
880 default: llvm_unreachable("Unexpected opcode!");
881 case ISD::FCEIL: Imm = 0xA; break;
882 case ISD::FFLOOR: Imm = 0x9; break;
883 case ISD::FTRUNC: Imm = 0xB; break;
884 case ISD::FNEARBYINT: Imm = 0xC; break;
885 case ISD::FRINT: Imm = 0x4; break;
887 SDLoc dl(N);
888 SDValue Res = CurDAG->getNode(
889 X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
890 CurDAG->getTargetConstant(Imm, dl, MVT::i8));
891 --I;
892 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
893 ++I;
894 CurDAG->DeleteNode(N);
895 continue;
897 case X86ISD::FANDN:
898 case X86ISD::FAND:
899 case X86ISD::FOR:
900 case X86ISD::FXOR: {
901 // Widen scalar fp logic ops to vector to reduce isel patterns.
902 // FIXME: Can we do this during lowering/combine?
903 MVT VT = N->getSimpleValueType(0);
904 if (VT.isVector() || VT == MVT::f128)
905 break;
907 MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
908 SDLoc dl(N);
909 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
910 N->getOperand(0));
911 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
912 N->getOperand(1));
914 SDValue Res;
915 if (Subtarget->hasSSE2()) {
916 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
917 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
918 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
919 unsigned Opc;
920 switch (N->getOpcode()) {
921 default: llvm_unreachable("Unexpected opcode!");
922 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
923 case X86ISD::FAND: Opc = ISD::AND; break;
924 case X86ISD::FOR: Opc = ISD::OR; break;
925 case X86ISD::FXOR: Opc = ISD::XOR; break;
927 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
928 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
929 } else {
930 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
932 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
933 CurDAG->getIntPtrConstant(0, dl));
934 --I;
935 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
936 ++I;
937 CurDAG->DeleteNode(N);
938 continue;
942 if (OptLevel != CodeGenOpt::None &&
943 // Only do this when the target can fold the load into the call or
944 // jmp.
945 !Subtarget->useRetpolineIndirectCalls() &&
946 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
947 (N->getOpcode() == X86ISD::TC_RETURN &&
948 (Subtarget->is64Bit() ||
949 !getTargetMachine().isPositionIndependent())))) {
950 /// Also try moving call address load from outside callseq_start to just
951 /// before the call to allow it to be folded.
953 /// [Load chain]
954 /// ^
955 /// |
956 /// [Load]
957 /// ^ ^
958 /// | |
959 /// / \--
960 /// / |
961 ///[CALLSEQ_START] |
962 /// ^ |
963 /// | |
964 /// [LOAD/C2Reg] |
965 /// | |
966 /// \ /
967 /// \ /
968 /// [CALL]
969 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
970 SDValue Chain = N->getOperand(0);
971 SDValue Load = N->getOperand(1);
972 if (!isCalleeLoad(Load, Chain, HasCallSeq))
973 continue;
974 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
975 ++NumLoadMoved;
976 continue;
979 // Lower fpround and fpextend nodes that target the FP stack to be a store and
980 // load to the stack. This is a gross hack. We would like to simply mark
981 // these as being illegal, but when we do that, legalize produces these when
982 // it expands calls, then expands these in the same legalize pass. We would
983 // like dag combine to be able to hack on these between the call expansion
984 // and the node legalization. As such this pass basically does "really
985 // late" legalization of these inline with the X86 isel pass.
986 // FIXME: This should only happen when not compiled with -O0.
987 switch (N->getOpcode()) {
988 default: continue;
989 case ISD::FP_ROUND:
990 case ISD::FP_EXTEND:
992 MVT SrcVT = N->getOperand(0).getSimpleValueType();
993 MVT DstVT = N->getSimpleValueType(0);
995 // If any of the sources are vectors, no fp stack involved.
996 if (SrcVT.isVector() || DstVT.isVector())
997 continue;
999 // If the source and destination are SSE registers, then this is a legal
1000 // conversion that should not be lowered.
1001 const X86TargetLowering *X86Lowering =
1002 static_cast<const X86TargetLowering *>(TLI);
1003 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1004 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1005 if (SrcIsSSE && DstIsSSE)
1006 continue;
1008 if (!SrcIsSSE && !DstIsSSE) {
1009 // If this is an FPStack extension, it is a noop.
1010 if (N->getOpcode() == ISD::FP_EXTEND)
1011 continue;
1012 // If this is a value-preserving FPStack truncation, it is a noop.
1013 if (N->getConstantOperandVal(1))
1014 continue;
1017 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1018 // FPStack has extload and truncstore. SSE can fold direct loads into other
1019 // operations. Based on this, decide what we want to do.
1020 MVT MemVT;
1021 if (N->getOpcode() == ISD::FP_ROUND)
1022 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1023 else
1024 MemVT = SrcIsSSE ? SrcVT : DstVT;
1026 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1027 SDLoc dl(N);
1029 // FIXME: optimize the case where the src/dest is a load or store?
1031 SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1032 MemTmp, MachinePointerInfo(), MemVT);
1033 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1034 MachinePointerInfo(), MemVT);
1036 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1037 // extload we created. This will cause general havoc on the dag because
1038 // anything below the conversion could be folded into other existing nodes.
1039 // To avoid invalidating 'I', back it up to the convert node.
1040 --I;
1041 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1042 break;
1045 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1046 // dealing with the chain differently, as there is already a preexisting chain.
1047 case ISD::STRICT_FP_ROUND:
1048 case ISD::STRICT_FP_EXTEND:
1050 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1051 MVT DstVT = N->getSimpleValueType(0);
1053 // If any of the sources are vectors, no fp stack involved.
1054 if (SrcVT.isVector() || DstVT.isVector())
1055 continue;
1057 // If the source and destination are SSE registers, then this is a legal
1058 // conversion that should not be lowered.
1059 const X86TargetLowering *X86Lowering =
1060 static_cast<const X86TargetLowering *>(TLI);
1061 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1062 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1063 if (SrcIsSSE && DstIsSSE)
1064 continue;
1066 if (!SrcIsSSE && !DstIsSSE) {
1067 // If this is an FPStack extension, it is a noop.
1068 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1069 continue;
1070 // If this is a value-preserving FPStack truncation, it is a noop.
1071 if (N->getConstantOperandVal(2))
1072 continue;
1075 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1076 // FPStack has extload and truncstore. SSE can fold direct loads into other
1077 // operations. Based on this, decide what we want to do.
1078 MVT MemVT;
1079 if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1080 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1081 else
1082 MemVT = SrcIsSSE ? SrcVT : DstVT;
1084 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1085 SDLoc dl(N);
1087 // FIXME: optimize the case where the src/dest is a load or store?
1089 // Since the operation is StrictFP, use the preexisting chain.
1090 SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1091 MemTmp, MachinePointerInfo(), MemVT);
1092 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1093 MachinePointerInfo(), MemVT);
1095 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1096 // extload we created. This will cause general havok on the dag because
1097 // anything below the conversion could be folded into other existing nodes.
1098 // To avoid invalidating 'I', back it up to the convert node.
1099 --I;
1100 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1101 break;
1106 // Now that we did that, the node is dead. Increment the iterator to the
1107 // next node to process, then delete N.
1108 ++I;
1109 CurDAG->DeleteNode(N);
1112 // The load+call transform above can leave some dead nodes in the graph. Make
1113 // sure we remove them. It's possible some of the other transforms do too, so
1114 // just remove dead nodes unconditionally.
1115 CurDAG->RemoveDeadNodes();
1118 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
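// i.e. a (MOVZX32rr8/MOVSX32rr8/MOVSX64rr8 (EXTRACT_SUBREG
// (MOV[SZ]X32rr8_NOREX x), sub_8bit)) chain: the outer extend is redundant
// and can reuse the inner one directly (via MOVSX64rr32 for the 8->64 case).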
1119 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1120 unsigned Opc = N->getMachineOpcode();
1121 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1122 Opc != X86::MOVSX64rr8)
1123 return false;
1125 SDValue N0 = N->getOperand(0);
1127 // We need to be extracting the low byte of an extend.
1128 if (!N0.isMachineOpcode() ||
1129 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1130 N0.getConstantOperandVal(1) != X86::sub_8bit)
1131 return false;
1133 // We're looking for either a movsx or movzx to match the original opcode.
1134 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1135 : X86::MOVSX32rr8_NOREX;
1136 SDValue N00 = N0.getOperand(0);
1137 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1138 return false;
1140 if (Opc == X86::MOVSX64rr8) {
1141 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1142 // to 64.
1143 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1144 MVT::i64, N00);
1145 ReplaceUses(N, Extend);
1146 } else {
1147 // Ok we can drop this extend and just use the original extend.
1148 ReplaceUses(N, N00.getNode());
1151 return true;
1154 void X86DAGToDAGISel::PostprocessISelDAG() {
1155 // Skip peepholes at -O0.
1156 if (TM.getOptLevel() == CodeGenOpt::None)
1157 return;
1159 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1161 bool MadeChange = false;
1162 while (Position != CurDAG->allnodes_begin()) {
1163 SDNode *N = &*--Position;
1164 // Skip dead nodes and any non-machine opcodes.
1165 if (N->use_empty() || !N->isMachineOpcode())
1166 continue;
1168 if (tryOptimizeRem8Extend(N)) {
1169 MadeChange = true;
1170 continue;
1173 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1174 // the same. Rewrite to remove the AND.
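// i.e. (TESTrr (ANDrr a, b), (ANDrr a, b)) -> (TESTrr a, b), and similarly
// (TESTrr (ANDrm a, m), (ANDrm a, m)) -> (TESTmr m, a); TEST sets the same
// flags the AND would have, so the AND itself becomes dead.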
1175 unsigned Opc = N->getMachineOpcode();
1176 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1177 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1178 N->getOperand(0) == N->getOperand(1) &&
1179 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1180 N->getOperand(0).isMachineOpcode()) {
1181 SDValue And = N->getOperand(0);
1182 unsigned N0Opc = And.getMachineOpcode();
1183 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1184 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
1185 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1186 MVT::i32,
1187 And.getOperand(0),
1188 And.getOperand(1));
1189 ReplaceUses(N, Test);
1190 MadeChange = true;
1191 continue;
1193 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1194 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
1195 unsigned NewOpc;
1196 switch (N0Opc) {
1197 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1198 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1199 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1200 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1203 // Need to swap the memory and register operand.
1204 SDValue Ops[] = { And.getOperand(1),
1205 And.getOperand(2),
1206 And.getOperand(3),
1207 And.getOperand(4),
1208 And.getOperand(5),
1209 And.getOperand(0),
1210 And.getOperand(6) /* Chain */ };
1211 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1212 MVT::i32, MVT::Other, Ops);
1213 ReplaceUses(N, Test);
1214 MadeChange = true;
1215 continue;
1219 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1220 // used. We're doing this late so we can prefer to fold the AND into masked
1221 // comparisons. Doing that can be better for the live range of the mask
1222 // register.
1223 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1224 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1225 N->getOperand(0) == N->getOperand(1) &&
1226 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1227 N->getOperand(0).isMachineOpcode() &&
1228 onlyUsesZeroFlag(SDValue(N, 0))) {
1229 SDValue And = N->getOperand(0);
1230 unsigned N0Opc = And.getMachineOpcode();
1231 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1232 // KAND instructions and KTEST use the same ISA feature.
1233 if (N0Opc == X86::KANDBrr ||
1234 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1235 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1236 unsigned NewOpc;
1237 switch (Opc) {
1238 default: llvm_unreachable("Unexpected opcode!");
1239 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1240 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1241 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1242 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1244 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1245 MVT::i32,
1246 And.getOperand(0),
1247 And.getOperand(1));
1248 ReplaceUses(N, KTest);
1249 MadeChange = true;
1250 continue;
1254 // Attempt to remove vector moves that were inserted to zero upper bits.
1255 if (Opc != TargetOpcode::SUBREG_TO_REG)
1256 continue;
1258 unsigned SubRegIdx = N->getConstantOperandVal(2);
1259 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1260 continue;
1262 SDValue Move = N->getOperand(1);
1263 if (!Move.isMachineOpcode())
1264 continue;
1266 // Make sure it's one of the move opcodes we recognize.
1267 switch (Move.getMachineOpcode()) {
1268 default:
1269 continue;
1270 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1271 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1272 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1273 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1274 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1275 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1276 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1277 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1278 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1279 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1280 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1281 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1282 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1283 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1284 break;
1287 SDValue In = Move.getOperand(0);
1288 if (!In.isMachineOpcode() ||
1289 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1290 continue;
1292 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1293 // the SHA instructions, which use a legacy encoding.
1294 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1295 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1296 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1297 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1298 continue;
1300 // The producing instruction is another vector instruction, so we can drop the
1301 // move.
1302 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1303 MadeChange = true;
1306 if (MadeChange)
1307 CurDAG->RemoveDeadNodes();
1311 /// Emit any code that needs to be executed only in the main function.
1312 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1313 if (Subtarget->isTargetCygMing()) {
1314 TargetLowering::ArgListTy Args;
1315 auto &DL = CurDAG->getDataLayout();
1317 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1318 CLI.setChain(CurDAG->getRoot())
1319 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1320 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1321 std::move(Args));
1322 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1323 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1324 CurDAG->setRoot(Result.second);
1328 void X86DAGToDAGISel::EmitFunctionEntryCode() {
1329 // If this is main, emit special code for main.
1330 const Function &F = MF->getFunction();
1331 if (F.hasExternalLinkage() && F.getName() == "main")
1332 emitSpecialCodeForMain();
1335 static bool isDispSafeForFrameIndex(int64_t Val) {
1336 // On 64-bit platforms, we can run into an issue where a frame index
1337 // includes a displacement that, when added to the explicit displacement,
1338 // will overflow the displacement field. Assuming that the frame index
1339 // displacement fits into a 31-bit integer (which is only slightly more
1340 // aggressive than the current fundamental assumption that it fits into
1341 // a 32-bit integer), a 31-bit disp should always be safe.
1342 return isInt<31>(Val);
1345 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1346 X86ISelAddressMode &AM) {
1347 // If there's no offset to fold, we don't need to do any work.
1348 if (Offset == 0)
1349 return false;
1351 // Cannot combine ExternalSymbol displacements with integer offsets.
1352 if (AM.ES || AM.MCSym)
1353 return true;
1355 int64_t Val = AM.Disp + Offset;
1356 CodeModel::Model M = TM.getCodeModel();
1357 if (Subtarget->is64Bit()) {
1358 if (!X86::isOffsetSuitableForCodeModel(Val, M,
1359 AM.hasSymbolicDisplacement()))
1360 return true;
1361 // In addition to the checks required for a register base, check that
1362 // we do not try to use an unsafe Disp with a frame index.
1363 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1364 !isDispSafeForFrameIndex(Val))
1365 return true;
1367 AM.Disp = Val;
1368 return false;
1372 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1373 SDValue Address = N->getOperand(1);
1375 // load gs:0 -> GS segment register.
1376 // load fs:0 -> FS segment register.
1378 // This optimization is valid because the GNU TLS model defines that
1379 // gs:0 (or fs:0 on X86-64) contains its own address.
1380 // For more information see http://people.redhat.com/drepper/tls.pdf
1381 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1382 if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1383 !IndirectTlsSegRefs &&
1384 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1385 Subtarget->isTargetFuchsia()))
1386 switch (N->getPointerInfo().getAddrSpace()) {
1387 case 256:
1388 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1389 return false;
1390 case 257:
1391 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1392 return false;
1393 // Address space 258 is not handled here, because it is not used to
1394 // address TLS areas.
1397 return true;
1400 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1401 /// mode. These wrap things that will resolve down into a symbol reference.
1402 /// If no match is possible, this returns true, otherwise it returns false.
1403 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1404 // If the addressing mode already has a symbol as the displacement, we can
1405 // never match another symbol.
1406 if (AM.hasSymbolicDisplacement())
1407 return true;
1409 bool IsRIPRelTLS = false;
1410 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1411 if (IsRIPRel) {
1412 SDValue Val = N.getOperand(0);
1413 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1414 IsRIPRelTLS = true;
1417 // We can't use an addressing mode in the 64-bit large code model.
1418 // Global TLS addressing is an exception. In the medium code model,
1419 // we can use a mode when RIP wrappers are present.
1420 // That signifies access to globals that are known to be "near",
1421 // such as the GOT itself.
1422 CodeModel::Model M = TM.getCodeModel();
1423 if (Subtarget->is64Bit() &&
1424 ((M == CodeModel::Large && !IsRIPRelTLS) ||
1425 (M == CodeModel::Medium && !IsRIPRel)))
1426 return true;
1428 // Base and index reg must be 0 in order to use %rip as base.
1429 if (IsRIPRel && AM.hasBaseOrIndexReg())
1430 return true;
1432 // Make a local copy in case we can't do this fold.
1433 X86ISelAddressMode Backup = AM;
1435 int64_t Offset = 0;
1436 SDValue N0 = N.getOperand(0);
1437 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1438 AM.GV = G->getGlobal();
1439 AM.SymbolFlags = G->getTargetFlags();
1440 Offset = G->getOffset();
1441 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1442 AM.CP = CP->getConstVal();
1443 AM.Align = CP->getAlignment();
1444 AM.SymbolFlags = CP->getTargetFlags();
1445 Offset = CP->getOffset();
1446 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1447 AM.ES = S->getSymbol();
1448 AM.SymbolFlags = S->getTargetFlags();
1449 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1450 AM.MCSym = S->getMCSymbol();
1451 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1452 AM.JT = J->getIndex();
1453 AM.SymbolFlags = J->getTargetFlags();
1454 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1455 AM.BlockAddr = BA->getBlockAddress();
1456 AM.SymbolFlags = BA->getTargetFlags();
1457 Offset = BA->getOffset();
1458 } else
1459 llvm_unreachable("Unhandled symbol reference node.");
1461 if (foldOffsetIntoAddress(Offset, AM)) {
1462 AM = Backup;
1463 return true;
1466 if (IsRIPRel)
1467 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1469 // Commit the changes now that we know this fold is safe.
1470 return false;
1473 /// Add the specified node to the specified addressing mode, returning true if
1474 /// it cannot be done. This just pattern matches for the addressing mode.
1475 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1476 if (matchAddressRecursively(N, AM, 0))
1477 return true;
1479 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1480 // a smaller encoding and avoids a scaled-index.
1481 if (AM.Scale == 2 &&
1482 AM.BaseType == X86ISelAddressMode::RegBase &&
1483 AM.Base_Reg.getNode() == nullptr) {
1484 AM.Base_Reg = AM.IndexReg;
1485 AM.Scale = 1;
1488 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1489 // because it has a smaller encoding.
1490 // TODO: Which other code models can use this?
1491 switch (TM.getCodeModel()) {
1492 default: break;
1493 case CodeModel::Small:
1494 case CodeModel::Kernel:
1495 if (Subtarget->is64Bit() &&
1496 AM.Scale == 1 &&
1497 AM.BaseType == X86ISelAddressMode::RegBase &&
1498 AM.Base_Reg.getNode() == nullptr &&
1499 AM.IndexReg.getNode() == nullptr &&
1500 AM.SymbolFlags == X86II::MO_NO_FLAG &&
1501 AM.hasSymbolicDisplacement())
1502 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1503 break;
1506 return false;
1509 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1510 unsigned Depth) {
1511 // Add an artificial use to this node so that we can keep track of
1512 // it if it gets CSE'd with a different node.
1513 HandleSDNode Handle(N);
1515 X86ISelAddressMode Backup = AM;
1516 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1517 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1518 return false;
1519 AM = Backup;
1521 // Try again after commuting the operands.
1522 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1523 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1524 return false;
1525 AM = Backup;
1527 // If we couldn't fold both operands into the address at the same time,
1528 // see if we can just put each operand into a register and fold at least
1529 // the add.
1530 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1531 !AM.Base_Reg.getNode() &&
1532 !AM.IndexReg.getNode()) {
1533 N = Handle.getValue();
1534 AM.Base_Reg = N.getOperand(0);
1535 AM.IndexReg = N.getOperand(1);
1536 AM.Scale = 1;
1537 return false;
1539 N = Handle.getValue();
1540 return true;
1543 // Insert a node into the DAG at least before the Pos node's position. This
1544 // will reposition the node as needed, and will assign it a node ID that is <=
1545 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1546 // IDs! The selection DAG must no longer depend on their uniqueness when this
1547 // is used.
1548 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1549 if (N->getNodeId() == -1 ||
1550 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1551 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1552 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1553 // Mark Node as invalid for pruning as after this it may be a successor to a
1554 // selected node but otherwise be in the same position of Pos.
1555 // Conservatively mark it with the same -abs(Id) to assure node id
1556 // invariant is preserved.
1557 N->setNodeId(Pos->getNodeId());
1558 SelectionDAGISel::InvalidateNodeId(N.getNode());
1562 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1563 // safe. This allows us to convert the shift and the AND into an h-register
1564 // extract and a scaled index. Returns false if the simplification is
1565 // performed.
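// e.g. with C1 == 2: (X >> 6) & 0x3FC becomes ((X >> 8) & 0xFF) << 2, where
// ((X >> 8) & 0xFF) is an h-register extract and the << 2 is folded into an
// index scale of 4.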
1566 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1567 uint64_t Mask,
1568 SDValue Shift, SDValue X,
1569 X86ISelAddressMode &AM) {
1570 if (Shift.getOpcode() != ISD::SRL ||
1571 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1572 !Shift.hasOneUse())
1573 return true;
1575 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1576 if (ScaleLog <= 0 || ScaleLog >= 4 ||
1577 Mask != (0xffu << ScaleLog))
1578 return true;
1580 MVT VT = N.getSimpleValueType();
1581 SDLoc DL(N);
1582 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1583 SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1584 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1585 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1586 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1587 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1589 // Insert the new nodes into the topological ordering. We must do this in
1590 // a valid topological ordering as nothing is going to go back and re-sort
1591 // these nodes. We continually insert before 'N' in sequence as this is
1592 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1593 // hierarchy left to express.
1594 insertDAGNode(DAG, N, Eight);
1595 insertDAGNode(DAG, N, Srl);
1596 insertDAGNode(DAG, N, NewMask);
1597 insertDAGNode(DAG, N, And);
1598 insertDAGNode(DAG, N, ShlCount);
1599 insertDAGNode(DAG, N, Shl);
1600 DAG.ReplaceAllUsesWith(N, Shl);
1601 DAG.RemoveDeadNode(N.getNode());
1602 AM.IndexReg = And;
1603 AM.Scale = (1 << ScaleLog);
1604 return false;
1607 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1608 // allows us to fold the shift into this addressing mode. Returns false if the
1609 // transform succeeded.
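// As a rough worked example: "(X << 2) & 0xfc" becomes "(X & 0x3f) << 2",
// so the new AND can serve as AM.IndexReg and the shl by 2 is absorbed into
// the addressing mode as AM.Scale = 4.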
1610 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1611 X86ISelAddressMode &AM) {
1612 SDValue Shift = N.getOperand(0);
1614 // Use a signed mask so that shifting right will insert sign bits. These
1615 // bits will be removed when we shift the result left so it doesn't matter
1616 // what we use. This might allow a smaller immediate encoding.
1617 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1619 // If we have an any_extend feeding the AND, look through it to see if there
1620 // is a shift behind it. But only if the AND doesn't use the extended bits.
1621 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1622 bool FoundAnyExtend = false;
1623 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1624 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1625 isUInt<32>(Mask)) {
1626 FoundAnyExtend = true;
1627 Shift = Shift.getOperand(0);
1630 if (Shift.getOpcode() != ISD::SHL ||
1631 !isa<ConstantSDNode>(Shift.getOperand(1)))
1632 return true;
1634 SDValue X = Shift.getOperand(0);
1636 // Not likely to be profitable if either the AND or SHIFT node has more
1637 // than one use (unless all uses are for address computation). Besides,
1638 // isel mechanism requires their node ids to be reused.
1639 if (!N.hasOneUse() || !Shift.hasOneUse())
1640 return true;
1642 // Verify that the shift amount is something we can fold.
1643 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1644 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1645 return true;
1647 MVT VT = N.getSimpleValueType();
1648 SDLoc DL(N);
1649 if (FoundAnyExtend) {
1650 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1651 insertDAGNode(DAG, N, NewX);
1652 X = NewX;
1655 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1656 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1657 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1659 // Insert the new nodes into the topological ordering. We must do this in
1660 // a valid topological ordering as nothing is going to go back and re-sort
1661 // these nodes. We continually insert before 'N' in sequence as this is
1662 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1663 // hierarchy left to express.
1664 insertDAGNode(DAG, N, NewMask);
1665 insertDAGNode(DAG, N, NewAnd);
1666 insertDAGNode(DAG, N, NewShift);
1667 DAG.ReplaceAllUsesWith(N, NewShift);
1668 DAG.RemoveDeadNode(N.getNode());
1670 AM.Scale = 1 << ShiftAmt;
1671 AM.IndexReg = NewAnd;
1672 return false;
1675 // Implement some heroics to detect shifts of masked values where the mask can
1676 // be replaced by extending the shift and undoing that in the addressing mode
1677 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1678 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1679 // the addressing mode. This results in code such as:
1681 // int f(short *y, int *lookup_table) {
1682 // ...
1683 // return *y + lookup_table[*y >> 11];
1684 // }
1686 // Turning into:
1687 // movzwl (%rdi), %eax
1688 // movl %eax, %ecx
1689 // shrl $11, %ecx
1690 // addl (%rsi,%rcx,4), %eax
1692 // Instead of:
1693 // movzwl (%rdi), %eax
1694 // movl %eax, %ecx
1695 // shrl $9, %ecx
1696 // andl $124, %ecx
1697 // addl (%rsi,%rcx), %eax
1699 // Note that this function assumes the mask is provided as a mask *after* the
1700 // value is shifted. The input chain may or may not match that, but computing
1701 // such a mask is trivial.
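// As a concrete sketch of the node-level rewrite (values illustrative):
// "(and (srl x, 9), 0x7c)" has a post-shift mask with two trailing zeros,
// so it is rewritten to "(shl (srl x, 11), 2)"; the shl is then folded away
// as AM.Scale = 4 with AM.IndexReg = (srl x, 11), matching the first code
// sequence above.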
1702 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1703 uint64_t Mask,
1704 SDValue Shift, SDValue X,
1705 X86ISelAddressMode &AM) {
1706 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1707 !isa<ConstantSDNode>(Shift.getOperand(1)))
1708 return true;
1710 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1711 unsigned MaskLZ = countLeadingZeros(Mask);
1712 unsigned MaskTZ = countTrailingZeros(Mask);
1714 // The amount of shift we're trying to fit into the addressing mode is taken
1715 // from the trailing zeros of the mask.
1716 unsigned AMShiftAmt = MaskTZ;
1718 // There is nothing we can do here unless the mask is removing some bits.
1719 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1720 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1722 // We also need to ensure that the mask is a contiguous run of bits.
1723 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1725 // Scale the leading zero count down based on the actual size of the value.
1726 // Also scale it down based on the size of the shift.
1727 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1728 if (MaskLZ < ScaleDown)
1729 return true;
1730 MaskLZ -= ScaleDown;
1732 // The final check is to ensure that any masked out high bits of X are
1733 // already known to be zero. Otherwise, the mask has a semantic impact
1734 // other than masking out a couple of low bits. Unfortunately, because of
1735 // the mask, zero extensions will be removed from operands in some cases.
1736 // This code works extra hard to look through extensions because we can
1737 // replace them with zero extensions cheaply if necessary.
1738 bool ReplacingAnyExtend = false;
1739 if (X.getOpcode() == ISD::ANY_EXTEND) {
1740 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1741 X.getOperand(0).getSimpleValueType().getSizeInBits();
1742 // Assume that we'll replace the any-extend with a zero-extend, and
1743 // narrow the search to the extended value.
1744 X = X.getOperand(0);
1745 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1746 ReplacingAnyExtend = true;
1748 APInt MaskedHighBits =
1749 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1750 KnownBits Known = DAG.computeKnownBits(X);
1751 if (MaskedHighBits != Known.Zero) return true;
1753 // We've identified a pattern that can be transformed into a single shift
1754 // and an addressing mode. Make it so.
1755 MVT VT = N.getSimpleValueType();
1756 if (ReplacingAnyExtend) {
1757 assert(X.getValueType() != VT);
1758 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1759 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1760 insertDAGNode(DAG, N, NewX);
1761 X = NewX;
1763 SDLoc DL(N);
1764 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1765 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1766 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1767 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1769 // Insert the new nodes into the topological ordering. We must do this in
1770 // a valid topological ordering as nothing is going to go back and re-sort
1771 // these nodes. We continually insert before 'N' in sequence as this is
1772 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1773 // hierarchy left to express.
1774 insertDAGNode(DAG, N, NewSRLAmt);
1775 insertDAGNode(DAG, N, NewSRL);
1776 insertDAGNode(DAG, N, NewSHLAmt);
1777 insertDAGNode(DAG, N, NewSHL);
1778 DAG.ReplaceAllUsesWith(N, NewSHL);
1779 DAG.RemoveDeadNode(N.getNode());
1781 AM.Scale = 1 << AMShiftAmt;
1782 AM.IndexReg = NewSRL;
1783 return false;
1786 // Transform "(X >> SHIFT) & (MASK << C1)" to
1787 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1788 // matched to a BEXTR later. Returns false if the simplification is performed.
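// As a rough worked example, take SHIFT == 5 and MASK == 0xff with C1 == 2:
// "(X >> 5) & 0x3fc" is rewritten to "((X >> 7) & 0xff) << 2"; the srl+and
// pair is a BEXTR candidate and the trailing shl of 2 is folded into the
// address as AM.Scale = 4.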
1789 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1790 uint64_t Mask,
1791 SDValue Shift, SDValue X,
1792 X86ISelAddressMode &AM,
1793 const X86Subtarget &Subtarget) {
1794 if (Shift.getOpcode() != ISD::SRL ||
1795 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1796 !Shift.hasOneUse() || !N.hasOneUse())
1797 return true;
1799 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1800 if (!Subtarget.hasTBM() &&
1801 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1802 return true;
1804 // We need to ensure that the mask is a contiguous run of bits.
1805 if (!isShiftedMask_64(Mask)) return true;
1807 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1809 // The amount of shift we're trying to fit into the addressing mode is taken
1810 // from the trailing zeros of the mask.
1811 unsigned AMShiftAmt = countTrailingZeros(Mask);
1813 // There is nothing we can do here unless the mask is removing some bits.
1814 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1815 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1817 MVT VT = N.getSimpleValueType();
1818 SDLoc DL(N);
1819 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1820 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1821 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1822 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1823 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1824 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1826 // Insert the new nodes into the topological ordering. We must do this in
1827 // a valid topological ordering as nothing is going to go back and re-sort
1828 // these nodes. We continually insert before 'N' in sequence as this is
1829 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1830 // hierarchy left to express.
1831 insertDAGNode(DAG, N, NewSRLAmt);
1832 insertDAGNode(DAG, N, NewSRL);
1833 insertDAGNode(DAG, N, NewMask);
1834 insertDAGNode(DAG, N, NewAnd);
1835 insertDAGNode(DAG, N, NewSHLAmt);
1836 insertDAGNode(DAG, N, NewSHL);
1837 DAG.ReplaceAllUsesWith(N, NewSHL);
1838 DAG.RemoveDeadNode(N.getNode());
1840 AM.Scale = 1 << AMShiftAmt;
1841 AM.IndexReg = NewAnd;
1842 return false;
1845 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1846 unsigned Depth) {
1847 SDLoc dl(N);
1848 LLVM_DEBUG({
1849 dbgs() << "MatchAddress: ";
1850 AM.dump(CurDAG);
1852 // Limit recursion.
1853 if (Depth > 5)
1854 return matchAddressBase(N, AM);
1856 // If this is already a %rip relative address, we can only merge immediates
1857 // into it. Instead of handling this in every case, we handle it here.
1858 // RIP relative addressing: %rip + 32-bit displacement!
1859 if (AM.isRIPRelative()) {
1860 // FIXME: JumpTable and ExternalSymbol address currently don't like
1861 // displacements. It isn't very important, but this should be fixed for
1862 // consistency.
1863 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1864 return true;
1866 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1867 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1868 return false;
1869 return true;
1872 switch (N.getOpcode()) {
1873 default: break;
1874 case ISD::LOCAL_RECOVER: {
1875 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1876 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1877 // Use the symbol and don't prefix it.
1878 AM.MCSym = ESNode->getMCSymbol();
1879 return false;
1881 break;
1883 case ISD::Constant: {
1884 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1885 if (!foldOffsetIntoAddress(Val, AM))
1886 return false;
1887 break;
1890 case X86ISD::Wrapper:
1891 case X86ISD::WrapperRIP:
1892 if (!matchWrapper(N, AM))
1893 return false;
1894 break;
1896 case ISD::LOAD:
1897 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1898 return false;
1899 break;
1901 case ISD::FrameIndex:
1902 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1903 AM.Base_Reg.getNode() == nullptr &&
1904 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1905 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1906 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1907 return false;
1909 break;
1911 case ISD::SHL:
1912 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1913 break;
1915 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1916 unsigned Val = CN->getZExtValue();
1917 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1918 // that the base operand remains free for further matching. If
1919 // the base doesn't end up getting used, a post-processing step
1920 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1921 if (Val == 1 || Val == 2 || Val == 3) {
1922 AM.Scale = 1 << Val;
1923 SDValue ShVal = N.getOperand(0);
1925 // Okay, we know that we have a scale by now. However, if the scaled
1926 // value is an add of something and a constant, we can fold the
1927 // constant into the disp field here.
1928 if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1929 AM.IndexReg = ShVal.getOperand(0);
1930 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1931 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1932 if (!foldOffsetIntoAddress(Disp, AM))
1933 return false;
1936 AM.IndexReg = ShVal;
1937 return false;
1940 break;
1942 case ISD::SRL: {
1943 // Scale must not be used already.
1944 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1946 // We only handle up to 64-bit values here as those are what matter for
1947 // addressing mode optimizations.
1948 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1949 "Unexpected value size!");
1951 SDValue And = N.getOperand(0);
1952 if (And.getOpcode() != ISD::AND) break;
1953 SDValue X = And.getOperand(0);
1955 // The mask used for the transform is expected to be post-shift, but we
1956 // found the shift first so just apply the shift to the mask before passing
1957 // it down.
1958 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1959 !isa<ConstantSDNode>(And.getOperand(1)))
1960 break;
1961 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1963 // Try to fold the mask and shift into the scale, and return false if we
1964 // succeed.
1965 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1966 return false;
1967 break;
1970 case ISD::SMUL_LOHI:
1971 case ISD::UMUL_LOHI:
1972 // A mul_lohi where we need the low part can be folded as a plain multiply.
1973 if (N.getResNo() != 0) break;
1974 LLVM_FALLTHROUGH;
1975 case ISD::MUL:
1976 case X86ISD::MUL_IMM:
1977 // X*[3,5,9] -> X+X*[2,4,8]
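// e.g. "x*5" can become "leal (%reg,%reg,4), %dst": base and index are the
// same register and the scale encodes the remaining *4.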
1978 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1979 AM.Base_Reg.getNode() == nullptr &&
1980 AM.IndexReg.getNode() == nullptr) {
1981 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1982 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1983 CN->getZExtValue() == 9) {
1984 AM.Scale = unsigned(CN->getZExtValue())-1;
1986 SDValue MulVal = N.getOperand(0);
1987 SDValue Reg;
1989 // Okay, we know that we have a scale by now. However, if the scaled
1990 // value is an add of something and a constant, we can fold the
1991 // constant into the disp field here.
1992 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1993 isa<ConstantSDNode>(MulVal.getOperand(1))) {
1994 Reg = MulVal.getOperand(0);
1995 ConstantSDNode *AddVal =
1996 cast<ConstantSDNode>(MulVal.getOperand(1));
1997 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1998 if (foldOffsetIntoAddress(Disp, AM))
1999 Reg = N.getOperand(0);
2000 } else {
2001 Reg = N.getOperand(0);
2004 AM.IndexReg = AM.Base_Reg = Reg;
2005 return false;
2008 break;
2010 case ISD::SUB: {
2011 // Given A-B, if A can be completely folded into the address while
2012 // leaving the index field unused, use -B as the index.
2013 // This is a win if A has multiple parts that can be folded into
2014 // the address. It also saves a mov if the base register has
2015 // other uses, since it avoids a two-address sub instruction; however,
2016 // it costs an additional mov if the index register has other uses.
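// For example, for "GV - B" the global folds into the displacement, B is
// recorded as the index with AM.NegateIndex set, and the NEG of B is only
// materialized later, if this LEA form actually ends up being used.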
2018 // Add an artificial use to this node so that we can keep track of
2019 // it if it gets CSE'd with a different node.
2020 HandleSDNode Handle(N);
2022 // Test if the LHS of the sub can be folded.
2023 X86ISelAddressMode Backup = AM;
2024 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2025 N = Handle.getValue();
2026 AM = Backup;
2027 break;
2029 N = Handle.getValue();
2030 // Test if the index field is free for use.
2031 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2032 AM = Backup;
2033 break;
2036 int Cost = 0;
2037 SDValue RHS = N.getOperand(1);
2038 // If the RHS involves a register with multiple uses, this
2039 // transformation incurs an extra mov, due to the neg instruction
2040 // clobbering its operand.
2041 if (!RHS.getNode()->hasOneUse() ||
2042 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2043 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2044 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2045 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2046 RHS.getOperand(0).getValueType() == MVT::i32))
2047 ++Cost;
2048 // If the base is a register with multiple uses, this
2049 // transformation may save a mov.
2050 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2051 !AM.Base_Reg.getNode()->hasOneUse()) ||
2052 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2053 --Cost;
2054 // If the folded LHS was interesting, this transformation saves
2055 // address arithmetic.
2056 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2057 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2058 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2059 --Cost;
2060 // If it doesn't look like it may be an overall win, don't do it.
2061 if (Cost >= 0) {
2062 AM = Backup;
2063 break;
2066 // Ok, the transformation is legal and appears profitable. Go for it.
2067 // Negation will be emitted later to avoid creating dangling nodes if this
2068 // was an unprofitable LEA.
2069 AM.IndexReg = RHS;
2070 AM.NegateIndex = true;
2071 AM.Scale = 1;
2072 return false;
2075 case ISD::ADD:
2076 if (!matchAdd(N, AM, Depth))
2077 return false;
2078 break;
2080 case ISD::OR:
2081 // We want to look through a transform in InstCombine and DAGCombiner that
2082 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2083 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2084 // An 'lea' can then be used to match the shift (multiply) and add:
2085 // and $1, %esi
2086 // lea (%rsi, %rdi, 8), %rax
2087 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2088 !matchAdd(N, AM, Depth))
2089 return false;
2090 break;
2092 case ISD::AND: {
2093 // Perform some heroic transforms on an and of a constant-count shift
2094 // with a constant to enable use of the scaled offset field.
2096 // Scale must not be used already.
2097 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2099 // We only handle up to 64-bit values here as those are what matter for
2100 // addressing mode optimizations.
2101 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2102 "Unexpected value size!");
2104 if (!isa<ConstantSDNode>(N.getOperand(1)))
2105 break;
2107 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2108 SDValue Shift = N.getOperand(0);
2109 SDValue X = Shift.getOperand(0);
2111 uint64_t Mask = N.getConstantOperandVal(1);
2113 // Try to fold the mask and shift into an extract and scale.
2114 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2115 return false;
2117 // Try to fold the mask and shift directly into the scale.
2118 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2119 return false;
2121 // Try to fold the mask and shift into BEXTR and scale.
2122 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2123 return false;
2126 // Try to swap the mask and shift to place shifts which can be done as
2127 // a scale on the outside of the mask.
2128 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2129 return false;
2131 break;
2133 case ISD::ZERO_EXTEND: {
2134 // Try to widen a zexted shift left to the same size as its use, so we can
2135 // match the shift as a scale factor.
2136 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2137 break;
2138 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2139 break;
2141 // Give up if the shift is not a valid scale factor [1,2,3].
2142 SDValue Shl = N.getOperand(0);
2143 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2144 if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2145 break;
2147 // The narrow shift must only shift out zero bits (it must be 'nuw').
2148 // That makes it safe to widen to the destination type.
2149 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2150 ShAmtC->getZExtValue());
2151 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2152 break;
2154 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2155 MVT VT = N.getSimpleValueType();
2156 SDLoc DL(N);
2157 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2158 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2160 // Convert the shift to scale factor.
2161 AM.Scale = 1 << ShAmtC->getZExtValue();
2162 AM.IndexReg = Zext;
2164 insertDAGNode(*CurDAG, N, Zext);
2165 insertDAGNode(*CurDAG, N, NewShl);
2166 CurDAG->ReplaceAllUsesWith(N, NewShl);
2167 CurDAG->RemoveDeadNode(N.getNode());
2168 return false;
2172 return matchAddressBase(N, AM);
2175 /// Helper for MatchAddress. Add the specified node to the
2176 /// specified addressing mode without any further recursion.
2177 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2178 // Is the base register already occupied?
2179 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2180 // If so, check to see if the scale index register is set.
2181 if (!AM.IndexReg.getNode()) {
2182 AM.IndexReg = N;
2183 AM.Scale = 1;
2184 return false;
2187 // Otherwise, we cannot select it.
2188 return true;
2191 // Default, generate it as a register.
2192 AM.BaseType = X86ISelAddressMode::RegBase;
2193 AM.Base_Reg = N;
2194 return false;
2197 /// Helper for selectVectorAddr. Handles things that can be folded into a
2198 /// gather scatter address. The index register and scale should have already
2199 /// been handled.
2200 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2201 // TODO: Support other operations.
2202 switch (N.getOpcode()) {
2203 case ISD::Constant: {
2204 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2205 if (!foldOffsetIntoAddress(Val, AM))
2206 return false;
2207 break;
2209 case X86ISD::Wrapper:
2210 if (!matchWrapper(N, AM))
2211 return false;
2212 break;
2215 return matchAddressBase(N, AM);
2218 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2219 SDValue &Scale, SDValue &Index,
2220 SDValue &Disp, SDValue &Segment) {
2221 X86ISelAddressMode AM;
2222 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2223 AM.IndexReg = Mgs->getIndex();
2224 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2226 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2227 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2228 if (AddrSpace == 256)
2229 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2230 if (AddrSpace == 257)
2231 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2232 if (AddrSpace == 258)
2233 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2235 SDLoc DL(N);
2236 MVT VT = N.getSimpleValueType();
2238 // Try to match into the base and displacement fields.
2239 if (matchVectorAddress(N, AM))
2240 return false;
2242 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2243 return true;
2246 /// Returns true if it is able to pattern match an addressing mode.
2247 /// It returns, by reference, the operands which make up the maximal
2248 /// addressing mode it can match.
2250 /// Parent is the parent node of the addr operand that is being matched. It
2251 /// is always a load, store, atomic node, or null. It is only null when
2252 /// checking memory operands for inline asm nodes.
2253 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2254 SDValue &Scale, SDValue &Index,
2255 SDValue &Disp, SDValue &Segment) {
2256 X86ISelAddressMode AM;
2258 if (Parent &&
2259 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2260 // that are not a MemSDNode, and thus don't have proper addrspace info.
2261 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2262 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2263 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2264 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2265 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2266 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2267 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2268 unsigned AddrSpace =
2269 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2270 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2271 if (AddrSpace == 256)
2272 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2273 if (AddrSpace == 257)
2274 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2275 if (AddrSpace == 258)
2276 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2279 // Save the DL and VT before calling matchAddress, it can invalidate N.
2280 SDLoc DL(N);
2281 MVT VT = N.getSimpleValueType();
2283 if (matchAddress(N, AM))
2284 return false;
2286 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2287 return true;
2290 // We can only fold a load if all nodes between it and the root node have a
2291 // single use. If there are additional uses, we could end up duplicating the
2292 // load.
2293 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2294 while (User != Root) {
2295 if (!User->hasOneUse())
2296 return false;
2297 User = *User->use_begin();
2300 return true;
2303 /// Match a scalar SSE load. In particular, we want to match a load whose top
2304 /// elements are either undef or zeros. The load flavor is derived from the
2305 /// type of N, which is either v4f32 or v2f64.
2307 /// We also return:
2308 /// PatternNodeWithChain: this is the matched node that has a chain input and
2309 /// output.
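/// A typical candidate (illustrative): (v4f32 (scalar_to_vector (load p)))
/// feeding an SSE scalar op, where the upper elements are don't-care, so the
/// load can be folded straight into the instruction's memory operand.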
2310 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2311 SDValue N, SDValue &Base,
2312 SDValue &Scale, SDValue &Index,
2313 SDValue &Disp, SDValue &Segment,
2314 SDValue &PatternNodeWithChain) {
2315 if (!hasSingleUsesFromRoot(Root, Parent))
2316 return false;
2318 // We can allow a full vector load here since narrowing a load is ok unless
2319 // it's volatile or atomic.
2320 if (ISD::isNON_EXTLoad(N.getNode())) {
2321 LoadSDNode *LD = cast<LoadSDNode>(N);
2322 if (LD->isSimple() &&
2323 IsProfitableToFold(N, LD, Root) &&
2324 IsLegalToFold(N, Parent, Root, OptLevel)) {
2325 PatternNodeWithChain = N;
2326 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2327 Segment);
2331 // We can also match the special zero extended load opcode.
2332 if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2333 PatternNodeWithChain = N;
2334 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2335 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2336 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2337 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2338 Segment);
2342 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2343 // once. Otherwise the load might get duplicated and the chain output of the
2344 // duplicate load will not be observed by all dependencies.
2345 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2346 PatternNodeWithChain = N.getOperand(0);
2347 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2348 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2349 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2350 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2351 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2352 Segment);
2356 return false;
2360 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2361 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2362 uint64_t ImmVal = CN->getZExtValue();
2363 if (!isUInt<32>(ImmVal))
2364 return false;
2366 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2367 return true;
2370 // In static codegen with small code model, we can get the address of a label
2372 // into a register with 'movl'.
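// e.g. "movl $sym, %eax" zero-extends into the full 64-bit register, which
// is valid whenever the symbol's address is known to fit in 32 bits (small
// code model, or an absolute symbol with a bounded range as checked below).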
2372 if (N->getOpcode() != X86ISD::Wrapper)
2373 return false;
2375 N = N.getOperand(0);
2377 // At least GNU as does not accept 'movl' for TPOFF relocations.
2378 // FIXME: We could use 'movl' when we know we are targeting MC.
2379 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2380 return false;
2382 Imm = N;
2383 if (N->getOpcode() != ISD::TargetGlobalAddress)
2384 return TM.getCodeModel() == CodeModel::Small;
2386 Optional<ConstantRange> CR =
2387 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2388 if (!CR)
2389 return TM.getCodeModel() == CodeModel::Small;
2391 return CR->getUnsignedMax().ult(1ull << 32);
2394 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2395 SDValue &Scale, SDValue &Index,
2396 SDValue &Disp, SDValue &Segment) {
2397 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2398 SDLoc DL(N);
2400 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2401 return false;
2403 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2404 if (RN && RN->getReg() == 0)
2405 Base = CurDAG->getRegister(0, MVT::i64);
2406 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2407 // Base could already be %rip, particularly in the x32 ABI.
2408 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2409 MVT::i64), 0);
2410 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2411 Base);
2414 RN = dyn_cast<RegisterSDNode>(Index);
2415 if (RN && RN->getReg() == 0)
2416 Index = CurDAG->getRegister(0, MVT::i64);
2417 else {
2418 assert(Index.getValueType() == MVT::i32 &&
2419 "Expect to be extending 32-bit registers for use in LEA");
2420 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2421 MVT::i64), 0);
2422 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2423 Index);
2426 return true;
2429 /// Calls SelectAddr and determines if the maximal addressing
2430 /// mode it matches can be cost effectively emitted as an LEA instruction.
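/// As a rough illustration of the scoring below: an address that is only a
/// scaled register, e.g. "leal (,%reg,2), %dst", scores 2 and is rejected in
/// favor of an add or shift, while a base + index + displacement address
/// scores 3 and is kept as an LEA.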
2431 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2432 SDValue &Base, SDValue &Scale,
2433 SDValue &Index, SDValue &Disp,
2434 SDValue &Segment) {
2435 X86ISelAddressMode AM;
2437 // Save the DL and VT before calling matchAddress, it can invalidate N.
2438 SDLoc DL(N);
2439 MVT VT = N.getSimpleValueType();
2441 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2442 // segments.
2443 SDValue Copy = AM.Segment;
2444 SDValue T = CurDAG->getRegister(0, MVT::i32);
2445 AM.Segment = T;
2446 if (matchAddress(N, AM))
2447 return false;
2448 assert (T == AM.Segment);
2449 AM.Segment = Copy;
2451 unsigned Complexity = 0;
2452 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2453 Complexity = 1;
2454 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2455 Complexity = 4;
2457 if (AM.IndexReg.getNode())
2458 Complexity++;
2460 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or use
2461 // a simple shift.
2462 if (AM.Scale > 1)
2463 Complexity++;
2465 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2466 // to a LEA. This is determined with some experimentation but is by no means
2467 // optimal (especially for code size consideration). LEA is nice because of
2468 // its three-address nature. Tweak the cost function again when we can run
2469 // convertToThreeAddress() at register allocation time.
2470 if (AM.hasSymbolicDisplacement()) {
2471 // For X86-64, always use LEA to materialize RIP-relative addresses.
2472 if (Subtarget->is64Bit())
2473 Complexity = 4;
2474 else
2475 Complexity += 2;
2478 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2479 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2480 // duplicating flag-producing instructions later in the pipeline.
2481 if (N.getOpcode() == ISD::ADD) {
2482 auto isMathWithFlags = [](SDValue V) {
2483 switch (V.getOpcode()) {
2484 case X86ISD::ADD:
2485 case X86ISD::SUB:
2486 case X86ISD::ADC:
2487 case X86ISD::SBB:
2488 /* TODO: These opcodes can be added safely, but we may want to justify
2489 their inclusion for different reasons (better for reg-alloc).
2490 case X86ISD::SMUL:
2491 case X86ISD::UMUL:
2492 case X86ISD::OR:
2493 case X86ISD::XOR:
2494 case X86ISD::AND:
2496 // Value 1 is the flag output of the node - verify it's not dead.
2497 return !SDValue(V.getNode(), 1).use_empty();
2498 default:
2499 return false;
2502 // TODO: This could be an 'or' rather than 'and' to make the transform more
2503 // likely to happen. We might want to factor in whether there's a
2504 // load folding opportunity for the math op that disappears with LEA.
2505 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2506 Complexity++;
2509 if (AM.Disp)
2510 Complexity++;
2512 // If it isn't worth using an LEA, reject it.
2513 if (Complexity <= 2)
2514 return false;
2516 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2517 return true;
2520 /// This is only run on TargetGlobalTLSAddress nodes.
2521 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2522 SDValue &Scale, SDValue &Index,
2523 SDValue &Disp, SDValue &Segment) {
2524 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2525 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2527 X86ISelAddressMode AM;
2528 AM.GV = GA->getGlobal();
2529 AM.Disp += GA->getOffset();
2530 AM.SymbolFlags = GA->getTargetFlags();
2532 MVT VT = N.getSimpleValueType();
2533 if (VT == MVT::i32) {
2534 AM.Scale = 1;
2535 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2538 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2539 return true;
2542 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2543 if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2544 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2545 N.getValueType());
2546 return true;
2549 // Keep track of the original value type and whether this value was
2550 // truncated. If we see a truncation from pointer type to VT that truncates
2551 // bits that are known to be zero, we can use a narrow reference.
2552 EVT VT = N.getValueType();
2553 bool WasTruncated = false;
2554 if (N.getOpcode() == ISD::TRUNCATE) {
2555 WasTruncated = true;
2556 N = N.getOperand(0);
2559 if (N.getOpcode() != X86ISD::Wrapper)
2560 return false;
2562 // We can only use non-GlobalValues as immediates if they were not truncated,
2563 // as we do not have any range information. If we have a GlobalValue and the
2564 // address was not truncated, we can select it as an operand directly.
2565 unsigned Opc = N.getOperand(0)->getOpcode();
2566 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2567 Op = N.getOperand(0);
2568 // We can only select the operand directly if we didn't have to look past a
2569 // truncate.
2570 return !WasTruncated;
2573 // Check that the global's range fits into VT.
2574 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2575 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2576 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2577 return false;
2579 // Okay, we can use a narrow reference.
2580 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2581 GA->getOffset(), GA->getTargetFlags());
2582 return true;
2585 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2586 SDValue &Base, SDValue &Scale,
2587 SDValue &Index, SDValue &Disp,
2588 SDValue &Segment) {
2589 assert(Root && P && "Unknown root/parent nodes");
2590 if (!ISD::isNON_EXTLoad(N.getNode()) ||
2591 !IsProfitableToFold(N, P, Root) ||
2592 !IsLegalToFold(N, P, Root, OptLevel))
2593 return false;
2595 return selectAddr(N.getNode(),
2596 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2599 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
2600 SDValue &Base, SDValue &Scale,
2601 SDValue &Index, SDValue &Disp,
2602 SDValue &Segment) {
2603 assert(Root && P && "Unknown root/parent nodes");
2604 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
2605 !IsProfitableToFold(N, P, Root) ||
2606 !IsLegalToFold(N, P, Root, OptLevel))
2607 return false;
2609 return selectAddr(N.getNode(),
2610 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2613 /// Return an SDNode that returns the value of the global base register.
2614 /// Output instructions required to initialize the global base register,
2615 /// if necessary.
2616 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2617 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2618 auto &DL = MF->getDataLayout();
2619 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2622 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2623 if (N->getOpcode() == ISD::TRUNCATE)
2624 N = N->getOperand(0).getNode();
2625 if (N->getOpcode() != X86ISD::Wrapper)
2626 return false;
2628 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2629 if (!GA)
2630 return false;
2632 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2633 return CR && CR->getSignedMin().sge(-1ull << Width) &&
2634 CR->getSignedMax().slt(1ull << Width);
2637 static X86::CondCode getCondFromNode(SDNode *N) {
2638 assert(N->isMachineOpcode() && "Unexpected node");
2639 X86::CondCode CC = X86::COND_INVALID;
2640 unsigned Opc = N->getMachineOpcode();
2641 if (Opc == X86::JCC_1)
2642 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2643 else if (Opc == X86::SETCCr)
2644 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2645 else if (Opc == X86::SETCCm)
2646 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2647 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2648 Opc == X86::CMOV64rr)
2649 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2650 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2651 Opc == X86::CMOV64rm)
2652 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2654 return CC;
2657 /// Test whether the given X86ISD::CMP node has any users that use a flag
2658 /// other than ZF; returns true if there are none.
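/// For example, a compare whose flags feed only JE/JNE (or SETE/SETNE) reads
/// just ZF and passes this check, while one feeding JL also depends on SF/OF
/// and fails it.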
2659 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2660 // Examine each user of the node.
2661 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2662 UI != UE; ++UI) {
2663 // Only check things that use the flags.
2664 if (UI.getUse().getResNo() != Flags.getResNo())
2665 continue;
2666 // Only examine CopyToReg uses that copy to EFLAGS.
2667 if (UI->getOpcode() != ISD::CopyToReg ||
2668 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2669 return false;
2670 // Examine each user of the CopyToReg use.
2671 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2672 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2673 // Only examine the Flag result.
2674 if (FlagUI.getUse().getResNo() != 1) continue;
2675 // Anything unusual: assume conservatively.
2676 if (!FlagUI->isMachineOpcode()) return false;
2677 // Examine the condition code of the user.
2678 X86::CondCode CC = getCondFromNode(*FlagUI);
2680 switch (CC) {
2681 // Comparisons which only use the zero flag.
2682 case X86::COND_E: case X86::COND_NE:
2683 continue;
2684 // Anything else: assume conservatively.
2685 default:
2686 return false;
2690 return true;
2693 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2694 /// flag to be accurate; returns true if there are none.
2695 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2696 // Examine each user of the node.
2697 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2698 UI != UE; ++UI) {
2699 // Only check things that use the flags.
2700 if (UI.getUse().getResNo() != Flags.getResNo())
2701 continue;
2702 // Only examine CopyToReg uses that copy to EFLAGS.
2703 if (UI->getOpcode() != ISD::CopyToReg ||
2704 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2705 return false;
2706 // Examine each user of the CopyToReg use.
2707 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2708 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2709 // Only examine the Flag result.
2710 if (FlagUI.getUse().getResNo() != 1) continue;
2711 // Anything unusual: assume conservatively.
2712 if (!FlagUI->isMachineOpcode()) return false;
2713 // Examine the condition code of the user.
2714 X86::CondCode CC = getCondFromNode(*FlagUI);
2716 switch (CC) {
2717 // Comparisons which don't examine the SF flag.
2718 case X86::COND_A: case X86::COND_AE:
2719 case X86::COND_B: case X86::COND_BE:
2720 case X86::COND_E: case X86::COND_NE:
2721 case X86::COND_O: case X86::COND_NO:
2722 case X86::COND_P: case X86::COND_NP:
2723 continue;
2724 // Anything else: assume conservatively.
2725 default:
2726 return false;
2730 return true;
2733 static bool mayUseCarryFlag(X86::CondCode CC) {
2734 switch (CC) {
2735 // Comparisons which don't examine the CF flag.
2736 case X86::COND_O: case X86::COND_NO:
2737 case X86::COND_E: case X86::COND_NE:
2738 case X86::COND_S: case X86::COND_NS:
2739 case X86::COND_P: case X86::COND_NP:
2740 case X86::COND_L: case X86::COND_GE:
2741 case X86::COND_G: case X86::COND_LE:
2742 return false;
2743 // Anything else: assume conservatively.
2744 default:
2745 return true;
2749 /// Test whether the given node which sets flags has any uses which require the
2750 /// CF flag to be accurate; returns true if there are none.
2751 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2752 // Examine each user of the node.
2753 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2754 UI != UE; ++UI) {
2755 // Only check things that use the flags.
2756 if (UI.getUse().getResNo() != Flags.getResNo())
2757 continue;
2759 unsigned UIOpc = UI->getOpcode();
2761 if (UIOpc == ISD::CopyToReg) {
2762 // Only examine CopyToReg uses that copy to EFLAGS.
2763 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2764 return false;
2765 // Examine each user of the CopyToReg use.
2766 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2767 FlagUI != FlagUE; ++FlagUI) {
2768 // Only examine the Flag result.
2769 if (FlagUI.getUse().getResNo() != 1)
2770 continue;
2771 // Anything unusual: assume conservatively.
2772 if (!FlagUI->isMachineOpcode())
2773 return false;
2774 // Examine the condition code of the user.
2775 X86::CondCode CC = getCondFromNode(*FlagUI);
2777 if (mayUseCarryFlag(CC))
2778 return false;
2781 // This CopyToReg is ok. Move on to the next user.
2782 continue;
2785 // This might be an unselected node. So look for the pre-isel opcodes that
2786 // use flags.
2787 unsigned CCOpNo;
2788 switch (UIOpc) {
2789 default:
2790 // Something unusual. Be conservative.
2791 return false;
2792 case X86ISD::SETCC: CCOpNo = 0; break;
2793 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2794 case X86ISD::CMOV: CCOpNo = 2; break;
2795 case X86ISD::BRCOND: CCOpNo = 2; break;
2798 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2799 if (mayUseCarryFlag(CC))
2800 return false;
2802 return true;
2805 /// Check whether or not the chain ending in StoreNode is suitable for doing
2806 /// the {load; op; store} read-modify-write transformation.
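/// As a rough illustration of the rewrite this enables:
///   movl (%rdi), %eax ; addl %esi, %eax ; movl %eax, (%rdi)
/// can become the single read-modify-write instruction
///   addl %esi, (%rdi)
/// provided the loaded value and the address have no other conflicting uses.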
2807 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2808 SDValue StoredVal, SelectionDAG *CurDAG,
2809 unsigned LoadOpNo,
2810 LoadSDNode *&LoadNode,
2811 SDValue &InputChain) {
2812 // Is the stored value result 0 of the operation?
2813 if (StoredVal.getResNo() != 0) return false;
2815 // Are there other uses of the operation other than the store?
2816 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2818 // Is the store non-extending and non-indexed?
2819 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2820 return false;
2822 SDValue Load = StoredVal->getOperand(LoadOpNo);
2823 // Is the stored value a non-extending and non-indexed load?
2824 if (!ISD::isNormalLoad(Load.getNode())) return false;
2826 // Return LoadNode by reference.
2827 LoadNode = cast<LoadSDNode>(Load);
2829 // Is store the only read of the loaded value?
2830 if (!Load.hasOneUse())
2831 return false;
2833 // Is the address of the store the same as the load?
2834 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2835 LoadNode->getOffset() != StoreNode->getOffset())
2836 return false;
2838 bool FoundLoad = false;
2839 SmallVector<SDValue, 4> ChainOps;
2840 SmallVector<const SDNode *, 4> LoopWorklist;
2841 SmallPtrSet<const SDNode *, 16> Visited;
2842 const unsigned int Max = 1024;
2844 // Visualization of Load-Op-Store fusion:
2845 // -------------------------
2846 // Legend:
2847 // *-lines = Chain operand dependencies.
2848 // |-lines = Normal operand dependencies.
2849 // Dependencies flow down and right. n-suffix references multiple nodes.
2851 // C Xn C
2852 // * * *
2853 // * * *
2854 // Xn A-LD Yn TF Yn
2855 // * * \ | * |
2856 // * * \ | * |
2857 // * * \ | => A--LD_OP_ST
2858 // * * \| \
2859 // TF OP \
2860 // * | \ Zn
2861 // * | \
2862 // A-ST Zn
2865 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2866 // #2: Yn -> LD
2867 // #3: ST -> Zn
2869 // Ensure the transform is safe by checking for the dual
2870 // dependencies to make sure we do not induce a loop.
2872 // As LD is a predecessor to both OP and ST we can do this by checking:
2873 // a). if LD is a predecessor to a member of Xn or Yn.
2874 // b). if a Zn is a predecessor to ST.
2876 // However, (b) can only occur through being a chain predecessor to
2877 // ST, which is the same as Zn being a member or predecessor of Xn,
2878 // which is a subset of LD being a predecessor of Xn. So it's
2879 // subsumed by check (a).
2881 SDValue Chain = StoreNode->getChain();
2883 // Gather X elements in ChainOps.
2884 if (Chain == Load.getValue(1)) {
2885 FoundLoad = true;
2886 ChainOps.push_back(Load.getOperand(0));
2887 } else if (Chain.getOpcode() == ISD::TokenFactor) {
2888 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2889 SDValue Op = Chain.getOperand(i);
2890 if (Op == Load.getValue(1)) {
2891 FoundLoad = true;
2892 // Drop Load, but keep its chain. No cycle check necessary.
2893 ChainOps.push_back(Load.getOperand(0));
2894 continue;
2896 LoopWorklist.push_back(Op.getNode());
2897 ChainOps.push_back(Op);
2901 if (!FoundLoad)
2902 return false;
2904 // Worklist is currently Xn. Add Yn to worklist.
2905 for (SDValue Op : StoredVal->ops())
2906 if (Op.getNode() != LoadNode)
2907 LoopWorklist.push_back(Op.getNode());
2909 // Check (a) if Load is a predecessor to Xn + Yn
2910 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2911 true))
2912 return false;
2914 InputChain =
2915 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2916 return true;
2919 // Change a chain of {load; op; store} of the same value into a simple op
2920 // through memory of that value, if the uses of the modified value and its
2921 // address are suitable.
2923 // The tablegen memory operand pattern is currently not able to match the
2924 // case where the EFLAGS result of the original operation is used.
2926 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2927 // be transferred from a node in the pattern to the result node, probably with
2928 // a new keyword. For example, we have this
2929 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2930 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2931 // (implicit EFLAGS)]>;
2932 // but maybe need something like this
2933 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2934 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2935 // (transferrable EFLAGS)]>;
2937 // Until then, we manually fold these and instruction select the operation
2938 // here.
2939 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2940 StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2941 SDValue StoredVal = StoreNode->getOperand(1);
2942 unsigned Opc = StoredVal->getOpcode();
2944 // Before we try to select anything, make sure this is a memory operand size
2945 // and opcode we can handle. Note that this must match the code below that
2946 // actually lowers the opcodes.
2947 EVT MemVT = StoreNode->getMemoryVT();
2948 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2949 MemVT != MVT::i8)
2950 return false;
2952 bool IsCommutable = false;
2953 bool IsNegate = false;
2954 switch (Opc) {
2955 default:
2956 return false;
2957 case X86ISD::SUB:
2958 IsNegate = isNullConstant(StoredVal.getOperand(0));
2959 break;
2960 case X86ISD::SBB:
2961 break;
2962 case X86ISD::ADD:
2963 case X86ISD::ADC:
2964 case X86ISD::AND:
2965 case X86ISD::OR:
2966 case X86ISD::XOR:
2967 IsCommutable = true;
2968 break;
2971 unsigned LoadOpNo = IsNegate ? 1 : 0;
2972 LoadSDNode *LoadNode = nullptr;
2973 SDValue InputChain;
2974 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2975 LoadNode, InputChain)) {
2976 if (!IsCommutable)
2977 return false;
2979 // This operation is commutable, try the other operand.
2980 LoadOpNo = 1;
2981 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2982 LoadNode, InputChain))
2983 return false;
2986 SDValue Base, Scale, Index, Disp, Segment;
2987 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2988 Segment))
2989 return false;
2991 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2992 unsigned Opc8) {
2993 switch (MemVT.getSimpleVT().SimpleTy) {
2994 case MVT::i64:
2995 return Opc64;
2996 case MVT::i32:
2997 return Opc32;
2998 case MVT::i16:
2999 return Opc16;
3000 case MVT::i8:
3001 return Opc8;
3002 default:
3003 llvm_unreachable("Invalid size!");
3007 MachineSDNode *Result;
3008 switch (Opc) {
3009 case X86ISD::SUB:
3010 // Handle negate.
3011 if (IsNegate) {
3012 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3013 X86::NEG8m);
3014 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3015 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3016 MVT::Other, Ops);
3017 break;
3019 LLVM_FALLTHROUGH;
3020 case X86ISD::ADD:
3021 // Try to match inc/dec.
3022 if (!Subtarget->slowIncDec() || OptForSize) {
3023 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3024 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3025 // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
3026 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3027 unsigned NewOpc =
3028 ((Opc == X86ISD::ADD) == IsOne)
3029 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3030 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3031 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3032 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3033 MVT::Other, Ops);
3034 break;
3037 LLVM_FALLTHROUGH;
3038 case X86ISD::ADC:
3039 case X86ISD::SBB:
3040 case X86ISD::AND:
3041 case X86ISD::OR:
3042 case X86ISD::XOR: {
3043 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3044 switch (Opc) {
3045 case X86ISD::ADD:
3046 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3047 X86::ADD8mr);
3048 case X86ISD::ADC:
3049 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3050 X86::ADC8mr);
3051 case X86ISD::SUB:
3052 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3053 X86::SUB8mr);
3054 case X86ISD::SBB:
3055 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3056 X86::SBB8mr);
3057 case X86ISD::AND:
3058 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3059 X86::AND8mr);
3060 case X86ISD::OR:
3061 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3062 case X86ISD::XOR:
3063 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3064 X86::XOR8mr);
3065 default:
3066 llvm_unreachable("Invalid opcode!");
3069 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3070 switch (Opc) {
3071 case X86ISD::ADD:
3072 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3073 case X86ISD::ADC:
3074 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3075 case X86ISD::SUB:
3076 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3077 case X86ISD::SBB:
3078 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3079 case X86ISD::AND:
3080 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3081 case X86ISD::OR:
3082 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3083 case X86ISD::XOR:
3084 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3085 default:
3086 llvm_unreachable("Invalid opcode!");
3089 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3090 switch (Opc) {
3091 case X86ISD::ADD:
3092 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3093 X86::ADD8mi);
3094 case X86ISD::ADC:
3095 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3096 X86::ADC8mi);
3097 case X86ISD::SUB:
3098 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3099 X86::SUB8mi);
3100 case X86ISD::SBB:
3101 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3102 X86::SBB8mi);
3103 case X86ISD::AND:
3104 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3105 X86::AND8mi);
3106 case X86ISD::OR:
3107 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3108 X86::OR8mi);
3109 case X86ISD::XOR:
3110 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3111 X86::XOR8mi);
3112 default:
3113 llvm_unreachable("Invalid opcode!");
3117 unsigned NewOpc = SelectRegOpcode(Opc);
3118 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3120 // See if the operand is a constant that we can fold into an immediate
3121 // operand.
3122 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3123 int64_t OperandV = OperandC->getSExtValue();
3125 // Check if we can shrink the operand enough to fit in an immediate (or
3126 // fit into a smaller immediate) by negating it and switching the
3127 // operation.
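// For example, an i32 "add $128" does not fit in an imm8, but the negated
// form "sub $-128" does, so when the carry flag is unused we flip ADD<->SUB
// to get the smaller encoding.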
3128 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3129 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3130 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3131 isInt<32>(-OperandV))) &&
3132 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3133 OperandV = -OperandV;
3134 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3137 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3138 // the larger immediate operand.
3139 if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3140 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3141 NewOpc = SelectImm8Opcode(Opc);
3142 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3143 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3144 NewOpc = SelectImmOpcode(Opc);
3148 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3149 SDValue CopyTo =
3150 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3151 StoredVal.getOperand(2), SDValue());
3153 const SDValue Ops[] = {Base, Scale, Index, Disp,
3154 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3155 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3156 Ops);
3157 } else {
3158 const SDValue Ops[] = {Base, Scale, Index, Disp,
3159 Segment, Operand, InputChain};
3160 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3161 Ops);
3163 break;
3165 default:
3166 llvm_unreachable("Invalid opcode!");
3169 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3170 LoadNode->getMemOperand()};
3171 CurDAG->setNodeMemRefs(Result, MemOps);
3173 // Update Load Chain uses as well.
3174 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3175 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3176 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3177 CurDAG->RemoveDeadNode(Node);
3178 return true;
3181 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3182 // Where Mask is one of the following patterns:
3183 // a) x & (1 << nbits) - 1
3184 // b) x & ~(-1 << nbits)
3185 // c) x & (-1 >> (32 - y))
3186 // d) x << (32 - y) >> (32 - y)
3187 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3188 assert(
3189 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3190 "Should be either an and-mask, or right-shift after clearing high bits.");
3192 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3193 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3194 return false;
3196 MVT NVT = Node->getSimpleValueType(0);
3198 // Only supported for 32 and 64 bits.
3199 if (NVT != MVT::i32 && NVT != MVT::i64)
3200 return false;
3202 SDValue NBits;
3204 // If we have BMI2's BZHI, we are OK with multi-use patterns.
3205 // Else, if we only have BMI1's BEXTR, we require one-use.
3206 const bool CanHaveExtraUses = Subtarget->hasBMI2();
3207 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3208 return CanHaveExtraUses ||
3209 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3211 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3212 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3214 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3215 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3216 assert(V.getSimpleValueType() == MVT::i32 &&
3217 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3218 "Expected i64 -> i32 truncation");
3219 V = V.getOperand(0);
3221 return V;
3224 // a) x & ((1 << nbits) + (-1))
3225 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3226 &NBits](SDValue Mask) -> bool {
3227 // Match `add`. Must only have one use!
3228 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3229 return false;
3230 // We should be adding an all-ones constant (i.e. subtracting one).
3231 if (!isAllOnesConstant(Mask->getOperand(1)))
3232 return false;
3233 // Match `1 << nbits`. Might be truncated. Must only have one use!
3234 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3235 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3236 return false;
3237 if (!isOneConstant(M0->getOperand(0)))
3238 return false;
3239 NBits = M0->getOperand(1);
3240 return true;
3243 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3244 V = peekThroughOneUseTruncation(V);
3245 return CurDAG->MaskedValueIsAllOnes(
3246 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3247 NVT.getSizeInBits()));
3250 // b) x & ~(-1 << nbits)
3251 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3252 &NBits](SDValue Mask) -> bool {
3253 // Match `~()`. Must only have one use!
3254 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3255 return false;
3256 // The -1 only has to be all-ones for the final Node's NVT.
3257 if (!isAllOnes(Mask->getOperand(1)))
3258 return false;
3259 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3260 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3261 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3262 return false;
3263 // The -1 only has to be all-ones for the final Node's NVT.
3264 if (!isAllOnes(M0->getOperand(0)))
3265 return false;
3266 NBits = M0->getOperand(1);
3267 return true;
3270 // Match potentially-truncated (bitwidth - y)
3271 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3272 unsigned Bitwidth) {
3273 // Skip over a truncate of the shift amount.
3274 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3275 ShiftAmt = ShiftAmt.getOperand(0);
3276 // The trunc should have been the only user of the real shift amount.
3277 if (!checkOneUse(ShiftAmt))
3278 return false;
3280 // Match the shift amount as: (bitwidth - y). It should go away, too.
3281 if (ShiftAmt.getOpcode() != ISD::SUB)
3282 return false;
3283 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3284 if (!V0 || V0->getZExtValue() != Bitwidth)
3285 return false;
3286 NBits = ShiftAmt.getOperand(1);
3287 return true;
3290 // c) x & (-1 >> (32 - y))
3291 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3292 matchShiftAmt](SDValue Mask) -> bool {
3293 // The mask itself may be truncated.
3294 Mask = peekThroughOneUseTruncation(Mask);
3295 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3296 // Match `l>>`. Must only have one use!
3297 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3298 return false;
3299 // We should be shifting truly all-ones constant.
3300 if (!isAllOnesConstant(Mask.getOperand(0)))
3301 return false;
3302 SDValue M1 = Mask.getOperand(1);
3303 // The shift amount should not be used externally.
3304 if (!checkOneUse(M1))
3305 return false;
3306 return matchShiftAmt(M1, Bitwidth);
3309 SDValue X;
3311 // d) x << (32 - y) >> (32 - y)
3312 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3313 &X](SDNode *Node) -> bool {
3314 if (Node->getOpcode() != ISD::SRL)
3315 return false;
3316 SDValue N0 = Node->getOperand(0);
3317 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3318 return false;
3319 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3320 SDValue N1 = Node->getOperand(1);
3321 SDValue N01 = N0->getOperand(1);
3322 // Both of the shifts must be by the exact same value.
3323 // There should not be any uses of the shift amount outside of the pattern.
3324 if (N1 != N01 || !checkTwoUse(N1))
3325 return false;
3326 if (!matchShiftAmt(N1, Bitwidth))
3327 return false;
3328 X = N0->getOperand(0);
3329 return true;
3332 auto matchLowBitMask = [matchPatternA, matchPatternB,
3333 matchPatternC](SDValue Mask) -> bool {
3334 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3337 if (Node->getOpcode() == ISD::AND) {
3338 X = Node->getOperand(0);
3339 SDValue Mask = Node->getOperand(1);
3341 if (matchLowBitMask(Mask)) {
3342 // Great.
3343 } else {
3344 std::swap(X, Mask);
3345 if (!matchLowBitMask(Mask))
3346 return false;
3348 } else if (!matchPatternD(Node))
3349 return false;
3351 SDLoc DL(Node);
3353 // Truncate the shift amount.
3354 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3355 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3357 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3358 // All the other bits are undefined; we do not care about them.
3359 SDValue ImplDef = SDValue(
3360 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3361 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3363 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3364 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3365 NBits = SDValue(
3366 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3367 NBits, SRIdxVal), 0);
3368 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3370 if (Subtarget->hasBMI2()) {
3371 // Great, just emit the BZHI.
3372 if (NVT != MVT::i32) {
3373 // But have to place the bit count into the wide-enough register first.
3374 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3375 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3378 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3379 ReplaceNode(Node, Extract.getNode());
3380 SelectCode(Extract.getNode());
3381 return true;
3384 // Else, if we do *NOT* have BMI2, let's find out whether 'X' is
3385 // *logically* shifted (potentially with a one-use trunc in between),
3386 // and whether the truncation was the only use of the shift;
3387 // if so, look past the one-use truncation.
3389 SDValue RealX = peekThroughOneUseTruncation(X);
3390 // FIXME: only if the shift is one-use?
3391 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3392 X = RealX;
3395 MVT XVT = X.getSimpleValueType();
3397 // Else, emitting BEXTR requires one more step.
3398 // The 'control' of BEXTR has the pattern of:
3399 // [15...8 bit][ 7...0 bit] location
3400 // [ bit count][ shift] name
3401 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3403 // Shift NBits left by 8 bits, thus producing 'control'.
3404 // This makes the low 8 bits zero.
3405 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3406 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3407 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3409 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3410 // FIXME: only if the shift is one-use?
3411 if (X.getOpcode() == ISD::SRL) {
3412 SDValue ShiftAmt = X.getOperand(1);
3413 X = X.getOperand(0);
3415 assert(ShiftAmt.getValueType() == MVT::i8 &&
3416 "Expected shift amount to be i8");
3418 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3419 // We could zext to i16 in some form, but we intentionally don't do that.
3420 SDValue OrigShiftAmt = ShiftAmt;
3421 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3422 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3424 // And now 'or' these low 8 bits of shift amount into the 'control'.
3425 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3426 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3429 // But have to place the 'control' into the wide-enough register first.
3430 if (XVT != MVT::i32) {
3431 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3432 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3435 // And finally, form the BEXTR itself.
3436 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3438 // The 'X' was originally truncated. Do that now.
3439 if (XVT != NVT) {
3440 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3441 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3444 ReplaceNode(Node, Extract.getNode());
3445 SelectCode(Extract.getNode());
3447 return true;
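// Illustration only: all four mask shapes a)-d) matched by matchBitExtract()
// above compute "the low nbits of x", which is exactly what BZHI (and BEXTR
// with a start position of 0) produces. A minimal scalar reference model of
// the 32-bit case; the helper is hypothetical and assumes 0 <= NBits <= 31.
static inline uint32_t sketchLowBits32(uint32_t X, unsigned NBits) {
  uint32_t MaskA = (1u << NBits) - 1u;                          // a) (1 << n) - 1
  uint32_t MaskB = ~(~0u << NBits);                             // b) ~(-1 << n)
  uint32_t MaskC = NBits ? ~0u >> (32 - NBits) : 0;             // c) -1 >> (32 - n)
  uint32_t ValD  = NBits ? (X << (32 - NBits)) >> (32 - NBits) : 0; // d)
  (void)MaskB; (void)MaskC; (void)ValD; // all describe the same low-bit extract
  return X & MaskA;                     // == BZHI(X, NBits)
}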
3450 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3451 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3452 MVT NVT = Node->getSimpleValueType(0);
3453 SDLoc dl(Node);
3455 SDValue N0 = Node->getOperand(0);
3456 SDValue N1 = Node->getOperand(1);
3458 // If we have TBM we can use an immediate for the control. If we have BMI
3459 // we should only do this if the BEXTR instruction is implemented well.
3460 // Otherwise moving the control into a register makes this more costly.
3461 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3462 // hoisting the move immediate would make it worthwhile with a less optimal
3463 // BEXTR?
3464 bool PreferBEXTR =
3465 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
3466 if (!PreferBEXTR && !Subtarget->hasBMI2())
3467 return nullptr;
3469 // Must have a shift right.
3470 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3471 return nullptr;
3473 // Shift can't have additional users.
3474 if (!N0->hasOneUse())
3475 return nullptr;
3477 // Only supported for 32 and 64 bits.
3478 if (NVT != MVT::i32 && NVT != MVT::i64)
3479 return nullptr;
3481 // Shift amount and RHS of and must be constant.
3482 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3483 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3484 if (!MaskCst || !ShiftCst)
3485 return nullptr;
3487 // And RHS must be a mask.
3488 uint64_t Mask = MaskCst->getZExtValue();
3489 if (!isMask_64(Mask))
3490 return nullptr;
3492 uint64_t Shift = ShiftCst->getZExtValue();
3493 uint64_t MaskSize = countPopulation(Mask);
3495 // Don't interfere with something that can be handled by extracting AH.
3496 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3497 if (Shift == 8 && MaskSize == 8)
3498 return nullptr;
3500 // Make sure we are only using bits that were in the original value, not
3501 // shifted in.
3502 if (Shift + MaskSize > NVT.getSizeInBits())
3503 return nullptr;
3505 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
3506 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
3507 // does not fit into 32 bits. Load folding is not a sufficient reason.
3508 if (!PreferBEXTR && MaskSize <= 32)
3509 return nullptr;
3511 SDValue Control;
3512 unsigned ROpc, MOpc;
3514 if (!PreferBEXTR) {
3515 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
3516 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
3517 // Let's perform the mask first and apply the shift later. Note that we need to
3518 // widen the mask to account for the fact that we'll apply the shift afterwards!
3519 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
3520 ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
3521 MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
3522 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3523 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3524 } else {
3525 // The 'control' of BEXTR has the pattern of:
3526 // [15...8 bit][ 7...0 bit] location
3527 // [ bit count][ shift] name
3528 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
3529 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3530 if (Subtarget->hasTBM()) {
3531 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3532 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3533 } else {
3534 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
3535 // BMI requires the immediate to be placed in a register.
3536 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3537 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3538 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3539 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
3543 MachineSDNode *NewNode;
3544 SDValue Input = N0->getOperand(0);
3545 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3546 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3547 SDValue Ops[] = {
3548 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
3549 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3550 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3551 // Update the chain.
3552 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3553 // Record the mem-refs
3554 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3555 } else {
3556 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
3559 if (!PreferBEXTR) {
3560 // We still need to apply the shift.
3561 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
3562 unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
3563 NewNode =
3564 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
3567 return NewNode;
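// Illustration only: the control arithmetic used by matchBEXTRFromAndImm()
// above, for (x >> Shift) & ((1 << MaskSize) - 1):
//  * The BEXTR control packs the start bit in bits 7:0 and the length in bits
//    15:8, e.g. Shift = 4, MaskSize = 8 gives 0x0804.
//  * Without a usable BEXTR, BZHI keeps the low (Shift + MaskSize) bits and a
//    SHR by Shift is emitted afterwards.
// Minimal scalar sketch; the helpers are hypothetical and not used here.
static inline uint64_t sketchBextrControl(uint64_t Shift, uint64_t MaskSize) {
  return Shift | (MaskSize << 8);
}
static inline uint64_t sketchExtractViaBzhi(uint64_t X, unsigned Shift,
                                            unsigned MaskSize) {
  unsigned Width = Shift + MaskSize;          // bits kept by BZHI (<= 64 here)
  uint64_t Masked = Width >= 64 ? X : (X & ((1ull << Width) - 1));
  return Masked >> Shift;                     // the deferred SHR
}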
3570 // Emit a PCMPISTR(I/M) instruction.
3571 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3572 bool MayFoldLoad, const SDLoc &dl,
3573 MVT VT, SDNode *Node) {
3574 SDValue N0 = Node->getOperand(0);
3575 SDValue N1 = Node->getOperand(1);
3576 SDValue Imm = Node->getOperand(2);
3577 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3578 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3580 // Try to fold a load. No need to check alignment.
3581 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3582 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3583 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3584 N1.getOperand(0) };
3585 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3586 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3587 // Update the chain.
3588 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3589 // Record the mem-refs
3590 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3591 return CNode;
3594 SDValue Ops[] = { N0, N1, Imm };
3595 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3596 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3597 return CNode;
3600 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3601 // to emit a second instruction after this one. This is needed since we have two
3602 // copyToReg nodes glued before this and we need to continue that glue through.
3603 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3604 bool MayFoldLoad, const SDLoc &dl,
3605 MVT VT, SDNode *Node,
3606 SDValue &InFlag) {
3607 SDValue N0 = Node->getOperand(0);
3608 SDValue N2 = Node->getOperand(2);
3609 SDValue Imm = Node->getOperand(4);
3610 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3611 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3613 // Try to fold a load. No need to check alignment.
3614 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3615 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3616 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3617 N2.getOperand(0), InFlag };
3618 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3619 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3620 InFlag = SDValue(CNode, 3);
3621 // Update the chain.
3622 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3623 // Record the mem-refs
3624 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3625 return CNode;
3628 SDValue Ops[] = { N0, N2, Imm, InFlag };
3629 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3630 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3631 InFlag = SDValue(CNode, 2);
3632 return CNode;
3635 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3636 EVT VT = N->getValueType(0);
3638 // Only handle scalar shifts.
3639 if (VT.isVector())
3640 return false;
3642 // Narrower shifts only mask to 5 bits in hardware.
3643 unsigned Size = VT == MVT::i64 ? 64 : 32;
3645 SDValue OrigShiftAmt = N->getOperand(1);
3646 SDValue ShiftAmt = OrigShiftAmt;
3647 SDLoc DL(N);
3649 // Skip over a truncate of the shift amount.
3650 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3651 ShiftAmt = ShiftAmt->getOperand(0);
3653 // This function is called after X86DAGToDAGISel::matchBitExtract(),
3654 // so we do not have to worry about breaking a BZHI/BEXTR pattern.
3656 SDValue NewShiftAmt;
3657 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3658 SDValue Add0 = ShiftAmt->getOperand(0);
3659 SDValue Add1 = ShiftAmt->getOperand(1);
3660 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3661 // to avoid the ADD/SUB.
3662 if (isa<ConstantSDNode>(Add1) &&
3663 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3664 NewShiftAmt = Add0;
3665 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3666 // generate a NEG instead of a SUB of a constant.
3667 } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3668 isa<ConstantSDNode>(Add0) &&
3669 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3670 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3671 // Insert a negate op.
3672 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3673 // that uses it that's not a shift.
3674 EVT SubVT = ShiftAmt.getValueType();
3675 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3676 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3677 NewShiftAmt = Neg;
3679 // Insert these operands into a valid topological order so they can
3680 // get selected independently.
3681 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3682 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3683 } else
3684 return false;
3685 } else
3686 return false;
3688 if (NewShiftAmt.getValueType() != MVT::i8) {
3689 // Need to truncate the shift amount.
3690 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3691 // Add to a correct topological ordering.
3692 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3695 // Insert a new mask to keep the shift amount legal. This should be removed
3696 // by isel patterns.
3697 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3698 CurDAG->getConstant(Size - 1, DL, MVT::i8));
3699 // Place in a correct topological ordering.
3700 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3702 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3703 NewShiftAmt);
3704 if (UpdatedNode != N) {
3705 // If we found an existing node, we should replace ourselves with that node
3706 // and wait for it to be selected after its other users.
3707 ReplaceNode(N, UpdatedNode);
3708 return true;
3711 // If the original shift amount is now dead, delete it so that we don't run
3712 // it through isel.
3713 if (OrigShiftAmt.getNode()->use_empty())
3714 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3716 // Now that we've optimized the shift amount, defer to normal isel to get
3717 // load folding and legacy vs BMI2 selection without repeating it here.
3718 SelectCode(N);
3719 return true;
3722 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3723 MVT NVT = N->getSimpleValueType(0);
3724 unsigned Opcode = N->getOpcode();
3725 SDLoc dl(N);
3727 // For operations of the form (x << C1) op C2, check if we can use a smaller
3728 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
3729 SDValue Shift = N->getOperand(0);
3730 SDValue N1 = N->getOperand(1);
3732 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3733 if (!Cst)
3734 return false;
3736 int64_t Val = Cst->getSExtValue();
3738 // If we have an any_extend feeding the AND, look through it to see if there
3739 // is a shift behind it. But only if the AND doesn't use the extended bits.
3740 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3741 bool FoundAnyExtend = false;
3742 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3743 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3744 isUInt<32>(Val)) {
3745 FoundAnyExtend = true;
3746 Shift = Shift.getOperand(0);
3749 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3750 return false;
3752 // i8 is unshrinkable, i16 should be promoted to i32.
3753 if (NVT != MVT::i32 && NVT != MVT::i64)
3754 return false;
3756 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3757 if (!ShlCst)
3758 return false;
3760 uint64_t ShAmt = ShlCst->getZExtValue();
3762 // Make sure that we don't change the operation by removing bits.
3763 // This only matters for OR and XOR, AND is unaffected.
3764 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3765 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3766 return false;
3768 // Check the minimum bitwidth for the new constant.
3769 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3770 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3771 if (Opcode == ISD::AND) {
3772 // AND32ri is the same as AND64ri32 with zext imm.
3773 // Try this before sign extended immediates below.
3774 ShiftedVal = (uint64_t)Val >> ShAmt;
3775 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3776 return true;
3777 // Also swap order when the AND can become MOVZX.
3778 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3779 return true;
3781 ShiftedVal = Val >> ShAmt;
3782 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3783 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3784 return true;
3785 if (Opcode != ISD::AND) {
3786 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3787 ShiftedVal = (uint64_t)Val >> ShAmt;
3788 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3789 return true;
3791 return false;
3794 int64_t ShiftedVal;
3795 if (!CanShrinkImmediate(ShiftedVal))
3796 return false;
3798 // Ok, we can reorder to get a smaller immediate.
3800 // But, it's possible the original immediate allowed an AND to become MOVZX.
3801 // We do this check late to defer the MaskedValueIsZero call as long as
3802 // possible.
3803 if (Opcode == ISD::AND) {
3804 // Find the smallest zext this could possibly be.
3805 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3806 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3808 // Figure out which bits need to be zero to achieve that mask.
3809 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3810 ZExtWidth);
3811 NeededMask &= ~Cst->getAPIntValue();
3813 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3814 return false;
3817 SDValue X = Shift.getOperand(0);
3818 if (FoundAnyExtend) {
3819 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3820 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3821 X = NewX;
3824 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3825 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3826 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3827 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3828 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3829 Shift.getOperand(1));
3830 ReplaceNode(N, NewSHL.getNode());
3831 SelectCode(NewSHL.getNode());
3832 return true;
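// Illustration only: the rewrite performed by tryShrinkShlLogicImm() above,
//   (x << C1) op C2  -->  (x op (C2 >> C1)) << C1,
// shown for OR. It holds whenever the low C1 bits of C2 are zero (the
// RemovedBitsMask check above); AND and XOR behave analogously, and the
// shifted constant is often encodable as a smaller immediate. Minimal scalar
// sketch; the helpers are hypothetical and assume 0 < C1 < 64.
static inline uint64_t sketchOrImmOriginal(uint64_t X, unsigned C1,
                                           uint64_t C2) {
  return (X << C1) | C2;
}
static inline uint64_t sketchOrImmReassociated(uint64_t X, unsigned C1,
                                               uint64_t C2) {
  // Equal to sketchOrImmOriginal() when the low C1 bits of C2 are zero.
  return (X | (C2 >> C1)) << C1;
}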
3835 /// Convert vector increment or decrement to sub/add with an all-ones constant:
3836 /// add X, <1, 1...> --> sub X, <-1, -1...>
3837 /// sub X, <1, 1...> --> add X, <-1, -1...>
3838 /// The all-ones vector constant can be materialized using a pcmpeq instruction
3839 /// that is commonly recognized as an idiom (has no register dependency), so
3840 /// that's better/smaller than loading a splat 1 constant.
3841 bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
3842 assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
3843 "Unexpected opcode for increment/decrement transform");
3845 EVT VT = Node->getValueType(0);
3846 assert(VT.isVector() && "Should only be called for vectors.");
3848 SDValue X = Node->getOperand(0);
3849 SDValue OneVec = Node->getOperand(1);
3851 APInt SplatVal;
3852 if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
3853 return false;
3855 SDLoc DL(Node);
3856 SDValue OneConstant, AllOnesVec;
3858 APInt Ones = APInt::getAllOnesValue(32);
3859 assert(VT.getSizeInBits() % 32 == 0 &&
3860 "Expected bit count to be a multiple of 32");
3861 OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
3862 insertDAGNode(*CurDAG, X, OneConstant);
3864 unsigned NumElts = VT.getSizeInBits() / 32;
3865 assert(NumElts > 0 && "Expected to get non-empty vector.");
3866 AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
3867 DL, OneConstant);
3868 insertDAGNode(*CurDAG, X, AllOnesVec);
3870 AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
3871 insertDAGNode(*CurDAG, X, AllOnesVec);
3873 unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
3874 SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
3876 ReplaceNode(Node, NewNode.getNode());
3877 SelectCode(NewNode.getNode());
3878 return true;
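// Illustration only: the identity behind combineIncDecVector() above, shown
// per 32-bit lane. Adding 1 is the same as subtracting -1 (all-ones), and an
// all-ones vector is materialized cheaply with PCMPEQ:
//   add X, <1,1,...>  -->  sub X, <-1,-1,...>
//   sub X, <1,1,...>  -->  add X, <-1,-1,...>
// Minimal per-lane sketch (wrapping arithmetic, as in the DAG); the helpers
// are hypothetical.
static inline uint32_t sketchIncLane(uint32_t Lane) {
  return Lane - 0xFFFFFFFFu;                  // == Lane + 1 (mod 2^32)
}
static inline uint32_t sketchDecLane(uint32_t Lane) {
  return Lane + 0xFFFFFFFFu;                  // == Lane - 1 (mod 2^32)
}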
3881 /// If the high bits of an 'and' operand are known zero, try setting the
3882 /// high bits of an 'and' constant operand to produce a smaller encoding by
3883 /// creating a small, sign-extended negative immediate rather than a large
3884 /// positive one. This reverses a transform in SimplifyDemandedBits that
3885 /// shrinks mask constants by clearing bits. There is also a possibility that
3886 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3887 /// case, just replace the 'and'. Return 'true' if the node is replaced.
3888 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3889 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3890 // have immediate operands.
3891 MVT VT = And->getSimpleValueType(0);
3892 if (VT != MVT::i32 && VT != MVT::i64)
3893 return false;
3895 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3896 if (!And1C)
3897 return false;
3899 // Bail out if the mask constant is already negative. It can't shrink any further.
3900 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3901 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3902 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3903 // are negative too.
3904 APInt MaskVal = And1C->getAPIntValue();
3905 unsigned MaskLZ = MaskVal.countLeadingZeros();
3906 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3907 return false;
3909 // Don't extend into the upper 32 bits of a 64 bit mask.
3910 if (VT == MVT::i64 && MaskLZ >= 32) {
3911 MaskLZ -= 32;
3912 MaskVal = MaskVal.trunc(32);
3915 SDValue And0 = And->getOperand(0);
3916 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3917 APInt NegMaskVal = MaskVal | HighZeros;
3919 // If a negative constant would not allow a smaller encoding, there's no need
3920 // to continue. Only change the constant when we know it's a win.
3921 unsigned MinWidth = NegMaskVal.getMinSignedBits();
3922 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3923 return false;
3925 // Extend masks if we truncated above.
3926 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3927 NegMaskVal = NegMaskVal.zext(64);
3928 HighZeros = HighZeros.zext(64);
3931 // The variable operand must be all zeros in the top bits to allow using the
3932 // new, negative constant as the mask.
3933 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3934 return false;
3936 // Check if the mask is -1. In that case, this is an unnecessary instruction
3937 // that escaped earlier analysis.
3938 if (NegMaskVal.isAllOnesValue()) {
3939 ReplaceNode(And, And0.getNode());
3940 return true;
3943 // A negative mask allows a smaller encoding. Create a new 'and' node.
3944 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3945 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3946 ReplaceNode(And, NewAnd.getNode());
3947 SelectCode(NewAnd.getNode());
3948 return true;
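// Illustration only: an example (values chosen for this sketch) of the
// encoding win targeted by shrinkAndImmediate() above. For a 64-bit AND, a
// mask such as 0x0000FFFFFFFFFF00 is a large positive value and would need to
// be materialized into a register first. If the top 16 bits of the other
// operand are already known to be zero, the mask can be widened to
// 0xFFFFFFFFFFFFFF00 (i.e. -256), which encodes as a small sign-extended
// immediate and computes the same result. Hypothetical helper:
static inline uint64_t sketchShrunkAnd(uint64_t ValWithTop16BitsZero) {
  // Mirrors the MaskedValueIsZero(And0, HighZeros) check performed above.
  return ValWithTop16BitsZero & 0xFFFFFFFFFFFFFF00ull;
  // == ValWithTop16BitsZero & 0x0000FFFFFFFFFF00ull for such values.
}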
3951 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3952 bool FoldedBCast, bool Masked) {
3953 if (Masked) {
3954 if (FoldedLoad) {
3955 switch (TestVT.SimpleTy) {
3956 default: llvm_unreachable("Unexpected VT!");
3957 case MVT::v16i8:
3958 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3959 case MVT::v8i16:
3960 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3961 case MVT::v4i32:
3962 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3963 case MVT::v2i64:
3964 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3965 case MVT::v32i8:
3966 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3967 case MVT::v16i16:
3968 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3969 case MVT::v8i32:
3970 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3971 case MVT::v4i64:
3972 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3973 case MVT::v64i8:
3974 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3975 case MVT::v32i16:
3976 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3977 case MVT::v16i32:
3978 return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3979 case MVT::v8i64:
3980 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3984 if (FoldedBCast) {
3985 switch (TestVT.SimpleTy) {
3986 default: llvm_unreachable("Unexpected VT!");
3987 case MVT::v4i32:
3988 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3989 case MVT::v2i64:
3990 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3991 case MVT::v8i32:
3992 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3993 case MVT::v4i64:
3994 return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3995 case MVT::v16i32:
3996 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3997 case MVT::v8i64:
3998 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
4002 switch (TestVT.SimpleTy) {
4003 default: llvm_unreachable("Unexpected VT!");
4004 case MVT::v16i8:
4005 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
4006 case MVT::v8i16:
4007 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
4008 case MVT::v4i32:
4009 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
4010 case MVT::v2i64:
4011 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
4012 case MVT::v32i8:
4013 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
4014 case MVT::v16i16:
4015 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
4016 case MVT::v8i32:
4017 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
4018 case MVT::v4i64:
4019 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
4020 case MVT::v64i8:
4021 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
4022 case MVT::v32i16:
4023 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
4024 case MVT::v16i32:
4025 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
4026 case MVT::v8i64:
4027 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
4031 if (FoldedLoad) {
4032 switch (TestVT.SimpleTy) {
4033 default: llvm_unreachable("Unexpected VT!");
4034 case MVT::v16i8:
4035 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
4036 case MVT::v8i16:
4037 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
4038 case MVT::v4i32:
4039 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
4040 case MVT::v2i64:
4041 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
4042 case MVT::v32i8:
4043 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
4044 case MVT::v16i16:
4045 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
4046 case MVT::v8i32:
4047 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
4048 case MVT::v4i64:
4049 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
4050 case MVT::v64i8:
4051 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
4052 case MVT::v32i16:
4053 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
4054 case MVT::v16i32:
4055 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
4056 case MVT::v8i64:
4057 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
4061 if (FoldedBCast) {
4062 switch (TestVT.SimpleTy) {
4063 default: llvm_unreachable("Unexpected VT!");
4064 case MVT::v4i32:
4065 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
4066 case MVT::v2i64:
4067 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
4068 case MVT::v8i32:
4069 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
4070 case MVT::v4i64:
4071 return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
4072 case MVT::v16i32:
4073 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
4074 case MVT::v8i64:
4075 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
4079 switch (TestVT.SimpleTy) {
4080 default: llvm_unreachable("Unexpected VT!");
4081 case MVT::v16i8:
4082 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
4083 case MVT::v8i16:
4084 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
4085 case MVT::v4i32:
4086 return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
4087 case MVT::v2i64:
4088 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
4089 case MVT::v32i8:
4090 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
4091 case MVT::v16i16:
4092 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
4093 case MVT::v8i32:
4094 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
4095 case MVT::v4i64:
4096 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
4097 case MVT::v64i8:
4098 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
4099 case MVT::v32i16:
4100 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
4101 case MVT::v16i32:
4102 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
4103 case MVT::v8i64:
4104 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
4108 // Try to create a VPTESTM instruction. If InMask is not null, it will be used
4109 // to form a masked operation.
4110 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4111 SDValue InMask) {
4112 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4113 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4114 "Unexpected VT!");
4116 // Look for equal and not equal compares.
4117 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4118 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4119 return false;
4121 SDValue SetccOp0 = Setcc.getOperand(0);
4122 SDValue SetccOp1 = Setcc.getOperand(1);
4124 // Canonicalize the all zero vector to the RHS.
4125 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4126 std::swap(SetccOp0, SetccOp1);
4128 // See if we're comparing against zero.
4129 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4130 return false;
4132 SDValue N0 = SetccOp0;
4134 MVT CmpVT = N0.getSimpleValueType();
4135 MVT CmpSVT = CmpVT.getVectorElementType();
4137 // Start with both operands the same. We'll try to refine this.
4138 SDValue Src0 = N0;
4139 SDValue Src1 = N0;
4142 // Look through single use bitcasts.
4143 SDValue N0Temp = N0;
4144 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4145 N0Temp = N0.getOperand(0);
4147 // Look for single use AND.
4148 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4149 Src0 = N0Temp.getOperand(0);
4150 Src1 = N0Temp.getOperand(1);
4154 // Without VLX we need to widen the load.
4155 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4157 // We can only fold loads if the sources are unique.
4158 bool CanFoldLoads = Src0 != Src1;
4160 // Try to fold loads unless we need to widen.
4161 bool FoldedLoad = false;
4162 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4163 if (!Widen && CanFoldLoads) {
4164 Load = Src1;
4165 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4166 Tmp4);
4167 if (!FoldedLoad) {
4168 // AND is commutative.
4169 Load = Src0;
4170 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4171 Tmp3, Tmp4);
4172 if (FoldedLoad)
4173 std::swap(Src0, Src1);
4177 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4178 // Look through single use bitcasts.
4179 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
4180 Parent = Src.getNode();
4181 Src = Src.getOperand(0);
4184 if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
4185 auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
4186 if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
4187 return Src;
4190 return SDValue();
4193 // If we didn't fold a load, try to match a broadcast. There is no widening
4194 // limitation for this, but only 32- and 64-bit types are supported.
4195 bool FoldedBCast = false;
4196 if (!FoldedLoad && CanFoldLoads &&
4197 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4198 SDNode *ParentNode = N0.getNode();
4199 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4200 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4201 Tmp1, Tmp2, Tmp3, Tmp4);
4204 // Try the other operand.
4205 if (!FoldedBCast) {
4206 SDNode *ParentNode = N0.getNode();
4207 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4208 FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
4209 Tmp1, Tmp2, Tmp3, Tmp4);
4210 if (FoldedBCast)
4211 std::swap(Src0, Src1);
4216 auto getMaskRC = [](MVT MaskVT) {
4217 switch (MaskVT.SimpleTy) {
4218 default: llvm_unreachable("Unexpected VT!");
4219 case MVT::v2i1: return X86::VK2RegClassID;
4220 case MVT::v4i1: return X86::VK4RegClassID;
4221 case MVT::v8i1: return X86::VK8RegClassID;
4222 case MVT::v16i1: return X86::VK16RegClassID;
4223 case MVT::v32i1: return X86::VK32RegClassID;
4224 case MVT::v64i1: return X86::VK64RegClassID;
4228 bool IsMasked = InMask.getNode() != nullptr;
4230 SDLoc dl(Root);
4232 MVT ResVT = Setcc.getSimpleValueType();
4233 MVT MaskVT = ResVT;
4234 if (Widen) {
4235 // Widen the inputs using insert_subreg or copy_to_regclass.
4236 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4237 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4238 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4239 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4240 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4241 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4242 CmpVT), 0);
4243 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4245 assert(!FoldedLoad && "Shouldn't have folded the load");
4246 if (!FoldedBCast)
4247 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4249 if (IsMasked) {
4250 // Widen the mask.
4251 unsigned RegClass = getMaskRC(MaskVT);
4252 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4253 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4254 dl, MaskVT, InMask, RC), 0);
4258 bool IsTestN = CC == ISD::SETEQ;
4259 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4260 IsMasked);
4262 MachineSDNode *CNode;
4263 if (FoldedLoad || FoldedBCast) {
4264 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4266 if (IsMasked) {
4267 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4268 Load.getOperand(0) };
4269 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4270 } else {
4271 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4272 Load.getOperand(0) };
4273 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4276 // Update the chain.
4277 ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4278 // Record the mem-refs
4279 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
4280 } else {
4281 if (IsMasked)
4282 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4283 else
4284 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4287 // If we widened, we need to shrink the mask VT.
4288 if (Widen) {
4289 unsigned RegClass = getMaskRC(ResVT);
4290 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4291 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4292 dl, ResVT, SDValue(CNode, 0), RC);
4295 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4296 CurDAG->RemoveDeadNode(Root);
4297 return true;
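// Illustration only: the per-element semantics that tryVPTESTM() above
// matches. A vector compare of (X & Y) against zero becomes VPTESTM/VPTESTNM,
// which write one mask bit per element:
//   VPTESTM:  k[i] = ((X[i] & Y[i]) != 0)   <- from SETNE against zero
//   VPTESTNM: k[i] = ((X[i] & Y[i]) == 0)   <- from SETEQ against zero
// Minimal scalar sketch of one lane; the helper is hypothetical.
static inline bool sketchVptestmLane(uint64_t XLane, uint64_t YLane,
                                     bool IsTestN) {
  bool AnyBitSet = (XLane & YLane) != 0;
  return IsTestN ? !AnyBitSet : AnyBitSet;
}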
4300 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
4301 // into vpternlog.
4302 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
4303 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
4305 MVT NVT = N->getSimpleValueType(0);
4307 // Make sure we support VPTERNLOG.
4308 if (!NVT.isVector() || !Subtarget->hasAVX512())
4309 return false;
4311 // We need VLX for 128/256-bit.
4312 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4313 return false;
4315 SDValue N0 = N->getOperand(0);
4316 SDValue N1 = N->getOperand(1);
4318 // Canonicalize AND to LHS.
4319 if (N1.getOpcode() == ISD::AND)
4320 std::swap(N0, N1);
4322 if (N0.getOpcode() != ISD::AND ||
4323 N1.getOpcode() != X86ISD::ANDNP ||
4324 !N0.hasOneUse() || !N1.hasOneUse())
4325 return false;
4327 // ANDN is not commutable, so use it to pin down A and C.
4328 SDValue A = N1.getOperand(0);
4329 SDValue C = N1.getOperand(1);
4331 // AND is commutable, if one operand matches A, the other operand is B.
4332 // Otherwise this isn't a match.
4333 SDValue B;
4334 if (N0.getOperand(0) == A)
4335 B = N0.getOperand(1);
4336 else if (N0.getOperand(1) == A)
4337 B = N0.getOperand(0);
4338 else
4339 return false;
4341 SDLoc dl(N);
4342 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
4343 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
4344 ReplaceNode(N, Ternlog.getNode());
4345 SelectCode(Ternlog.getNode());
4346 return true;
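// Illustration only: why the VPTERNLOG immediate above is 0xCA. The bitselect
// (or (and A, B), (andn A, C)) picks B where A is 1 and C where A is 0.
// VPTERNLOG's immediate is the truth table indexed by the bit triple (A,B,C):
//   index (A,B,C): 111 110 101 100 011 010 001 000
//   result bit   :  1   1   0   0   1   0   1   0   ->  0b11001010 == 0xCA
// Minimal per-bit sketch; the helper is hypothetical.
static inline uint64_t sketchBitSelect(uint64_t A, uint64_t B, uint64_t C) {
  return (A & B) | (~A & C);   // what VPTERNLOG with imm 0xCA computes
}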
4349 void X86DAGToDAGISel::Select(SDNode *Node) {
4350 MVT NVT = Node->getSimpleValueType(0);
4351 unsigned Opcode = Node->getOpcode();
4352 SDLoc dl(Node);
4354 if (Node->isMachineOpcode()) {
4355 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4356 Node->setNodeId(-1);
4357 return; // Already selected.
4360 switch (Opcode) {
4361 default: break;
4362 case ISD::INTRINSIC_VOID: {
4363 unsigned IntNo = Node->getConstantOperandVal(1);
4364 switch (IntNo) {
4365 default: break;
4366 case Intrinsic::x86_sse3_monitor:
4367 case Intrinsic::x86_monitorx:
4368 case Intrinsic::x86_clzero: {
4369 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4371 unsigned Opc = 0;
4372 switch (IntNo) {
4373 default: llvm_unreachable("Unexpected intrinsic!");
4374 case Intrinsic::x86_sse3_monitor:
4375 if (!Subtarget->hasSSE3())
4376 break;
4377 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4378 break;
4379 case Intrinsic::x86_monitorx:
4380 if (!Subtarget->hasMWAITX())
4381 break;
4382 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4383 break;
4384 case Intrinsic::x86_clzero:
4385 if (!Subtarget->hasCLZERO())
4386 break;
4387 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4388 break;
4391 if (Opc) {
4392 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4393 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4394 Node->getOperand(2), SDValue());
4395 SDValue InFlag = Chain.getValue(1);
4397 if (IntNo == Intrinsic::x86_sse3_monitor ||
4398 IntNo == Intrinsic::x86_monitorx) {
4399 // Copy the other two operands to ECX and EDX.
4400 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4401 InFlag);
4402 InFlag = Chain.getValue(1);
4403 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4404 InFlag);
4405 InFlag = Chain.getValue(1);
4408 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4409 { Chain, InFlag});
4410 ReplaceNode(Node, CNode);
4411 return;
4416 break;
4418 case ISD::BRIND: {
4419 if (Subtarget->isTargetNaCl())
4420 // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
4421 // leave the instruction alone.
4422 break;
4423 if (Subtarget->isTarget64BitILP32()) {
4424 // Converts a 32-bit register to a 64-bit, zero-extended version of
4425 // it. This is needed because x86-64 can do many things, but jmp %r32
4426 // ain't one of them.
4427 const SDValue &Target = Node->getOperand(1);
4428 assert(Target.getSimpleValueType() == llvm::MVT::i32);
4429 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4430 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4431 Node->getOperand(0), ZextTarget);
4432 ReplaceNode(Node, Brind.getNode());
4433 SelectCode(ZextTarget.getNode());
4434 SelectCode(Brind.getNode());
4435 return;
4437 break;
4439 case X86ISD::GlobalBaseReg:
4440 ReplaceNode(Node, getGlobalBaseReg());
4441 return;
4443 case ISD::BITCAST:
4444 // Just drop all 128/256/512-bit bitcasts.
4445 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4446 NVT == MVT::f128) {
4447 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4448 CurDAG->RemoveDeadNode(Node);
4449 return;
4451 break;
4453 case ISD::VSELECT: {
4454 // Replace VSELECT with non-mask conditions with BLENDV.
4455 if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4456 break;
4458 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4459 SDValue Blendv = CurDAG->getNode(
4460 X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4461 Node->getOperand(1), Node->getOperand(2));
4462 ReplaceNode(Node, Blendv.getNode());
4463 SelectCode(Blendv.getNode());
4464 // We already called ReplaceUses.
4465 return;
4468 case ISD::SRL:
4469 if (matchBitExtract(Node))
4470 return;
4471 LLVM_FALLTHROUGH;
4472 case ISD::SRA:
4473 case ISD::SHL:
4474 if (tryShiftAmountMod(Node))
4475 return;
4476 break;
4478 case ISD::AND:
4479 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4480 // Try to form a masked VPTESTM. Operands can be in either order.
4481 SDValue N0 = Node->getOperand(0);
4482 SDValue N1 = Node->getOperand(1);
4483 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4484 tryVPTESTM(Node, N0, N1))
4485 return;
4486 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4487 tryVPTESTM(Node, N1, N0))
4488 return;
4491 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4492 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4493 CurDAG->RemoveDeadNode(Node);
4494 return;
4496 if (matchBitExtract(Node))
4497 return;
4498 if (AndImmShrink && shrinkAndImmediate(Node))
4499 return;
4501 LLVM_FALLTHROUGH;
4502 case ISD::OR:
4503 case ISD::XOR:
4504 if (tryShrinkShlLogicImm(Node))
4505 return;
4507 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
4508 return;
4510 LLVM_FALLTHROUGH;
4511 case ISD::ADD:
4512 case ISD::SUB: {
4513 if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
4514 combineIncDecVector(Node))
4515 return;
4517 // Try to avoid folding immediates with multiple uses for optsize.
4518 // This code tries to select to register form directly to avoid going
4519 // through the isel table which might fold the immediate. We can't change
4520 // the add/sub/and/or/xor-with-immediate patterns in the
4521 // tablegen files to check the immediate use count without making the patterns
4522 // unavailable to the fast-isel table.
4523 if (!OptForSize)
4524 break;
4526 // Only handle i8/i16/i32/i64.
4527 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4528 break;
4530 SDValue N0 = Node->getOperand(0);
4531 SDValue N1 = Node->getOperand(1);
4533 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4534 if (!Cst)
4535 break;
4537 int64_t Val = Cst->getSExtValue();
4539 // Make sure it's an immediate that is considered foldable.
4540 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4541 if (!isInt<8>(Val) && !isInt<32>(Val))
4542 break;
4544 // If this can match to INC/DEC, let it go.
4545 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
4546 break;
4548 // Check if we should avoid folding this immediate.
4549 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4550 break;
4552 // We should not fold the immediate. So we need a register form instead.
4553 unsigned ROpc, MOpc;
4554 switch (NVT.SimpleTy) {
4555 default: llvm_unreachable("Unexpected VT!");
4556 case MVT::i8:
4557 switch (Opcode) {
4558 default: llvm_unreachable("Unexpected opcode!");
4559 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4560 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4561 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4562 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
4563 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4565 break;
4566 case MVT::i16:
4567 switch (Opcode) {
4568 default: llvm_unreachable("Unexpected opcode!");
4569 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4570 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4571 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4572 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
4573 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4575 break;
4576 case MVT::i32:
4577 switch (Opcode) {
4578 default: llvm_unreachable("Unexpected opcode!");
4579 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4580 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4581 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4582 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
4583 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4585 break;
4586 case MVT::i64:
4587 switch (Opcode) {
4588 default: llvm_unreachable("Unexpected opcode!");
4589 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4590 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4591 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4592 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
4593 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4595 break;
4598 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
4600 // If this is not a subtract, we can still try to fold a load.
4601 if (Opcode != ISD::SUB) {
4602 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4603 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4604 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4605 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4606 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4607 // Update the chain.
4608 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4609 // Record the mem-refs
4610 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4611 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4612 CurDAG->RemoveDeadNode(Node);
4613 return;
4617 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4618 return;
4621 case X86ISD::SMUL:
4622 // i16/i32/i64 are handled with isel patterns.
4623 if (NVT != MVT::i8)
4624 break;
4625 LLVM_FALLTHROUGH;
4626 case X86ISD::UMUL: {
4627 SDValue N0 = Node->getOperand(0);
4628 SDValue N1 = Node->getOperand(1);
4630 unsigned LoReg, ROpc, MOpc;
4631 switch (NVT.SimpleTy) {
4632 default: llvm_unreachable("Unsupported VT!");
4633 case MVT::i8:
4634 LoReg = X86::AL;
4635 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4636 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4637 break;
4638 case MVT::i16:
4639 LoReg = X86::AX;
4640 ROpc = X86::MUL16r;
4641 MOpc = X86::MUL16m;
4642 break;
4643 case MVT::i32:
4644 LoReg = X86::EAX;
4645 ROpc = X86::MUL32r;
4646 MOpc = X86::MUL32m;
4647 break;
4648 case MVT::i64:
4649 LoReg = X86::RAX;
4650 ROpc = X86::MUL64r;
4651 MOpc = X86::MUL64m;
4652 break;
4655 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4656 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4657 // Multiply is commutative.
4658 if (!FoldedLoad) {
4659 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4660 if (FoldedLoad)
4661 std::swap(N0, N1);
4664 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4665 N0, SDValue()).getValue(1);
4667 MachineSDNode *CNode;
4668 if (FoldedLoad) {
4669 // i16/i32/i64 use an instruction that produces a low and high result even
4670 // though only the low result is used.
4671 SDVTList VTs;
4672 if (NVT == MVT::i8)
4673 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4674 else
4675 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4677 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4678 InFlag };
4679 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4681 // Update the chain.
4682 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4683 // Record the mem-refs
4684 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4685 } else {
4686 // i16/i32/i64 use an instruction that produces a low and high result even
4687 // though only the low result is used.
4688 SDVTList VTs;
4689 if (NVT == MVT::i8)
4690 VTs = CurDAG->getVTList(NVT, MVT::i32);
4691 else
4692 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4694 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4697 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4698 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4699 CurDAG->RemoveDeadNode(Node);
4700 return;
4703 case ISD::SMUL_LOHI:
4704 case ISD::UMUL_LOHI: {
4705 SDValue N0 = Node->getOperand(0);
4706 SDValue N1 = Node->getOperand(1);
4708 unsigned Opc, MOpc;
4709 bool isSigned = Opcode == ISD::SMUL_LOHI;
4710 if (!isSigned) {
4711 switch (NVT.SimpleTy) {
4712 default: llvm_unreachable("Unsupported VT!");
4713 case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4714 case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4716 } else {
4717 switch (NVT.SimpleTy) {
4718 default: llvm_unreachable("Unsupported VT!");
4719 case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4720 case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4724 unsigned SrcReg, LoReg, HiReg;
4725 switch (Opc) {
4726 default: llvm_unreachable("Unknown MUL opcode!");
4727 case X86::IMUL32r:
4728 case X86::MUL32r:
4729 SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4730 break;
4731 case X86::IMUL64r:
4732 case X86::MUL64r:
4733 SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4734 break;
4737 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4738 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4739 // Multiply is commutative.
4740 if (!foldedLoad) {
4741 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4742 if (foldedLoad)
4743 std::swap(N0, N1);
4746 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4747 N0, SDValue()).getValue(1);
4748 if (foldedLoad) {
4749 SDValue Chain;
4750 MachineSDNode *CNode = nullptr;
4751 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4752 InFlag };
4753 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4754 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4755 Chain = SDValue(CNode, 0);
4756 InFlag = SDValue(CNode, 1);
4758 // Update the chain.
4759 ReplaceUses(N1.getValue(1), Chain);
4760 // Record the mem-refs
4761 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4762 } else {
4763 SDValue Ops[] = { N1, InFlag };
4764 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4765 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4766 InFlag = SDValue(CNode, 0);
4769 // Copy the low half of the result, if it is needed.
4770 if (!SDValue(Node, 0).use_empty()) {
4771 assert(LoReg && "Register for low half is not defined!");
4772 SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4773 NVT, InFlag);
4774 InFlag = ResLo.getValue(2);
4775 ReplaceUses(SDValue(Node, 0), ResLo);
4776 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4777 dbgs() << '\n');
4779 // Copy the high half of the result, if it is needed.
4780 if (!SDValue(Node, 1).use_empty()) {
4781 assert(HiReg && "Register for high half is not defined!");
4782 SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4783 NVT, InFlag);
4784 InFlag = ResHi.getValue(2);
4785 ReplaceUses(SDValue(Node, 1), ResHi);
4786 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4787 dbgs() << '\n');
4790 CurDAG->RemoveDeadNode(Node);
4791 return;
4794 case ISD::SDIVREM:
4795 case ISD::UDIVREM: {
4796 SDValue N0 = Node->getOperand(0);
4797 SDValue N1 = Node->getOperand(1);
4799 unsigned Opc, MOpc;
4800 bool isSigned = Opcode == ISD::SDIVREM;
4801 if (!isSigned) {
4802 switch (NVT.SimpleTy) {
4803 default: llvm_unreachable("Unsupported VT!");
4804 case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4805 case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4806 case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4807 case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4809 } else {
4810 switch (NVT.SimpleTy) {
4811 default: llvm_unreachable("Unsupported VT!");
4812 case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4813 case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4814 case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4815 case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4819 unsigned LoReg, HiReg, ClrReg;
4820 unsigned SExtOpcode;
4821 switch (NVT.SimpleTy) {
4822 default: llvm_unreachable("Unsupported VT!");
4823 case MVT::i8:
4824 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4825 SExtOpcode = 0; // Not used.
4826 break;
4827 case MVT::i16:
4828 LoReg = X86::AX; HiReg = X86::DX;
4829 ClrReg = X86::DX;
4830 SExtOpcode = X86::CWD;
4831 break;
4832 case MVT::i32:
4833 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4834 SExtOpcode = X86::CDQ;
4835 break;
4836 case MVT::i64:
4837 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4838 SExtOpcode = X86::CQO;
4839 break;
4842 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4843 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4844 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4846 SDValue InFlag;
4847 if (NVT == MVT::i8) {
4848 // Special case for div8, just use a move with zero extension to AX to
4849 // clear the upper 8 bits (AH).
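// Illustrative sequence for an unsigned i8 divide (register names are just
// an example):
//   movzbw %cl, %ax     ; widen the dividend into AX (AH cleared)
//   divb   %bl          ; AX / BL  ->  AL = quotient, AH = remainder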
4850 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4851 MachineSDNode *Move;
4852 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4853 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4854 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
4855 : X86::MOVZX16rm8;
4856 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
4857 Chain = SDValue(Move, 1);
4858 ReplaceUses(N0.getValue(1), Chain);
4859 // Record the mem-refs
4860 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4861 } else {
4862 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
4863 : X86::MOVZX16rr8;
4864 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
4865 Chain = CurDAG->getEntryNode();
4867 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
4868 SDValue());
4869 InFlag = Chain.getValue(1);
4870 } else {
4871 InFlag =
4872 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4873 LoReg, N0, SDValue()).getValue(1);
4874 if (isSigned && !signBitIsZero) {
4875 // Sign extend the low part into the high part.
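// e.g. for i32 this emits CDQ ("cltd"), which sign-extends EAX into EDX
// ahead of the IDIV; CWD/CQO do the same for i16/i64.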
4876 InFlag =
4877 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4878 } else {
4879 // Zero out the high part, effectively zero extending the input.
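// e.g. an unsigned i32 divide becomes the usual
//   xorl %edx, %edx
//   divl %ecx
// idiom: MOV32r0 is the xor-zeroing pseudo, and the i16/i64 cases below
// just narrow or widen that 32-bit zero with subregister operations.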
4880 SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4881 switch (NVT.SimpleTy) {
4882 case MVT::i16:
4883 ClrNode =
4884 SDValue(CurDAG->getMachineNode(
4885 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4886 CurDAG->getTargetConstant(X86::sub_16bit, dl,
4887 MVT::i32)),
4889 break;
4890 case MVT::i32:
4891 break;
4892 case MVT::i64:
4893 ClrNode =
4894 SDValue(CurDAG->getMachineNode(
4895 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4896 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4897 CurDAG->getTargetConstant(X86::sub_32bit, dl,
4898 MVT::i32)),
4900 break;
4901 default:
4902 llvm_unreachable("Unexpected division source");
4905 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4906 ClrNode, InFlag).getValue(1);
4910 if (foldedLoad) {
4911 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4912 InFlag };
4913 MachineSDNode *CNode =
4914 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4915 InFlag = SDValue(CNode, 1);
4916 // Update the chain.
4917 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4918 // Record the mem-refs
4919 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4920 } else {
4921 InFlag =
4922 SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4925 // Prevent use of AH in a REX instruction by explicitly copying it to
4926 // an ABCD_L register.
4928 // The current assumption of the register allocator is that isel
4929 // won't generate explicit references to the GR8_ABCD_H registers. If
4930 // the allocator and/or the backend get enhanced to be more robust in
4931 // that regard, this can be, and should be, removed.
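// Background (general x86 encoding fact): once an instruction carries a REX
// prefix, the AH/BH/CH/DH encodings are reinterpreted as SPL/BPL/SIL/DIL,
// so AH is not addressable there; the _NOREX movzx/movsx forms keep this
// copy in register classes that never require a REX prefix.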
4932 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4933 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4934 unsigned AHExtOpcode =
4935 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4937 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4938 MVT::Glue, AHCopy, InFlag);
4939 SDValue Result(RNode, 0);
4940 InFlag = SDValue(RNode, 1);
4942 Result =
4943 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4945 ReplaceUses(SDValue(Node, 1), Result);
4946 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4947 dbgs() << '\n');
4949 // Copy the division (low) result, if it is needed.
4950 if (!SDValue(Node, 0).use_empty()) {
4951 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4952 LoReg, NVT, InFlag);
4953 InFlag = Result.getValue(2);
4954 ReplaceUses(SDValue(Node, 0), Result);
4955 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4956 dbgs() << '\n');
4958 // Copy the remainder (high) result, if it is needed.
4959 if (!SDValue(Node, 1).use_empty()) {
4960 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4961 HiReg, NVT, InFlag);
4962 InFlag = Result.getValue(2);
4963 ReplaceUses(SDValue(Node, 1), Result);
4964 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4965 dbgs() << '\n');
4967 CurDAG->RemoveDeadNode(Node);
4968 return;
4971 case X86ISD::CMP: {
4972 SDValue N0 = Node->getOperand(0);
4973 SDValue N1 = Node->getOperand(1);
4975 // Optimizations for TEST compares.
4976 if (!isNullConstant(N1))
4977 break;
4979 // Save the original VT of the compare.
4980 MVT CmpVT = N0.getSimpleValueType();
4982 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4983 // by a test instruction. The test should be removed later by
4984 // analyzeCompare if we are using only the zero flag.
4985 // TODO: Should we check the users and use the BEXTR flags directly?
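// Roughly, a pattern like
//   (X86cmp (and (srl X, 4), 0xFFF), 0)
// becomes a BEXTR extracting 12 bits starting at bit 4, followed by a
// TEST32rr/TEST64rr on the extracted value (example values only).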
4986 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4987 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4988 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4989 : X86::TEST32rr;
4990 SDValue BEXTR = SDValue(NewNode, 0);
4991 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4992 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4993 CurDAG->RemoveDeadNode(Node);
4994 return;
4998 // We can peek through truncates, but we need to be careful below.
4999 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
5000 N0 = N0.getOperand(0);
5002 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
5003 // use a smaller encoding.
5004 // Look past the truncate if CMP is the only use of it.
5005 if (N0.getOpcode() == ISD::AND &&
5006 N0.getNode()->hasOneUse() &&
5007 N0.getValueType() != MVT::i8) {
5008 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5009 if (!C) break;
5010 uint64_t Mask = C->getZExtValue();
5012 // Check if we can replace AND+IMM64 with a shift. This is possible for
5013 // masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
5014 // flag.
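// e.g. (X & 0xFFFFFFFF00000000) == 0 cannot use a TEST with an immediate
// (there is no 64-bit immediate form), but it is equivalent to
// (X >> 32) == 0, so a SHR64ri followed by TEST64rr produces the same ZF.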
5015 if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
5016 onlyUsesZeroFlag(SDValue(Node, 0))) {
5017 if (isMask_64(~Mask)) {
5018 unsigned TrailingZeros = countTrailingZeros(Mask);
5019 SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
5020 SDValue Shift =
5021 SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
5022 N0.getOperand(0), Imm), 0);
5023 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5024 MVT::i32, Shift, Shift);
5025 ReplaceNode(Node, Test);
5026 return;
5028 if (isMask_64(Mask)) {
5029 unsigned LeadingZeros = countLeadingZeros(Mask);
5030 SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
5031 SDValue Shift =
5032 SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
5033 N0.getOperand(0), Imm), 0);
5034 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
5035 MVT::i32, Shift, Shift);
5036 ReplaceNode(Node, Test);
5037 return;
5041 MVT VT;
5042 int SubRegOp;
5043 unsigned ROpc, MOpc;
5045 // For each of these checks we need to be careful if the sign flag is
5046 // being used. It is only safe to use the sign flag in two conditions,
5047 // either the sign bit in the shrunken mask is zero or the final test
5048 // size is equal to the original compare size.
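// e.g. shrinking "testl $0x80, %eax" to "testb $0x80, %al" moves the mask
// bit into the sign position of the narrower test, so SF would change
// meaning; hence the extra conditions on each case below.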
5050 if (isUInt<8>(Mask) &&
5051 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
5052 hasNoSignFlagUses(SDValue(Node, 0)))) {
5053 // For example, convert "testl %eax, $8" to "testb %al, $8"
5054 VT = MVT::i8;
5055 SubRegOp = X86::sub_8bit;
5056 ROpc = X86::TEST8ri;
5057 MOpc = X86::TEST8mi;
5058 } else if (OptForMinSize && isUInt<16>(Mask) &&
5059 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
5060 hasNoSignFlagUses(SDValue(Node, 0)))) {
5061 // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
5062 // NOTE: We only want to form TESTW instructions if optimizing for
5063 // min size. Otherwise we only save one byte and possibly get a length-
5064 // changing prefix penalty in the decoders.
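// (TEST16ri carries a 0x66 operand-size prefix together with a 16-bit
// immediate, the classic length-changing-prefix stall pattern on many
// Intel front ends.)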
5065 VT = MVT::i16;
5066 SubRegOp = X86::sub_16bit;
5067 ROpc = X86::TEST16ri;
5068 MOpc = X86::TEST16mi;
5069 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
5070 ((!(Mask & 0x80000000) &&
5071 // Without minsize 16-bit Cmps can get here so we need to
5072 // be sure we calculate the correct sign flag if needed.
5073 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
5074 CmpVT == MVT::i32 ||
5075 hasNoSignFlagUses(SDValue(Node, 0)))) {
5076 // For example, convert "testq %rax, $268468232" to "testl %eax, $268468232".
5077 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
5078 // Otherwise, we find ourselves in a position where we have to do
5079 // promotion. If previous passes did not promote the and, we assume
5080 // they had a good reason not to and do not promote here.
5081 VT = MVT::i32;
5082 SubRegOp = X86::sub_32bit;
5083 ROpc = X86::TEST32ri;
5084 MOpc = X86::TEST32mi;
5085 } else {
5086 // No eligible transformation was found.
5087 break;
5090 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
5091 SDValue Reg = N0.getOperand(0);
5093 // Emit a testl or testw.
5094 MachineSDNode *NewNode;
5095 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5096 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5097 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
5098 Reg.getOperand(0) };
5099 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
5100 // Update the chain.
5101 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
5102 // Record the mem-refs
5103 CurDAG->setNodeMemRefs(NewNode,
5104 {cast<LoadSDNode>(Reg)->getMemOperand()});
5105 } else {
5106 // Extract the subregister if necessary.
5107 if (N0.getValueType() != VT)
5108 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
5110 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
5112 // Replace CMP with TEST.
5113 ReplaceNode(Node, NewNode);
5114 return;
5116 break;
5118 case X86ISD::PCMPISTR: {
5119 if (!Subtarget->hasSSE42())
5120 break;
5122 bool NeedIndex = !SDValue(Node, 0).use_empty();
5123 bool NeedMask = !SDValue(Node, 1).use_empty();
5124 // We can't fold a load if we are going to make two instructions.
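// (Both the index and the mask form would want the same memory operand, and
// folding it into two instructions would duplicate the load, so keep it as
// a register operand when both results are live.)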
5125 bool MayFoldLoad = !NeedIndex || !NeedMask;
5127 MachineSDNode *CNode;
5128 if (NeedMask) {
5129 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
5130 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
5131 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
5132 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5134 if (NeedIndex || !NeedMask) {
5135 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
5136 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
5137 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
5138 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5141 // Connect the flag usage to the last instruction created.
5142 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5143 CurDAG->RemoveDeadNode(Node);
5144 return;
5146 case X86ISD::PCMPESTR: {
5147 if (!Subtarget->hasSSE42())
5148 break;
5150 // Copy the two implicit register inputs.
5151 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
5152 Node->getOperand(1),
5153 SDValue()).getValue(1);
5154 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
5155 Node->getOperand(3), InFlag).getValue(1);
5157 bool NeedIndex = !SDValue(Node, 0).use_empty();
5158 bool NeedMask = !SDValue(Node, 1).use_empty();
5159 // We can't fold a load if we are going to make two instructions.
5160 bool MayFoldLoad = !NeedIndex || !NeedMask;
5162 MachineSDNode *CNode;
5163 if (NeedMask) {
5164 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
5165 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
5166 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
5167 InFlag);
5168 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
5170 if (NeedIndex || !NeedMask) {
5171 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
5172 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
5173 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
5174 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5176 // Connect the flag usage to the last instruction created.
5177 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
5178 CurDAG->RemoveDeadNode(Node);
5179 return;
5182 case ISD::SETCC: {
5183 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5184 return;
5186 break;
5189 case ISD::STORE:
5190 if (foldLoadStoreIntoMemOperand(Node))
5191 return;
5192 break;
5193 case ISD::FCEIL:
5194 case ISD::FFLOOR:
5195 case ISD::FTRUNC:
5196 case ISD::FNEARBYINT:
5197 case ISD::FRINT: {
5198 // Replace fp rounding ops with their X86-specific equivalents so we don't
5199 // need 2 sets of patterns.
5200 // FIXME: This can only happen when the nodes started as STRICT_* and have
5201 // been mutated into their non-STRICT equivalents. Eventually this
5202 // mutation will be removed and we should switch the STRICT_ nodes to a
5203 // strict version of RNDSCALE in PreProcessISelDAG.
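// The immediates below follow the (V)ROUND/VRNDSCALE encoding: bits [1:0]
// select the rounding mode (00 nearest, 01 down, 10 up, 11 truncate),
// bit 2 says "use MXCSR.RC instead", and bit 3 suppresses the precision
// exception. That is why FRINT (0x4) may raise inexact while
// FNEARBYINT (0xC) does not.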
5204 unsigned Imm;
5205 switch (Node->getOpcode()) {
5206 default: llvm_unreachable("Unexpected opcode!");
5207 case ISD::FCEIL: Imm = 0xA; break;
5208 case ISD::FFLOOR: Imm = 0x9; break;
5209 case ISD::FTRUNC: Imm = 0xB; break;
5210 case ISD::FNEARBYINT: Imm = 0xC; break;
5211 case ISD::FRINT: Imm = 0x4; break;
5213 SDLoc dl(Node);
5214 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
5215 Node->getOperand(0),
5216 CurDAG->getTargetConstant(Imm, dl, MVT::i8));
5217 ReplaceNode(Node, Res.getNode());
5218 SelectCode(Res.getNode());
5219 return;
5223 SelectCode(Node);
5226 bool X86DAGToDAGISel::
5227 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5228 std::vector<SDValue> &OutOps) {
5229 SDValue Op0, Op1, Op2, Op3, Op4;
5230 switch (ConstraintID) {
5231 default:
5232 llvm_unreachable("Unexpected asm memory constraint");
5233 case InlineAsm::Constraint_i:
5234 // FIXME: It seems strange that 'i' is needed here since it's supposed to
5235 // be an immediate and not a memory constraint.
5236 LLVM_FALLTHROUGH;
5237 case InlineAsm::Constraint_o: // offsetable ??
5238 case InlineAsm::Constraint_v: // not offsetable ??
5239 case InlineAsm::Constraint_m: // memory
5240 case InlineAsm::Constraint_X:
5241 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5242 return true;
5243 break;
5246 OutOps.push_back(Op0);
5247 OutOps.push_back(Op1);
5248 OutOps.push_back(Op2);
5249 OutOps.push_back(Op3);
5250 OutOps.push_back(Op4);
5251 return false;
5254 /// This pass converts a legalized DAG into a X86-specific DAG,
5255 /// ready for instruction scheduling.
5256 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5257 CodeGenOpt::Level OptLevel) {
5258 return new X86DAGToDAGISel(TM, OptLevel);