1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized dag to an X86 dag.
12 //===----------------------------------------------------------------------===//
14 #include "X86.h"
15 #include "X86MachineFunctionInfo.h"
16 #include "X86RegisterInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/SelectionDAGISel.h"
23 #include "llvm/Config/llvm-config.h"
24 #include "llvm/IR/ConstantRange.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/IR/Instructions.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/KnownBits.h"
32 #include "llvm/Support/MathExtras.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Target/TargetMachine.h"
35 #include "llvm/Target/TargetOptions.h"
36 #include <stdint.h>
37 using namespace llvm;
39 #define DEBUG_TYPE "x86-isel"
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
47 //===----------------------------------------------------------------------===//
48 // Pattern Matcher Implementation
49 //===----------------------------------------------------------------------===//
51 namespace {
52 /// This corresponds to X86AddressMode, but uses SDValues instead of register
53 /// numbers for the leaves of the matched tree.
54 struct X86ISelAddressMode {
55 enum {
56 RegBase,
57 FrameIndexBase
58 } BaseType;
60 // This is really a union, discriminated by BaseType!
61 SDValue Base_Reg;
62 int Base_FrameIndex;
64 unsigned Scale;
65 SDValue IndexReg;
66 int32_t Disp;
67 SDValue Segment;
68 const GlobalValue *GV;
69 const Constant *CP;
70 const BlockAddress *BlockAddr;
71 const char *ES;
72 MCSymbol *MCSym;
73 int JT;
74 unsigned Align; // CP alignment.
75 unsigned char SymbolFlags; // X86II::MO_*
76 bool NegateIndex = false;
78 X86ISelAddressMode()
79 : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80 Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81 MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
83 bool hasSymbolicDisplacement() const {
84 return GV != nullptr || CP != nullptr || ES != nullptr ||
85 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
88 bool hasBaseOrIndexReg() const {
89 return BaseType == FrameIndexBase ||
90 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
93 /// Return true if this addressing mode is already RIP-relative.
94 bool isRIPRelative() const {
95 if (BaseType != RegBase) return false;
96 if (RegisterSDNode *RegNode =
97 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98 return RegNode->getReg() == X86::RIP;
99 return false;
102 void setBaseReg(SDValue Reg) {
103 BaseType = RegBase;
104 Base_Reg = Reg;
107 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108 void dump(SelectionDAG *DAG = nullptr) {
109 dbgs() << "X86ISelAddressMode " << this << '\n';
110 dbgs() << "Base_Reg ";
111 if (Base_Reg.getNode())
112 Base_Reg.getNode()->dump(DAG);
113 else
114 dbgs() << "nul\n";
115 if (BaseType == FrameIndexBase)
116 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117 dbgs() << " Scale " << Scale << '\n'
118 << "IndexReg ";
119 if (NegateIndex)
120 dbgs() << "negate ";
121 if (IndexReg.getNode())
122 IndexReg.getNode()->dump(DAG);
123 else
124 dbgs() << "nul\n";
125 dbgs() << " Disp " << Disp << '\n'
126 << "GV ";
127 if (GV)
128 GV->dump();
129 else
130 dbgs() << "nul";
131 dbgs() << " CP ";
132 if (CP)
133 CP->dump();
134 else
135 dbgs() << "nul";
136 dbgs() << '\n'
137 << "ES ";
138 if (ES)
139 dbgs() << ES;
140 else
141 dbgs() << "nul";
142 dbgs() << " MCSym ";
143 if (MCSym)
144 dbgs() << MCSym;
145 else
146 dbgs() << "nul";
147 dbgs() << " JT" << JT << " Align" << Align << '\n';
149 #endif
153 namespace {
154 //===--------------------------------------------------------------------===//
155 /// ISel - X86-specific code to select X86 machine instructions for
156 /// SelectionDAG operations.
158 class X86DAGToDAGISel final : public SelectionDAGISel {
159 /// Keep a pointer to the X86Subtarget around so that we can
160 /// make the right decision when generating code for different targets.
161 const X86Subtarget *Subtarget;
163 /// If true, selector should try to optimize for code size instead of
164 /// performance.
165 bool OptForSize;
167 /// If true, selector should try to optimize for minimum code size.
168 bool OptForMinSize;
170 /// Disable direct TLS access through segment registers.
171 bool IndirectTlsSegRefs;
173 public:
174 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
175 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
176 OptForMinSize(false), IndirectTlsSegRefs(false) {}
178 StringRef getPassName() const override {
179 return "X86 DAG->DAG Instruction Selection";
182 bool runOnMachineFunction(MachineFunction &MF) override {
183 // Reset the subtarget each time through.
184 Subtarget = &MF.getSubtarget<X86Subtarget>();
185 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
186 "indirect-tls-seg-refs");
188 // OptFor[Min]Size are used in pattern predicates that isel is matching.
189 OptForSize = MF.getFunction().hasOptSize();
190 OptForMinSize = MF.getFunction().hasMinSize();
191 assert((!OptForMinSize || OptForSize) &&
192 "OptForMinSize implies OptForSize");
194 SelectionDAGISel::runOnMachineFunction(MF);
195 return true;
198 void EmitFunctionEntryCode() override;
200 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
202 void PreprocessISelDAG() override;
203 void PostprocessISelDAG() override;
205 // Include the pieces autogenerated from the target description.
206 #include "X86GenDAGISel.inc"
208 private:
209 void Select(SDNode *N) override;
211 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
212 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
213 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
214 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
215 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
216 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
217 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218 unsigned Depth);
219 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221 SDValue &Scale, SDValue &Index, SDValue &Disp,
222 SDValue &Segment);
223 bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
224 SDValue &Scale, SDValue &Index, SDValue &Disp,
225 SDValue &Segment);
226 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227 bool selectLEAAddr(SDValue N, SDValue &Base,
228 SDValue &Scale, SDValue &Index, SDValue &Disp,
229 SDValue &Segment);
230 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231 SDValue &Scale, SDValue &Index, SDValue &Disp,
232 SDValue &Segment);
233 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234 SDValue &Scale, SDValue &Index, SDValue &Disp,
235 SDValue &Segment);
236 bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
237 SDValue &Base, SDValue &Scale,
238 SDValue &Index, SDValue &Disp,
239 SDValue &Segment,
240 SDValue &NodeWithChain);
241 bool selectRelocImm(SDValue N, SDValue &Op);
243 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment);
248 // Convenience method where P is also root.
249 bool tryFoldLoad(SDNode *P, SDValue N,
250 SDValue &Base, SDValue &Scale,
251 SDValue &Index, SDValue &Disp,
252 SDValue &Segment) {
253 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
256 /// Implement addressing mode selection for inline asm expressions.
257 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
258 unsigned ConstraintID,
259 std::vector<SDValue> &OutOps) override;
261 void emitSpecialCodeForMain();
263 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
264 MVT VT, SDValue &Base, SDValue &Scale,
265 SDValue &Index, SDValue &Disp,
266 SDValue &Segment) {
267 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
268 Base = CurDAG->getTargetFrameIndex(
269 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
270 else if (AM.Base_Reg.getNode())
271 Base = AM.Base_Reg;
272 else
273 Base = CurDAG->getRegister(0, VT);
275 Scale = getI8Imm(AM.Scale, DL);
277 // Negate the index if needed.
278 if (AM.NegateIndex) {
279 unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
280 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
281 AM.IndexReg), 0);
282 AM.IndexReg = Neg;
285 if (AM.IndexReg.getNode())
286 Index = AM.IndexReg;
287 else
288 Index = CurDAG->getRegister(0, VT);
290 // These are 32-bit even in 64-bit mode since RIP-relative offset
291 // is 32-bit.
292 if (AM.GV)
293 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
294 MVT::i32, AM.Disp,
295 AM.SymbolFlags);
296 else if (AM.CP)
297 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
298 AM.Align, AM.Disp, AM.SymbolFlags);
299 else if (AM.ES) {
300 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
301 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
302 } else if (AM.MCSym) {
303 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
304 assert(AM.SymbolFlags == 0 && "oo");
305 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
306 } else if (AM.JT != -1) {
307 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
308 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
309 } else if (AM.BlockAddr)
310 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
311 AM.SymbolFlags);
312 else
313 Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
315 if (AM.Segment.getNode())
316 Segment = AM.Segment;
317 else
318 Segment = CurDAG->getRegister(0, MVT::i16);
321 // Utility function to determine whether we should avoid selecting
322 // immediate forms of instructions for better code size or not.
323 // At a high level, we'd like to avoid such instructions when
324 // we have similar constants used within the same basic block
325 // that can be kept in a register.
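// For example, if the same 32-bit constant feeds several ALU instructions in
// one basic block, materializing it once in a register and reusing it is
// smaller than re-encoding the immediate in each instruction; the helper
// below counts such uses and recommends hoisting when there is more than one.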
327 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
328 uint32_t UseCount = 0;
330 // Do not want to hoist if we're not optimizing for size.
331 // TODO: We'd like to remove this restriction.
332 // See the comment in X86InstrInfo.td for more info.
333 if (!OptForSize)
334 return false;
336 // Walk all the users of the immediate.
337 for (SDNode::use_iterator UI = N->use_begin(),
338 UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
340 SDNode *User = *UI;
342 // This user is already selected. Count it as a legitimate use and
343 // move on.
344 if (User->isMachineOpcode()) {
345 UseCount++;
346 continue;
349 // We want to count stores of immediates as real uses.
350 if (User->getOpcode() == ISD::STORE &&
351 User->getOperand(1).getNode() == N) {
352 UseCount++;
353 continue;
356 // We don't currently match users that have > 2 operands (except
357 // for stores, which are handled above)
358 // Those instructions won't match in ISel, for now, and would
359 // be counted incorrectly.
360 // This may change in the future as we add additional instruction
361 // types.
362 if (User->getNumOperands() != 2)
363 continue;
365 // Immediates that are used for offsets as part of stack
366 // manipulation should be left alone. These are typically
367 // used to indicate SP offsets for argument passing and
368 // will get pulled into stores/pushes (implicitly).
369 if (User->getOpcode() == X86ISD::ADD ||
370 User->getOpcode() == ISD::ADD ||
371 User->getOpcode() == X86ISD::SUB ||
372 User->getOpcode() == ISD::SUB) {
374 // Find the other operand of the add/sub.
375 SDValue OtherOp = User->getOperand(0);
376 if (OtherOp.getNode() == N)
377 OtherOp = User->getOperand(1);
379 // Don't count if the other operand is SP.
380 RegisterSDNode *RegNode;
381 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
382 (RegNode = dyn_cast_or_null<RegisterSDNode>(
383 OtherOp->getOperand(1).getNode())))
384 if ((RegNode->getReg() == X86::ESP) ||
385 (RegNode->getReg() == X86::RSP))
386 continue;
389 // ... otherwise, count this and move on.
390 UseCount++;
393 // If we have more than 1 use, then recommend for hoisting.
394 return (UseCount > 1);
397 /// Return a target constant with the specified value of type i8.
398 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
399 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
402 /// Return a target constant with the specified value, of type i32.
403 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
404 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
407 /// Return a target constant with the specified value, of type i64.
408 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
409 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
412 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
413 const SDLoc &DL) {
414 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
415 uint64_t Index = N->getConstantOperandVal(1);
416 MVT VecVT = N->getOperand(0).getSimpleValueType();
417 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
420 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
421 const SDLoc &DL) {
422 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
423 uint64_t Index = N->getConstantOperandVal(2);
424 MVT VecVT = N->getSimpleValueType(0);
425 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
428 // Helper to detect unneeded and instructions on shift amounts. Called
429 // from PatFrags in tablegen.
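// For example, x86 shifts of 32-bit values only use the low 5 bits of the
// amount (6 bits for 64-bit shifts), so an AND whose mask has at least Width
// trailing ones, possibly together with bits already known to be zero, does
// not change the shift amount and can be dropped.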
430 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
431 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
432 const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
434 if (Val.countTrailingOnes() >= Width)
435 return true;
437 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
438 return Mask.countTrailingOnes() >= Width;
441 /// Return an SDNode that returns the value of the global base register.
442 /// Output instructions required to initialize the global base register,
443 /// if necessary.
444 SDNode *getGlobalBaseReg();
446 /// Return a reference to the TargetMachine, casted to the target-specific
447 /// type.
448 const X86TargetMachine &getTargetMachine() const {
449 return static_cast<const X86TargetMachine &>(TM);
452 /// Return a reference to the TargetInstrInfo, casted to the target-specific
453 /// type.
454 const X86InstrInfo *getInstrInfo() const {
455 return Subtarget->getInstrInfo();
458 /// Address-mode matching performs shift-of-and to and-of-shift
459 /// reassociation in order to expose more scaled addressing
460 /// opportunities.
461 bool ComplexPatternFuncMutatesDAG() const override {
462 return true;
465 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
467 /// Returns whether this is a relocatable immediate in the range
468 /// [-2^(Width-1) .. 2^(Width-1)-1].
469 template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
470 if (auto *CN = dyn_cast<ConstantSDNode>(N))
471 return isInt<Width>(CN->getSExtValue());
472 return isSExtAbsoluteSymbolRef(Width, N);
475 // Indicates we should prefer to use a non-temporal load for this load.
476 bool useNonTemporalLoad(LoadSDNode *N) const {
477 if (!N->isNonTemporal())
478 return false;
480 unsigned StoreSize = N->getMemoryVT().getStoreSize();
482 if (N->getAlignment() < StoreSize)
483 return false;
485 switch (StoreSize) {
486 default: llvm_unreachable("Unsupported store size");
487 case 4:
488 case 8:
489 return false;
490 case 16:
491 return Subtarget->hasSSE41();
492 case 32:
493 return Subtarget->hasAVX2();
494 case 64:
495 return Subtarget->hasAVX512();
499 bool foldLoadStoreIntoMemOperand(SDNode *Node);
500 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
501 bool matchBitExtract(SDNode *Node);
502 bool shrinkAndImmediate(SDNode *N);
503 bool isMaskZeroExtended(SDNode *N) const;
504 bool tryShiftAmountMod(SDNode *N);
505 bool tryShrinkShlLogicImm(SDNode *N);
506 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
508 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
509 const SDLoc &dl, MVT VT, SDNode *Node);
510 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
511 const SDLoc &dl, MVT VT, SDNode *Node,
512 SDValue &InFlag);
514 bool tryOptimizeRem8Extend(SDNode *N);
516 bool onlyUsesZeroFlag(SDValue Flags) const;
517 bool hasNoSignFlagUses(SDValue Flags) const;
518 bool hasNoCarryFlagUses(SDValue Flags) const;
523 // Returns true if this masked compare can be implemented legally with this
524 // type.
525 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
526 unsigned Opcode = N->getOpcode();
527 if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
528 Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
529 // We can get 256-bit 8 element types here without VLX being enabled. When
530 // this happens we will use 512-bit operations and the mask will not be
531 // zero extended.
532 EVT OpVT = N->getOperand(0).getValueType();
533 if (OpVT.is256BitVector() || OpVT.is128BitVector())
534 return Subtarget->hasVLX();
536 return true;
538 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
539 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
540 Opcode == X86ISD::FSETCCM_SAE)
541 return true;
543 return false;
546 // Returns true if we can assume the writer of the mask has zero extended it
547 // for us.
548 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
549 // If this is an AND, check if we have a compare on either side. As long as
550 // one side guarantees the mask is zero extended, the AND will preserve those
551 // zeros.
552 if (N->getOpcode() == ISD::AND)
553 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
554 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
556 return isLegalMaskCompare(N, Subtarget);
559 bool
560 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
561 if (OptLevel == CodeGenOpt::None) return false;
563 if (!N.hasOneUse())
564 return false;
566 if (N.getOpcode() != ISD::LOAD)
567 return true;
569 // Don't fold non-temporal loads if we have an instruction for them.
570 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
571 return false;
573 // If N is a load, do additional profitability checks.
574 if (U == Root) {
575 switch (U->getOpcode()) {
576 default: break;
577 case X86ISD::ADD:
578 case X86ISD::ADC:
579 case X86ISD::SUB:
580 case X86ISD::SBB:
581 case X86ISD::AND:
582 case X86ISD::XOR:
583 case X86ISD::OR:
584 case ISD::ADD:
585 case ISD::ADDCARRY:
586 case ISD::AND:
587 case ISD::OR:
588 case ISD::XOR: {
589 SDValue Op1 = U->getOperand(1);
591 // If the other operand is an 8-bit immediate we should fold the immediate
592 // instead. This reduces code size.
593 // e.g.
594 // movl 4(%esp), %eax
595 // addl $4, %eax
596 // vs.
597 // movl $4, %eax
598 // addl 4(%esp), %eax
599 // The former is 2 bytes shorter. In the case where the increment is 1,
600 // the saving can be 4 bytes (by using incl %eax).
601 if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
602 if (Imm->getAPIntValue().isSignedIntN(8))
603 return false;
605 // If this is a 64-bit AND with an immediate that fits in 32-bits,
606 // prefer using the smaller AND over folding the load. This is needed to
607 // make sure immediates created by shrinkAndImmediate are always folded.
608 // Ideally we would narrow the load during DAG combine and get the
609 // best of both worlds.
610 if (U->getOpcode() == ISD::AND &&
611 Imm->getAPIntValue().getBitWidth() == 64 &&
612 Imm->getAPIntValue().isIntN(32))
613 return false;
615 // If this is really a zext_inreg that can be represented with a movzx
616 // instruction, prefer that.
617 // TODO: We could shrink the load and fold if it is non-volatile.
618 if (U->getOpcode() == ISD::AND &&
619 (Imm->getAPIntValue() == UINT8_MAX ||
620 Imm->getAPIntValue() == UINT16_MAX ||
621 Imm->getAPIntValue() == UINT32_MAX))
622 return false;
624 // ADD/SUB can negate the immediate and use the opposite operation
625 // to fit 128 into a sign extended 8 bit immediate.
626 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
627 (-Imm->getAPIntValue()).isSignedIntN(8))
628 return false;
631 // If the other operand is a TLS address, we should fold it instead.
632 // This produces
633 // movl %gs:0, %eax
634 // leal i@NTPOFF(%eax), %eax
635 // instead of
636 // movl $i@NTPOFF, %eax
637 // addl %gs:0, %eax
638 // if the block also has an access to a second TLS address this will save
639 // a load.
640 // FIXME: This is probably also true for non-TLS addresses.
641 if (Op1.getOpcode() == X86ISD::Wrapper) {
642 SDValue Val = Op1.getOperand(0);
643 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
644 return false;
647 // Don't fold load if this matches the BTS/BTR/BTC patterns.
648 // BTS: (or X, (shl 1, n))
649 // BTR: (and X, (rotl -2, n))
650 // BTC: (xor X, (shl 1, n))
651 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
652 if (U->getOperand(0).getOpcode() == ISD::SHL &&
653 isOneConstant(U->getOperand(0).getOperand(0)))
654 return false;
656 if (U->getOperand(1).getOpcode() == ISD::SHL &&
657 isOneConstant(U->getOperand(1).getOperand(0)))
658 return false;
660 if (U->getOpcode() == ISD::AND) {
661 SDValue U0 = U->getOperand(0);
662 SDValue U1 = U->getOperand(1);
663 if (U0.getOpcode() == ISD::ROTL) {
664 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
665 if (C && C->getSExtValue() == -2)
666 return false;
669 if (U1.getOpcode() == ISD::ROTL) {
670 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
671 if (C && C->getSExtValue() == -2)
672 return false;
676 break;
678 case ISD::SHL:
679 case ISD::SRA:
680 case ISD::SRL:
681 // Don't fold a load into a shift by immediate. The BMI2 instructions
682 // support folding a load, but not an immediate. The legacy instructions
683 // support folding an immediate, but can't fold a load. Folding an
684 // immediate is preferable to folding a load.
685 if (isa<ConstantSDNode>(U->getOperand(1)))
686 return false;
688 break;
692 // Prevent folding a load if this can be implemented with an insert_subreg or
693 // a move that implicitly zeroes.
694 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
695 isNullConstant(Root->getOperand(2)) &&
696 (Root->getOperand(0).isUndef() ||
697 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
698 return false;
700 return true;
703 /// Replace the original chain operand of the call with
704 /// load's chain operand and move load below the call's chain operand.
705 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
706 SDValue Call, SDValue OrigChain) {
707 SmallVector<SDValue, 8> Ops;
708 SDValue Chain = OrigChain.getOperand(0);
709 if (Chain.getNode() == Load.getNode())
710 Ops.push_back(Load.getOperand(0));
711 else {
712 assert(Chain.getOpcode() == ISD::TokenFactor &&
713 "Unexpected chain operand");
714 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
715 if (Chain.getOperand(i).getNode() == Load.getNode())
716 Ops.push_back(Load.getOperand(0));
717 else
718 Ops.push_back(Chain.getOperand(i));
719 SDValue NewChain =
720 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
721 Ops.clear();
722 Ops.push_back(NewChain);
724 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
725 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
726 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
727 Load.getOperand(1), Load.getOperand(2));
729 Ops.clear();
730 Ops.push_back(SDValue(Load.getNode(), 1));
731 Ops.append(Call->op_begin() + 1, Call->op_end());
732 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
735 /// Return true if call address is a load and it can be
736 /// moved below CALLSEQ_START and the chains leading up to the call.
737 /// Return the CALLSEQ_START by reference as a second output.
738 /// In the case of a tail call, there isn't a callseq node between the call
739 /// chain and the load.
740 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
741 // The transformation is somewhat dangerous if the call's chain was glued to
742 // the call. After MoveBelowOrigChain the load is moved between the call and
743 // the chain, this can create a cycle if the load is not folded. So it is
744 // *really* important that we are sure the load will be folded.
745 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
746 return false;
747 LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
748 if (!LD ||
749 LD->isVolatile() ||
750 LD->getAddressingMode() != ISD::UNINDEXED ||
751 LD->getExtensionType() != ISD::NON_EXTLOAD)
752 return false;
754 // Now let's find the callseq_start.
755 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
756 if (!Chain.hasOneUse())
757 return false;
758 Chain = Chain.getOperand(0);
761 if (!Chain.getNumOperands())
762 return false;
763 // Since we are not checking for AA here, conservatively abort if the chain
764 // writes to memory. It's not safe to move the callee (a load) across a store.
765 if (isa<MemSDNode>(Chain.getNode()) &&
766 cast<MemSDNode>(Chain.getNode())->writeMem())
767 return false;
768 if (Chain.getOperand(0).getNode() == Callee.getNode())
769 return true;
770 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
771 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
772 Callee.getValue(1).hasOneUse())
773 return true;
774 return false;
777 void X86DAGToDAGISel::PreprocessISelDAG() {
778 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
779 E = CurDAG->allnodes_end(); I != E; ) {
780 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
782 // If this is a target specific AND node with no flag usages, turn it back
783 // into ISD::AND to enable test instruction matching.
784 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
785 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
786 N->getOperand(0), N->getOperand(1));
787 --I;
788 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
789 ++I;
790 CurDAG->DeleteNode(N);
791 continue;
794 switch (N->getOpcode()) {
795 case ISD::FP_TO_SINT:
796 case ISD::FP_TO_UINT: {
797 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
798 // don't need 2 sets of patterns.
799 if (!N->getSimpleValueType(0).isVector())
800 break;
802 unsigned NewOpc;
803 switch (N->getOpcode()) {
804 default: llvm_unreachable("Unexpected opcode!");
805 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
806 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
808 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
809 N->getOperand(0));
810 --I;
811 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
812 ++I;
813 CurDAG->DeleteNode(N);
814 continue;
816 case ISD::SHL:
817 case ISD::SRA:
818 case ISD::SRL: {
819 // Replace vector shifts with their X86 specific equivalent so we don't
820 // need 2 sets of patterns.
821 if (!N->getValueType(0).isVector())
822 break;
824 unsigned NewOpc;
825 switch (N->getOpcode()) {
826 default: llvm_unreachable("Unexpected opcode!");
827 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
828 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
829 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
831 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
832 N->getOperand(0), N->getOperand(1));
833 --I;
834 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
835 ++I;
836 CurDAG->DeleteNode(N);
837 continue;
839 case ISD::ANY_EXTEND:
840 case ISD::ANY_EXTEND_VECTOR_INREG: {
841 // Replace vector any extend with the zero extend equivalents so we don't
842 // need 2 sets of patterns. Ignore vXi1 extensions.
843 if (!N->getValueType(0).isVector() ||
844 N->getOperand(0).getScalarValueSizeInBits() == 1)
845 break;
847 unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
848 ? ISD::ZERO_EXTEND
849 : ISD::ZERO_EXTEND_VECTOR_INREG;
851 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
852 N->getOperand(0));
853 --I;
854 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
855 ++I;
856 CurDAG->DeleteNode(N);
857 continue;
859 case ISD::FCEIL:
860 case ISD::FFLOOR:
861 case ISD::FTRUNC:
862 case ISD::FNEARBYINT:
863 case ISD::FRINT: {
864 // Replace fp rounding with their X86 specific equivalent so we don't
865 // need 2 sets of patterns.
866 unsigned Imm;
867 switch (N->getOpcode()) {
868 default: llvm_unreachable("Unexpected opcode!");
869 case ISD::FCEIL: Imm = 0xA; break;
870 case ISD::FFLOOR: Imm = 0x9; break;
871 case ISD::FTRUNC: Imm = 0xB; break;
872 case ISD::FNEARBYINT: Imm = 0xC; break;
873 case ISD::FRINT: Imm = 0x4; break;
875 SDLoc dl(N);
876 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
877 N->getValueType(0),
878 N->getOperand(0),
879 CurDAG->getConstant(Imm, dl, MVT::i8));
880 --I;
881 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
882 ++I;
883 CurDAG->DeleteNode(N);
884 continue;
886 case X86ISD::FANDN:
887 case X86ISD::FAND:
888 case X86ISD::FOR:
889 case X86ISD::FXOR: {
890 // Widen scalar fp logic ops to vector to reduce isel patterns.
891 // FIXME: Can we do this during lowering/combine.
892 MVT VT = N->getSimpleValueType(0);
893 if (VT.isVector() || VT == MVT::f128)
894 break;
896 MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
897 SDLoc dl(N);
898 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
899 N->getOperand(0));
900 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
901 N->getOperand(1));
903 SDValue Res;
904 if (Subtarget->hasSSE2()) {
905 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
906 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
907 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
908 unsigned Opc;
909 switch (N->getOpcode()) {
910 default: llvm_unreachable("Unexpected opcode!");
911 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
912 case X86ISD::FAND: Opc = ISD::AND; break;
913 case X86ISD::FOR: Opc = ISD::OR; break;
914 case X86ISD::FXOR: Opc = ISD::XOR; break;
916 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
917 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
918 } else {
919 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
921 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
922 CurDAG->getIntPtrConstant(0, dl));
923 --I;
924 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
925 ++I;
926 CurDAG->DeleteNode(N);
927 continue;
931 if (OptLevel != CodeGenOpt::None &&
932 // Only do this when the target can fold the load into the call or
933 // jmp.
934 !Subtarget->useRetpolineIndirectCalls() &&
935 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
936 (N->getOpcode() == X86ISD::TC_RETURN &&
937 (Subtarget->is64Bit() ||
938 !getTargetMachine().isPositionIndependent())))) {
939 /// Also try moving call address load from outside callseq_start to just
940 /// before the call to allow it to be folded.
942 /// [Load chain]
943 /// ^
944 /// |
945 /// [Load]
946 /// ^ ^
947 /// | |
948 /// / \--
949 /// / |
950 ///[CALLSEQ_START] |
951 /// ^ |
952 /// | |
953 /// [LOAD/C2Reg] |
954 /// | |
955 /// \ /
956 /// \ /
957 /// [CALL]
958 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
959 SDValue Chain = N->getOperand(0);
960 SDValue Load = N->getOperand(1);
961 if (!isCalleeLoad(Load, Chain, HasCallSeq))
962 continue;
963 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
964 ++NumLoadMoved;
965 continue;
968 // Lower fpround and fpextend nodes that target the FP stack to a store and
969 // load to the stack. This is a gross hack. We would like to simply mark
970 // these as being illegal, but when we do that, legalize produces these when
971 // it expands calls, then expands these in the same legalize pass. We would
972 // like dag combine to be able to hack on these between the call expansion
973 // and the node legalization. As such this pass basically does "really
974 // late" legalization of these inline with the X86 isel pass.
975 // FIXME: This should only happen when not compiled with -O0.
976 switch (N->getOpcode()) {
977 default: continue;
978 case ISD::FP_ROUND:
979 case ISD::FP_EXTEND:
981 MVT SrcVT = N->getOperand(0).getSimpleValueType();
982 MVT DstVT = N->getSimpleValueType(0);
984 // If any of the sources are vectors, no fp stack involved.
985 if (SrcVT.isVector() || DstVT.isVector())
986 continue;
988 // If the source and destination are SSE registers, then this is a legal
989 // conversion that should not be lowered.
990 const X86TargetLowering *X86Lowering =
991 static_cast<const X86TargetLowering *>(TLI);
992 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
993 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
994 if (SrcIsSSE && DstIsSSE)
995 continue;
997 if (!SrcIsSSE && !DstIsSSE) {
998 // If this is an FPStack extension, it is a noop.
999 if (N->getOpcode() == ISD::FP_EXTEND)
1000 continue;
1001 // If this is a value-preserving FPStack truncation, it is a noop.
1002 if (N->getConstantOperandVal(1))
1003 continue;
1006 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1007 // FPStack has extload and truncstore. SSE can fold direct loads into other
1008 // operations. Based on this, decide what we want to do.
1009 MVT MemVT;
1010 if (N->getOpcode() == ISD::FP_ROUND)
1011 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1012 else
1013 MemVT = SrcIsSSE ? SrcVT : DstVT;
1015 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1016 SDLoc dl(N);
1018 // FIXME: optimize the case where the src/dest is a load or store?
1020 SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1021 MemTmp, MachinePointerInfo(), MemVT);
1022 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1023 MachinePointerInfo(), MemVT);
1025 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1026 // extload we created. This will cause general havoc on the dag because
1027 // anything below the conversion could be folded into other existing nodes.
1028 // To avoid invalidating 'I', back it up to the convert node.
1029 --I;
1030 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1031 break;
1034 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1035 // dealing with the chain differently, as there is already a preexisting chain.
1036 case ISD::STRICT_FP_ROUND:
1037 case ISD::STRICT_FP_EXTEND:
1039 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1040 MVT DstVT = N->getSimpleValueType(0);
1042 // If any of the sources are vectors, no fp stack involved.
1043 if (SrcVT.isVector() || DstVT.isVector())
1044 continue;
1046 // If the source and destination are SSE registers, then this is a legal
1047 // conversion that should not be lowered.
1048 const X86TargetLowering *X86Lowering =
1049 static_cast<const X86TargetLowering *>(TLI);
1050 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1051 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1052 if (SrcIsSSE && DstIsSSE)
1053 continue;
1055 if (!SrcIsSSE && !DstIsSSE) {
1056 // If this is an FPStack extension, it is a noop.
1057 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1058 continue;
1059 // If this is a value-preserving FPStack truncation, it is a noop.
1060 if (N->getConstantOperandVal(2))
1061 continue;
1064 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1065 // FPStack has extload and truncstore. SSE can fold direct loads into other
1066 // operations. Based on this, decide what we want to do.
1067 MVT MemVT;
1068 if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1069 MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1070 else
1071 MemVT = SrcIsSSE ? SrcVT : DstVT;
1073 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1074 SDLoc dl(N);
1076 // FIXME: optimize the case where the src/dest is a load or store?
1078 // Since the operation is StrictFP, use the preexisting chain.
1079 SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1080 MemTmp, MachinePointerInfo(), MemVT);
1081 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1082 MachinePointerInfo(), MemVT);
1084 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1085 // extload we created. This will cause general havoc on the dag because
1086 // anything below the conversion could be folded into other existing nodes.
1087 // To avoid invalidating 'I', back it up to the convert node.
1088 --I;
1089 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1090 break;
1095 // Now that we did that, the node is dead. Increment the iterator to the
1096 // next node to process, then delete N.
1097 ++I;
1098 CurDAG->DeleteNode(N);
1101 // The load+call transform above can leave some dead nodes in the graph. Make
1102 // sure we remove them. It's possible some of the other transforms do too, so
1103 // just remove dead nodes unconditionally.
1104 CurDAG->RemoveDeadNodes();
1107 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
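// After an 8-bit div/rem the quotient/remainder sits in AL/AH, gets widened
// with a *_NOREX extend, is truncated back to 8 bits via EXTRACT_SUBREG, and
// is then extended again; the outer extend can simply reuse the inner one.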
1108 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1109 unsigned Opc = N->getMachineOpcode();
1110 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1111 Opc != X86::MOVSX64rr8)
1112 return false;
1114 SDValue N0 = N->getOperand(0);
1116 // We need to be extracting the lower bit of an extend.
1117 if (!N0.isMachineOpcode() ||
1118 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1119 N0.getConstantOperandVal(1) != X86::sub_8bit)
1120 return false;
1122 // We're looking for either a movsx or movzx to match the original opcode.
1123 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1124 : X86::MOVSX32rr8_NOREX;
1125 SDValue N00 = N0.getOperand(0);
1126 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1127 return false;
1129 if (Opc == X86::MOVSX64rr8) {
1130 // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1131 // to 64.
1132 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1133 MVT::i64, N00);
1134 ReplaceUses(N, Extend);
1135 } else {
1136 // Ok we can drop this extend and just use the original extend.
1137 ReplaceUses(N, N00.getNode());
1140 return true;
1143 void X86DAGToDAGISel::PostprocessISelDAG() {
1144 // Skip peepholes at -O0.
1145 if (TM.getOptLevel() == CodeGenOpt::None)
1146 return;
1148 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1150 bool MadeChange = false;
1151 while (Position != CurDAG->allnodes_begin()) {
1152 SDNode *N = &*--Position;
1153 // Skip dead nodes and any non-machine opcodes.
1154 if (N->use_empty() || !N->isMachineOpcode())
1155 continue;
1157 if (tryOptimizeRem8Extend(N)) {
1158 MadeChange = true;
1159 continue;
1162 // Look for a TESTrr+ANDrr pattern where both operands of the test are
1163 // the same. Rewrite to remove the AND.
1164 unsigned Opc = N->getMachineOpcode();
1165 if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
1166 Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
1167 N->getOperand(0) == N->getOperand(1) &&
1168 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1169 N->getOperand(0).isMachineOpcode()) {
1170 SDValue And = N->getOperand(0);
1171 unsigned N0Opc = And.getMachineOpcode();
1172 if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
1173 N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
1174 MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1175 MVT::i32,
1176 And.getOperand(0),
1177 And.getOperand(1));
1178 ReplaceUses(N, Test);
1179 MadeChange = true;
1180 continue;
1182 if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
1183 N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
1184 unsigned NewOpc;
1185 switch (N0Opc) {
1186 case X86::AND8rm: NewOpc = X86::TEST8mr; break;
1187 case X86::AND16rm: NewOpc = X86::TEST16mr; break;
1188 case X86::AND32rm: NewOpc = X86::TEST32mr; break;
1189 case X86::AND64rm: NewOpc = X86::TEST64mr; break;
1192 // Need to swap the memory and register operand.
1193 SDValue Ops[] = { And.getOperand(1),
1194 And.getOperand(2),
1195 And.getOperand(3),
1196 And.getOperand(4),
1197 And.getOperand(5),
1198 And.getOperand(0),
1199 And.getOperand(6) /* Chain */ };
1200 MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1201 MVT::i32, MVT::Other, Ops);
1202 ReplaceUses(N, Test);
1203 MadeChange = true;
1204 continue;
1208 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1209 // used. We're doing this late so we can prefer to fold the AND into masked
1210 // comparisons. Doing that can be better for the live range of the mask
1211 // register.
1212 if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
1213 Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
1214 N->getOperand(0) == N->getOperand(1) &&
1215 N->isOnlyUserOf(N->getOperand(0).getNode()) &&
1216 N->getOperand(0).isMachineOpcode() &&
1217 onlyUsesZeroFlag(SDValue(N, 0))) {
1218 SDValue And = N->getOperand(0);
1219 unsigned N0Opc = And.getMachineOpcode();
1220 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1221 // KAND instructions and KTEST use the same ISA feature.
1222 if (N0Opc == X86::KANDBrr ||
1223 (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
1224 N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
1225 unsigned NewOpc;
1226 switch (Opc) {
1227 default: llvm_unreachable("Unexpected opcode!");
1228 case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
1229 case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
1230 case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
1231 case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
1233 MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1234 MVT::i32,
1235 And.getOperand(0),
1236 And.getOperand(1));
1237 ReplaceUses(N, KTest);
1238 MadeChange = true;
1239 continue;
1243 // Attempt to remove vector moves that were inserted to zero upper bits.
1244 if (Opc != TargetOpcode::SUBREG_TO_REG)
1245 continue;
1247 unsigned SubRegIdx = N->getConstantOperandVal(2);
1248 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1249 continue;
1251 SDValue Move = N->getOperand(1);
1252 if (!Move.isMachineOpcode())
1253 continue;
1255 // Make sure its one of the move opcodes we recognize.
1256 switch (Move.getMachineOpcode()) {
1257 default:
1258 continue;
1259 case X86::VMOVAPDrr: case X86::VMOVUPDrr:
1260 case X86::VMOVAPSrr: case X86::VMOVUPSrr:
1261 case X86::VMOVDQArr: case X86::VMOVDQUrr:
1262 case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
1263 case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
1264 case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
1265 case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
1266 case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
1267 case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1268 case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1269 case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
1270 case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
1271 case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1272 case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1273 break;
1276 SDValue In = Move.getOperand(0);
1277 if (!In.isMachineOpcode() ||
1278 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1279 continue;
1281 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1282 // the SHA instructions which use a legacy encoding.
1283 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1284 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1285 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1286 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1287 continue;
1289 // The producing instruction is another vector instruction. We can drop the
1290 // move.
1291 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1292 MadeChange = true;
1295 if (MadeChange)
1296 CurDAG->RemoveDeadNodes();
1300 /// Emit any code that needs to be executed only in the main function.
1301 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1302 if (Subtarget->isTargetCygMing()) {
1303 TargetLowering::ArgListTy Args;
1304 auto &DL = CurDAG->getDataLayout();
1306 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1307 CLI.setChain(CurDAG->getRoot())
1308 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1309 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1310 std::move(Args));
1311 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1312 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1313 CurDAG->setRoot(Result.second);
1317 void X86DAGToDAGISel::EmitFunctionEntryCode() {
1318 // If this is main, emit special code for main.
1319 const Function &F = MF->getFunction();
1320 if (F.hasExternalLinkage() && F.getName() == "main")
1321 emitSpecialCodeForMain();
1324 static bool isDispSafeForFrameIndex(int64_t Val) {
1325 // On 64-bit platforms, we can run into an issue where a frame index
1326 // includes a displacement that, when added to the explicit displacement,
1327 // will overflow the displacement field. Assuming that the frame index
1328 // displacement fits into a 31-bit integer (which is only slightly more
1329 // aggressive than the current fundamental assumption that it fits into
1330 // a 32-bit integer), a 31-bit disp should always be safe.
1331 return isInt<31>(Val);
1334 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1335 X86ISelAddressMode &AM) {
1336 // If there's no offset to fold, we don't need to do any work.
1337 if (Offset == 0)
1338 return false;
1340 // Cannot combine ExternalSymbol displacements with integer offsets.
1341 if (AM.ES || AM.MCSym)
1342 return true;
1344 int64_t Val = AM.Disp + Offset;
1345 CodeModel::Model M = TM.getCodeModel();
1346 if (Subtarget->is64Bit()) {
1347 if (!X86::isOffsetSuitableForCodeModel(Val, M,
1348 AM.hasSymbolicDisplacement()))
1349 return true;
1350 // In addition to the checks required for a register base, check that
1351 // we do not try to use an unsafe Disp with a frame index.
1352 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1353 !isDispSafeForFrameIndex(Val))
1354 return true;
1356 AM.Disp = Val;
1357 return false;
1361 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1362 SDValue Address = N->getOperand(1);
1364 // load gs:0 -> GS segment register.
1365 // load fs:0 -> FS segment register.
1367 // This optimization is valid because the GNU TLS model defines that
1368 // gs:0 (or fs:0 on X86-64) contains its own address.
1369 // For more information see http://people.redhat.com/drepper/tls.pdf
1370 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1371 if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
1372 !IndirectTlsSegRefs &&
1373 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1374 Subtarget->isTargetFuchsia()))
1375 switch (N->getPointerInfo().getAddrSpace()) {
1376 case 256:
1377 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1378 return false;
1379 case 257:
1380 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1381 return false;
1382 // Address space 258 is not handled here, because it is not used to
1383 // address TLS areas.
1386 return true;
1389 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1390 /// mode. These wrap things that will resolve down into a symbol reference.
1391 /// If no match is possible, this returns true, otherwise it returns false.
1392 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1393 // If the addressing mode already has a symbol as the displacement, we can
1394 // never match another symbol.
1395 if (AM.hasSymbolicDisplacement())
1396 return true;
1398 bool IsRIPRelTLS = false;
1399 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1400 if (IsRIPRel) {
1401 SDValue Val = N.getOperand(0);
1402 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1403 IsRIPRelTLS = true;
1406 // We can't use an addressing mode in the 64-bit large code model.
1407 // Global TLS addressing is an exception. In the medium code model,
1408 // we can use such a mode when RIP wrappers are present.
1409 // That signifies access to globals that are known to be "near",
1410 // such as the GOT itself.
1411 CodeModel::Model M = TM.getCodeModel();
1412 if (Subtarget->is64Bit() &&
1413 ((M == CodeModel::Large && !IsRIPRelTLS) ||
1414 (M == CodeModel::Medium && !IsRIPRel)))
1415 return true;
1417 // Base and index reg must be 0 in order to use %rip as base.
1418 if (IsRIPRel && AM.hasBaseOrIndexReg())
1419 return true;
1421 // Make a local copy in case we can't do this fold.
1422 X86ISelAddressMode Backup = AM;
1424 int64_t Offset = 0;
1425 SDValue N0 = N.getOperand(0);
1426 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1427 AM.GV = G->getGlobal();
1428 AM.SymbolFlags = G->getTargetFlags();
1429 Offset = G->getOffset();
1430 } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1431 AM.CP = CP->getConstVal();
1432 AM.Align = CP->getAlignment();
1433 AM.SymbolFlags = CP->getTargetFlags();
1434 Offset = CP->getOffset();
1435 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1436 AM.ES = S->getSymbol();
1437 AM.SymbolFlags = S->getTargetFlags();
1438 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1439 AM.MCSym = S->getMCSymbol();
1440 } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1441 AM.JT = J->getIndex();
1442 AM.SymbolFlags = J->getTargetFlags();
1443 } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1444 AM.BlockAddr = BA->getBlockAddress();
1445 AM.SymbolFlags = BA->getTargetFlags();
1446 Offset = BA->getOffset();
1447 } else
1448 llvm_unreachable("Unhandled symbol reference node.");
1450 if (foldOffsetIntoAddress(Offset, AM)) {
1451 AM = Backup;
1452 return true;
1455 if (IsRIPRel)
1456 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1458 // Commit the changes now that we know this fold is safe.
1459 return false;
1462 /// Add the specified node to the specified addressing mode, returning true if
1463 /// it cannot be done. This just pattern matches for the addressing mode.
1464 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1465 if (matchAddressRecursively(N, AM, 0))
1466 return true;
1468 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1469 // a smaller encoding and avoids a scaled-index.
1470 if (AM.Scale == 2 &&
1471 AM.BaseType == X86ISelAddressMode::RegBase &&
1472 AM.Base_Reg.getNode() == nullptr) {
1473 AM.Base_Reg = AM.IndexReg;
1474 AM.Scale = 1;
1477 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1478 // because it has a smaller encoding.
1479 // TODO: Which other code models can use this?
1480 switch (TM.getCodeModel()) {
1481 default: break;
1482 case CodeModel::Small:
1483 case CodeModel::Kernel:
1484 if (Subtarget->is64Bit() &&
1485 AM.Scale == 1 &&
1486 AM.BaseType == X86ISelAddressMode::RegBase &&
1487 AM.Base_Reg.getNode() == nullptr &&
1488 AM.IndexReg.getNode() == nullptr &&
1489 AM.SymbolFlags == X86II::MO_NO_FLAG &&
1490 AM.hasSymbolicDisplacement())
1491 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1492 break;
1495 return false;
1498 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1499 unsigned Depth) {
1500 // Add an artificial use to this node so that we can keep track of
1501 // it if it gets CSE'd with a different node.
1502 HandleSDNode Handle(N);
1504 X86ISelAddressMode Backup = AM;
1505 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1506 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
1507 return false;
1508 AM = Backup;
1510 // Try again after commuting the operands.
1511 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1512 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
1513 return false;
1514 AM = Backup;
1516 // If we couldn't fold both operands into the address at the same time,
1517 // see if we can just put each operand into a register and fold at least
1518 // the add.
1519 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1520 !AM.Base_Reg.getNode() &&
1521 !AM.IndexReg.getNode()) {
1522 N = Handle.getValue();
1523 AM.Base_Reg = N.getOperand(0);
1524 AM.IndexReg = N.getOperand(1);
1525 AM.Scale = 1;
1526 return false;
1528 N = Handle.getValue();
1529 return true;
1532 // Insert a node into the DAG at least before the Pos node's position. This
1533 // will reposition the node as needed, and will assign it a node ID that is <=
1534 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1535 // IDs! The selection DAG must no longer depend on their uniqueness when this
1536 // is used.
1537 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1538 if (N->getNodeId() == -1 ||
1539 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1540 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1541 DAG.RepositionNode(Pos->getIterator(), N.getNode());
1542 // Mark Node as invalid for pruning as after this it may be a successor to a
1543 // selected node but otherwise be in the same position as Pos.
1544 // Conservatively mark it with the same -abs(Id) to assure node id
1545 // invariant is preserved.
1546 N->setNodeId(Pos->getNodeId());
1547 SelectionDAGISel::InvalidateNodeId(N.getNode());
1551 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1552 // safe. This allows us to convert the shift and and into an h-register
1553 // extract and a scaled index. Returns false if the simplification is
1554 // performed.
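// For example, with C1 == 2: (X >> 6) & (0xff << 2) becomes
// ((X >> 8) & 0xff) << 2, where the srl+and form the h-register extract and
// the final shl folds into a scale of 4 in the addressing mode.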
1555 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1556 uint64_t Mask,
1557 SDValue Shift, SDValue X,
1558 X86ISelAddressMode &AM) {
1559 if (Shift.getOpcode() != ISD::SRL ||
1560 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1561 !Shift.hasOneUse())
1562 return true;
1564 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1565 if (ScaleLog <= 0 || ScaleLog >= 4 ||
1566 Mask != (0xffu << ScaleLog))
1567 return true;
1569 MVT VT = N.getSimpleValueType();
1570 SDLoc DL(N);
1571 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1572 SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1573 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1574 SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1575 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1576 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1578 // Insert the new nodes into the topological ordering. We must do this in
1579 // a valid topological ordering as nothing is going to go back and re-sort
1580 // these nodes. We continually insert before 'N' in sequence as this is
1581 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1582 // hierarchy left to express.
1583 insertDAGNode(DAG, N, Eight);
1584 insertDAGNode(DAG, N, Srl);
1585 insertDAGNode(DAG, N, NewMask);
1586 insertDAGNode(DAG, N, And);
1587 insertDAGNode(DAG, N, ShlCount);
1588 insertDAGNode(DAG, N, Shl);
1589 DAG.ReplaceAllUsesWith(N, Shl);
1590 DAG.RemoveDeadNode(N.getNode());
1591 AM.IndexReg = And;
1592 AM.Scale = (1 << ScaleLog);
1593 return false;
1596 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1597 // allows us to fold the shift into this addressing mode. Returns false if the
1598 // transform succeeded.
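// For example, (X << 2) & 0x3fc becomes (X & 0xff) << 2; the left shift is
// then absorbed as a scale of 4 in the addressing mode.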
1599 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1600 X86ISelAddressMode &AM) {
1601 SDValue Shift = N.getOperand(0);
1603 // Use a signed mask so that shifting right will insert sign bits. These
1604 // bits will be removed when we shift the result left so it doesn't matter
1605 // what we use. This might allow a smaller immediate encoding.
1606 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1608 // If we have an any_extend feeding the AND, look through it to see if there
1609 // is a shift behind it. But only if the AND doesn't use the extended bits.
1610 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1611 bool FoundAnyExtend = false;
1612 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
1613 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
1614 isUInt<32>(Mask)) {
1615 FoundAnyExtend = true;
1616 Shift = Shift.getOperand(0);
1619 if (Shift.getOpcode() != ISD::SHL ||
1620 !isa<ConstantSDNode>(Shift.getOperand(1)))
1621 return true;
1623 SDValue X = Shift.getOperand(0);
1625 // Not likely to be profitable if either the AND or SHIFT node has more
1626 // than one use (unless all uses are for address computation). Besides,
1627 // isel mechanism requires their node ids to be reused.
1628 if (!N.hasOneUse() || !Shift.hasOneUse())
1629 return true;
1631 // Verify that the shift amount is something we can fold.
1632 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1633 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
1634 return true;
1636 MVT VT = N.getSimpleValueType();
1637 SDLoc DL(N);
1638 if (FoundAnyExtend) {
1639 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1640 insertDAGNode(DAG, N, NewX);
1641 X = NewX;
1644 SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1645 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1646 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1648 // Insert the new nodes into the topological ordering. We must do this in
1649 // a valid topological ordering as nothing is going to go back and re-sort
1650 // these nodes. We continually insert before 'N' in sequence as this is
1651 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1652 // hierarchy left to express.
1653 insertDAGNode(DAG, N, NewMask);
1654 insertDAGNode(DAG, N, NewAnd);
1655 insertDAGNode(DAG, N, NewShift);
1656 DAG.ReplaceAllUsesWith(N, NewShift);
1657 DAG.RemoveDeadNode(N.getNode());
1659 AM.Scale = 1 << ShiftAmt;
1660 AM.IndexReg = NewAnd;
1661 return false;
1664 // Implement some heroics to detect shifts of masked values where the mask can
1665 // be replaced by extending the shift and undoing that in the addressing mode
1666 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1667 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1668 // the addressing mode. This results in code such as:
1670 // int f(short *y, int *lookup_table) {
1671 // ...
1672 // return *y + lookup_table[*y >> 11];
1673 // }
1675 // Turning into:
1676 // movzwl (%rdi), %eax
1677 // movl %eax, %ecx
1678 // shrl $11, %ecx
1679 // addl (%rsi,%rcx,4), %eax
1681 // Instead of:
1682 // movzwl (%rdi), %eax
1683 // movl %eax, %ecx
1684 // shrl $9, %ecx
1685 // andl $124, %ecx
1686 // addl (%rsi,%rcx), %eax
1688 // Note that this function assumes the mask is provided as a mask *after* the
1689 // value is shifted. The input chain may or may not match that, but computing
1690 // such a mask is trivial.
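// For the example above: the combined DAG is (and (srl x, 9), 0x7c) with x
// zero-extended from i16, so ShiftAmt == 9 and the post-shift Mask == 0x7c.
// AMShiftAmt comes from the two trailing zeros of the mask, giving a new
// shift (srl x, 11) as the index register and Scale == 4, i.e. roughly the
// "shrl $11; addl (%rsi,%rcx,4)" sequence shown above.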
1691 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1692 uint64_t Mask,
1693 SDValue Shift, SDValue X,
1694 X86ISelAddressMode &AM) {
1695 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1696 !isa<ConstantSDNode>(Shift.getOperand(1)))
1697 return true;
1699 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1700 unsigned MaskLZ = countLeadingZeros(Mask);
1701 unsigned MaskTZ = countTrailingZeros(Mask);
1703 // The amount of shift we're trying to fit into the addressing mode is taken
1704 // from the trailing zeros of the mask.
1705 unsigned AMShiftAmt = MaskTZ;
1707 // There is nothing we can do here unless the mask is removing some bits.
1708 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1709 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1711 // We also need to ensure that the mask is a contiguous run of bits.
1712 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1714 // Scale the leading zero count down based on the actual size of the value.
1715 // Also scale it down based on the size of the shift.
1716 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1717 if (MaskLZ < ScaleDown)
1718 return true;
1719 MaskLZ -= ScaleDown;
1721 // The final check is to ensure that any masked out high bits of X are
1722 // already known to be zero. Otherwise, the mask has a semantic impact
1723 // other than masking out a couple of low bits. Unfortunately, because of
1724 // the mask, zero extensions will be removed from operands in some cases.
1725 // This code works extra hard to look through extensions because we can
1726 // replace them with zero extensions cheaply if necessary.
1727 bool ReplacingAnyExtend = false;
1728 if (X.getOpcode() == ISD::ANY_EXTEND) {
1729 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1730 X.getOperand(0).getSimpleValueType().getSizeInBits();
1731 // Assume that we'll replace the any-extend with a zero-extend, and
1732 // narrow the search to the extended value.
1733 X = X.getOperand(0);
1734 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1735 ReplacingAnyExtend = true;
1737 APInt MaskedHighBits =
1738 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1739 KnownBits Known = DAG.computeKnownBits(X);
1740 if (MaskedHighBits != Known.Zero) return true;
1742 // We've identified a pattern that can be transformed into a single shift
1743 // and an addressing mode. Make it so.
1744 MVT VT = N.getSimpleValueType();
1745 if (ReplacingAnyExtend) {
1746 assert(X.getValueType() != VT);
1747 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1748 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1749 insertDAGNode(DAG, N, NewX);
1750 X = NewX;
1752 SDLoc DL(N);
1753 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1754 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1755 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1756 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1758 // Insert the new nodes into the topological ordering. We must do this in
1759 // a valid topological ordering as nothing is going to go back and re-sort
1760 // these nodes. We continually insert before 'N' in sequence as this is
1761 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1762 // hierarchy left to express.
1763 insertDAGNode(DAG, N, NewSRLAmt);
1764 insertDAGNode(DAG, N, NewSRL);
1765 insertDAGNode(DAG, N, NewSHLAmt);
1766 insertDAGNode(DAG, N, NewSHL);
1767 DAG.ReplaceAllUsesWith(N, NewSHL);
1768 DAG.RemoveDeadNode(N.getNode());
1770 AM.Scale = 1 << AMShiftAmt;
1771 AM.IndexReg = NewSRL;
1772 return false;
1775 // Transform "(X >> SHIFT) & (MASK << C1)" to
1776 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1777 // matched to a BEXTR later. Returns false if the simplification is performed.
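// For example, on a target with TBM (or BMI with fast BEXTR):
// (X >> 5) & 0x3fc becomes ((X >> 7) & 0xff) << 2; the shift-and-mask part is
// expected to match as a BEXTR and the << 2 is absorbed as Scale == 4.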
1778 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1779 uint64_t Mask,
1780 SDValue Shift, SDValue X,
1781 X86ISelAddressMode &AM,
1782 const X86Subtarget &Subtarget) {
1783 if (Shift.getOpcode() != ISD::SRL ||
1784 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1785 !Shift.hasOneUse() || !N.hasOneUse())
1786 return true;
1788 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1789 if (!Subtarget.hasTBM() &&
1790 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1791 return true;
1793 // We need to ensure that the mask is a contiguous run of bits.
1794 if (!isShiftedMask_64(Mask)) return true;
1796 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1798 // The amount of shift we're trying to fit into the addressing mode is taken
1799 // from the trailing zeros of the mask.
1800 unsigned AMShiftAmt = countTrailingZeros(Mask);
1802 // There is nothing we can do here unless the mask is removing some bits.
1803 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1804 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1806 MVT VT = N.getSimpleValueType();
1807 SDLoc DL(N);
1808 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1809 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1810 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1811 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1812 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1813 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1815 // Insert the new nodes into the topological ordering. We must do this in
1816 // a valid topological ordering as nothing is going to go back and re-sort
1817 // these nodes. We continually insert before 'N' in sequence as this is
1818 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1819 // hierarchy left to express.
1820 insertDAGNode(DAG, N, NewSRLAmt);
1821 insertDAGNode(DAG, N, NewSRL);
1822 insertDAGNode(DAG, N, NewMask);
1823 insertDAGNode(DAG, N, NewAnd);
1824 insertDAGNode(DAG, N, NewSHLAmt);
1825 insertDAGNode(DAG, N, NewSHL);
1826 DAG.ReplaceAllUsesWith(N, NewSHL);
1827 DAG.RemoveDeadNode(N.getNode());
1829 AM.Scale = 1 << AMShiftAmt;
1830 AM.IndexReg = NewAnd;
1831 return false;
1834 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1835 unsigned Depth) {
1836 SDLoc dl(N);
1837 LLVM_DEBUG({
1838 dbgs() << "MatchAddress: ";
1839 AM.dump(CurDAG);
1841 // Limit recursion.
1842 if (Depth > 5)
1843 return matchAddressBase(N, AM);
1845 // If this is already a %rip relative address, we can only merge immediates
1846 // into it. Instead of handling this in every case, we handle it here.
1847 // RIP relative addressing: %rip + 32-bit displacement!
1848 if (AM.isRIPRelative()) {
1849 // FIXME: JumpTable and ExternalSymbol address currently don't like
1850 // displacements. It isn't very important, but this should be fixed for
1851 // consistency.
1852 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1853 return true;
1855 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1856 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1857 return false;
1858 return true;
1861 switch (N.getOpcode()) {
1862 default: break;
1863 case ISD::LOCAL_RECOVER: {
1864 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1865 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1866 // Use the symbol and don't prefix it.
1867 AM.MCSym = ESNode->getMCSymbol();
1868 return false;
1870 break;
1872 case ISD::Constant: {
1873 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1874 if (!foldOffsetIntoAddress(Val, AM))
1875 return false;
1876 break;
1879 case X86ISD::Wrapper:
1880 case X86ISD::WrapperRIP:
1881 if (!matchWrapper(N, AM))
1882 return false;
1883 break;
1885 case ISD::LOAD:
1886 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1887 return false;
1888 break;
1890 case ISD::FrameIndex:
1891 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1892 AM.Base_Reg.getNode() == nullptr &&
1893 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1894 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1895 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1896 return false;
1898 break;
1900 case ISD::SHL:
1901 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1902 break;
1904 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1905 unsigned Val = CN->getZExtValue();
1906 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1907 // that the base operand remains free for further matching. If
1908 // the base doesn't end up getting used, a post-processing step
1909 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1910 if (Val == 1 || Val == 2 || Val == 3) {
1911 AM.Scale = 1 << Val;
1912 SDValue ShVal = N.getOperand(0);
1914 // Okay, we know that we have a scale by now. However, if the scaled
1915 // value is an add of something and a constant, we can fold the
1916 // constant into the disp field here.
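// For example: ((x + 4) << 3) can become index = x with Scale == 8, and the
// constant is folded into the displacement as 4 << 3 == 32.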
1917 if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1918 AM.IndexReg = ShVal.getOperand(0);
1919 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1920 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1921 if (!foldOffsetIntoAddress(Disp, AM))
1922 return false;
1925 AM.IndexReg = ShVal;
1926 return false;
1929 break;
1931 case ISD::SRL: {
1932 // Scale must not be used already.
1933 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1935 // We only handle up to 64-bit values here as those are what matter for
1936 // addressing mode optimizations.
1937 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1938 "Unexpected value size!");
1940 SDValue And = N.getOperand(0);
1941 if (And.getOpcode() != ISD::AND) break;
1942 SDValue X = And.getOperand(0);
1944 // The mask used for the transform is expected to be post-shift, but we
1945 // found the shift first so just apply the shift to the mask before passing
1946 // it down.
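// For example: for (srl (and x, 0xff0), 4) the AND mask 0xff0 is converted to
// the post-shift mask 0xff0 >> 4 == 0xff before being handed to the fold below.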
1947 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1948 !isa<ConstantSDNode>(And.getOperand(1)))
1949 break;
1950 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1952 // Try to fold the mask and shift into the scale, and return false if we
1953 // succeed.
1954 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1955 return false;
1956 break;
1959 case ISD::SMUL_LOHI:
1960 case ISD::UMUL_LOHI:
1961 // A mul_lohi where we need the low part can be folded as a plain multiply.
1962 if (N.getResNo() != 0) break;
1963 LLVM_FALLTHROUGH;
1964 case ISD::MUL:
1965 case X86ISD::MUL_IMM:
1966 // X*[3,5,9] -> X+X*[2,4,8]
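// For example: x*9 becomes base = x, index = x, Scale == 8, which can later be
// emitted as a single LEA such as "leaq (%rax,%rax,8), %rcx".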
1967 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1968 AM.Base_Reg.getNode() == nullptr &&
1969 AM.IndexReg.getNode() == nullptr) {
1970 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1971 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1972 CN->getZExtValue() == 9) {
1973 AM.Scale = unsigned(CN->getZExtValue())-1;
1975 SDValue MulVal = N.getOperand(0);
1976 SDValue Reg;
1978 // Okay, we know that we have a scale by now. However, if the scaled
1979 // value is an add of something and a constant, we can fold the
1980 // constant into the disp field here.
1981 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1982 isa<ConstantSDNode>(MulVal.getOperand(1))) {
1983 Reg = MulVal.getOperand(0);
1984 ConstantSDNode *AddVal =
1985 cast<ConstantSDNode>(MulVal.getOperand(1));
1986 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1987 if (foldOffsetIntoAddress(Disp, AM))
1988 Reg = N.getOperand(0);
1989 } else {
1990 Reg = N.getOperand(0);
1993 AM.IndexReg = AM.Base_Reg = Reg;
1994 return false;
1997 break;
1999 case ISD::SUB: {
2000 // Given A-B, if A can be completely folded into the address (leaving the
2001 // index field unused), use -B as the index.
2002 // This is a win if A has multiple parts that can be folded into
2003 // the address. It also saves a mov if the base register has
2004 // other uses, since it avoids a two-address sub instruction; however,
2005 // it costs an additional mov if the index register has other uses.
2007 // Add an artificial use to this node so that we can keep track of
2008 // it if it gets CSE'd with a different node.
2009 HandleSDNode Handle(N);
2011 // Test if the LHS of the sub can be folded.
2012 X86ISelAddressMode Backup = AM;
2013 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2014 N = Handle.getValue();
2015 AM = Backup;
2016 break;
2018 N = Handle.getValue();
2019 // Test if the index field is free for use.
2020 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2021 AM = Backup;
2022 break;
2025 int Cost = 0;
2026 SDValue RHS = N.getOperand(1);
2027 // If the RHS involves a register with multiple uses, this
2028 // transformation incurs an extra mov, due to the neg instruction
2029 // clobbering its operand.
2030 if (!RHS.getNode()->hasOneUse() ||
2031 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2032 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2033 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2034 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2035 RHS.getOperand(0).getValueType() == MVT::i32))
2036 ++Cost;
2037 // If the base is a register with multiple uses, this
2038 // transformation may save a mov.
2039 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2040 !AM.Base_Reg.getNode()->hasOneUse()) ||
2041 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2042 --Cost;
2043 // If the folded LHS was interesting, this transformation saves
2044 // address arithmetic.
2045 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2046 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2047 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2048 --Cost;
2049 // If it doesn't look like it may be an overall win, don't do it.
2050 if (Cost >= 0) {
2051 AM = Backup;
2052 break;
2055 // Ok, the transformation is legal and appears profitable. Go for it.
2056 // Negation will be emitted later to avoid creating dangling nodes if this
2057 // was an unprofitable LEA.
2058 AM.IndexReg = RHS;
2059 AM.NegateIndex = true;
2060 AM.Scale = 1;
2061 return false;
2064 case ISD::ADD:
2065 if (!matchAdd(N, AM, Depth))
2066 return false;
2067 break;
2069 case ISD::OR:
2070 // We want to look through a transform in InstCombine and DAGCombiner that
2071 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2072 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2073 // An 'lea' can then be used to match the shift (multiply) and add:
2074 // and $1, %esi
2075 // lea (%rsi, %rdi, 8), %rax
2076 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2077 !matchAdd(N, AM, Depth))
2078 return false;
2079 break;
2081 case ISD::AND: {
2082 // Perform some heroic transforms on an and of a constant-count shift
2083 // with a constant to enable use of the scaled offset field.
2085 // Scale must not be used already.
2086 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2088 // We only handle up to 64-bit values here as those are what matter for
2089 // addressing mode optimizations.
2090 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2091 "Unexpected value size!");
2093 if (!isa<ConstantSDNode>(N.getOperand(1)))
2094 break;
2096 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2097 SDValue Shift = N.getOperand(0);
2098 SDValue X = Shift.getOperand(0);
2100 uint64_t Mask = N.getConstantOperandVal(1);
2102 // Try to fold the mask and shift into an extract and scale.
2103 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2104 return false;
2106 // Try to fold the mask and shift directly into the scale.
2107 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2108 return false;
2110 // Try to fold the mask and shift into BEXTR and scale.
2111 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2112 return false;
2115 // Try to swap the mask and shift to place shifts which can be done as
2116 // a scale on the outside of the mask.
2117 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2118 return false;
2120 break;
2122 case ISD::ZERO_EXTEND: {
2123 // Try to widen a zexted shift left to the same size as its use, so we can
2124 // match the shift as a scale factor.
2125 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2126 break;
2127 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2128 break;
2130 // Give up if the shift is not a valid scale factor [1,2,3].
2131 SDValue Shl = N.getOperand(0);
2132 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2133 if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2134 break;
2136 // The narrow shift must only shift out zero bits (it must be 'nuw').
2137 // That makes it safe to widen to the destination type.
2138 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2139 ShAmtC->getZExtValue());
2140 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2141 break;
2143 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2144 MVT VT = N.getSimpleValueType();
2145 SDLoc DL(N);
2146 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2147 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2149 // Convert the shift to scale factor.
2150 AM.Scale = 1 << ShAmtC->getZExtValue();
2151 AM.IndexReg = Zext;
2153 insertDAGNode(*CurDAG, N, Zext);
2154 insertDAGNode(*CurDAG, N, NewShl);
2155 CurDAG->ReplaceAllUsesWith(N, NewShl);
2156 CurDAG->RemoveDeadNode(N.getNode());
2157 return false;
2161 return matchAddressBase(N, AM);
2164 /// Helper for MatchAddress. Add the specified node to the
2165 /// specified addressing mode without any further recursion.
2166 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2167 // Is the base register already occupied?
2168 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2169 // If so, check to see if the scale index register is set.
2170 if (!AM.IndexReg.getNode()) {
2171 AM.IndexReg = N;
2172 AM.Scale = 1;
2173 return false;
2176 // Otherwise, we cannot select it.
2177 return true;
2180 // Default, generate it as a register.
2181 AM.BaseType = X86ISelAddressMode::RegBase;
2182 AM.Base_Reg = N;
2183 return false;
2186 /// Helper for selectVectorAddr. Handles things that can be folded into a
2187 /// gather scatter address. The index register and scale should have already
2188 /// been handled.
2189 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2190 // TODO: Support other operations.
2191 switch (N.getOpcode()) {
2192 case ISD::Constant: {
2193 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2194 if (!foldOffsetIntoAddress(Val, AM))
2195 return false;
2196 break;
2198 case X86ISD::Wrapper:
2199 if (!matchWrapper(N, AM))
2200 return false;
2201 break;
2204 return matchAddressBase(N, AM);
2207 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2208 SDValue &Scale, SDValue &Index,
2209 SDValue &Disp, SDValue &Segment) {
2210 X86ISelAddressMode AM;
2211 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2212 AM.IndexReg = Mgs->getIndex();
2213 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2215 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2216 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2217 if (AddrSpace == 256)
2218 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2219 if (AddrSpace == 257)
2220 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2221 if (AddrSpace == 258)
2222 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2224 SDLoc DL(N);
2225 MVT VT = N.getSimpleValueType();
2227 // Try to match into the base and displacement fields.
2228 if (matchVectorAddress(N, AM))
2229 return false;
2231 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2232 return true;
2235 /// Returns true if it is able to pattern match an addressing mode.
2236 /// It returns the operands which make up the maximal addressing mode it can
2237 /// match by reference.
2239 /// Parent is the parent node of the addr operand that is being matched. It
2240 /// is always a load, store, atomic node, or null. It is only null when
2241 /// checking memory operands for inline asm nodes.
2242 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2243 SDValue &Scale, SDValue &Index,
2244 SDValue &Disp, SDValue &Segment) {
2245 X86ISelAddressMode AM;
2247 if (Parent &&
2248 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2249 // that are not a MemSDNode, and thus don't have proper addrspace info.
2250 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2251 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2252 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2253 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2254 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2255 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2256 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2257 unsigned AddrSpace =
2258 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2259 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2260 if (AddrSpace == 256)
2261 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2262 if (AddrSpace == 257)
2263 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2264 if (AddrSpace == 258)
2265 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2268 // Save the DL and VT before calling matchAddress, it can invalidate N.
2269 SDLoc DL(N);
2270 MVT VT = N.getSimpleValueType();
2272 if (matchAddress(N, AM))
2273 return false;
2275 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2276 return true;
2279 // We can only fold a load if all nodes between it and the root node have a
2280 // single use. If there are additional uses, we could end up duplicating the
2281 // load.
2282 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2283 while (User != Root) {
2284 if (!User->hasOneUse())
2285 return false;
2286 User = *User->use_begin();
2289 return true;
2292 /// Match a scalar SSE load. In particular, we want to match a load whose top
2293 /// elements are either undef or zeros. The load flavor is derived from the
2294 /// type of N, which is either v4f32 or v2f64.
2296 /// We also return:
2297 /// PatternChainNode: this is the matched node that has a chain input and
2298 /// output.
2299 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2300 SDValue N, SDValue &Base,
2301 SDValue &Scale, SDValue &Index,
2302 SDValue &Disp, SDValue &Segment,
2303 SDValue &PatternNodeWithChain) {
2304 if (!hasSingleUsesFromRoot(Root, Parent))
2305 return false;
2307 // We can allow a full vector load here since narrowing a load is ok unless
2308 // it's volatile.
2309 if (ISD::isNON_EXTLoad(N.getNode())) {
2310 LoadSDNode *LD = cast<LoadSDNode>(N);
2311 if (!LD->isVolatile() &&
2312 IsProfitableToFold(N, LD, Root) &&
2313 IsLegalToFold(N, Parent, Root, OptLevel)) {
2314 PatternNodeWithChain = N;
2315 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2316 Segment);
2320 // We can also match the special zero extended load opcode.
2321 if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2322 PatternNodeWithChain = N;
2323 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2324 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2325 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2326 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2327 Segment);
2331 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2332 // once. Otherwise the load might get duplicated and the chain output of the
2333 // duplicate load will not be observed by all dependencies.
2334 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2335 PatternNodeWithChain = N.getOperand(0);
2336 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2337 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2338 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2339 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2340 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2341 Segment);
2345 return false;
2349 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2350 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2351 uint64_t ImmVal = CN->getZExtValue();
2352 if (!isUInt<32>(ImmVal))
2353 return false;
2355 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2356 return true;
2359 // In static codegen with small code model, we can get the address of a label
2360 // into a register with 'movl'.
2361 if (N->getOpcode() != X86ISD::Wrapper)
2362 return false;
2364 N = N.getOperand(0);
2366 // At least GNU as does not accept 'movl' for TPOFF relocations.
2367 // FIXME: We could use 'movl' when we know we are targeting MC.
2368 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2369 return false;
2371 Imm = N;
2372 if (N->getOpcode() != ISD::TargetGlobalAddress)
2373 return TM.getCodeModel() == CodeModel::Small;
2375 Optional<ConstantRange> CR =
2376 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2377 if (!CR)
2378 return TM.getCodeModel() == CodeModel::Small;
2380 return CR->getUnsignedMax().ult(1ull << 32);
2383 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2384 SDValue &Scale, SDValue &Index,
2385 SDValue &Disp, SDValue &Segment) {
2386 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2387 SDLoc DL(N);
2389 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2390 return false;
2392 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2393 if (RN && RN->getReg() == 0)
2394 Base = CurDAG->getRegister(0, MVT::i64);
2395 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2396 // Base could already be %rip, particularly in the x32 ABI.
2397 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2398 MVT::i64), 0);
2399 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2400 Base);
2403 RN = dyn_cast<RegisterSDNode>(Index);
2404 if (RN && RN->getReg() == 0)
2405 Index = CurDAG->getRegister(0, MVT::i64);
2406 else {
2407 assert(Index.getValueType() == MVT::i32 &&
2408 "Expect to be extending 32-bit registers for use in LEA");
2409 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2410 MVT::i64), 0);
2411 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2412 Index);
2415 return true;
2418 /// Calls SelectAddr and determines if the maximal addressing
2419 /// mode it matches can be cost effectively emitted as an LEA instruction.
2420 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2421 SDValue &Base, SDValue &Scale,
2422 SDValue &Index, SDValue &Disp,
2423 SDValue &Segment) {
2424 X86ISelAddressMode AM;
2426 // Save the DL and VT before calling matchAddress, it can invalidate N.
2427 SDLoc DL(N);
2428 MVT VT = N.getSimpleValueType();
2430 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2431 // segments.
2432 SDValue Copy = AM.Segment;
2433 SDValue T = CurDAG->getRegister(0, MVT::i32);
2434 AM.Segment = T;
2435 if (matchAddress(N, AM))
2436 return false;
2437 assert (T == AM.Segment);
2438 AM.Segment = Copy;
2440 unsigned Complexity = 0;
2441 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2442 Complexity = 1;
2443 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2444 Complexity = 4;
2446 if (AM.IndexReg.getNode())
2447 Complexity++;
2449 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or to use
2450 // a simple shift.
2451 if (AM.Scale > 1)
2452 Complexity++;
2454 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2455 // to a LEA. This is determined with some experimentation but is by no means
2456 // optimal (especially for code size consideration). LEA is nice because of
2457 // its three-address nature. Tweak the cost function again when we can run
2458 // convertToThreeAddress() at register allocation time.
2459 if (AM.hasSymbolicDisplacement()) {
2460 // For X86-64, always use LEA to materialize RIP-relative addresses.
2461 if (Subtarget->is64Bit())
2462 Complexity = 4;
2463 else
2464 Complexity += 2;
2467 if (AM.Disp)
2468 Complexity++;
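// As a rough illustration of the heuristic: "base + scale*index + disp" scores
// 1 (base) + 1 (index) + 1 (scale > 1) + 1 (disp) = 4 and is kept as an LEA,
// while a plain "base + disp" scores 2 and is rejected below, leaving it to be
// selected by cheaper patterns such as ADD.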
2470 // If it isn't worth using an LEA, reject it.
2471 if (Complexity <= 2)
2472 return false;
2474 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2475 return true;
2478 /// This is only run on TargetGlobalTLSAddress nodes.
2479 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2480 SDValue &Scale, SDValue &Index,
2481 SDValue &Disp, SDValue &Segment) {
2482 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2483 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2485 X86ISelAddressMode AM;
2486 AM.GV = GA->getGlobal();
2487 AM.Disp += GA->getOffset();
2488 AM.SymbolFlags = GA->getTargetFlags();
2490 MVT VT = N.getSimpleValueType();
2491 if (VT == MVT::i32) {
2492 AM.Scale = 1;
2493 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2496 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2497 return true;
2500 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2501 if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2502 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2503 N.getValueType());
2504 return true;
2507 // Keep track of the original value type and whether this value was
2508 // truncated. If we see a truncation from pointer type to VT that truncates
2509 // bits that are known to be zero, we can use a narrow reference.
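// For example: an i64 global address truncated to i32 can be used as a 32-bit
// immediate if the global carries an absolute-symbol range that fits in 32
// bits; the range check below enforces this.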
2510 EVT VT = N.getValueType();
2511 bool WasTruncated = false;
2512 if (N.getOpcode() == ISD::TRUNCATE) {
2513 WasTruncated = true;
2514 N = N.getOperand(0);
2517 if (N.getOpcode() != X86ISD::Wrapper)
2518 return false;
2520 // We can only use non-GlobalValues as immediates if they were not truncated,
2521 // as we do not have any range information. If we have a GlobalValue and the
2522 // address was not truncated, we can select it as an operand directly.
2523 unsigned Opc = N.getOperand(0)->getOpcode();
2524 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2525 Op = N.getOperand(0);
2526 // We can only select the operand directly if we didn't have to look past a
2527 // truncate.
2528 return !WasTruncated;
2531 // Check that the global's range fits into VT.
2532 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2533 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2534 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2535 return false;
2537 // Okay, we can use a narrow reference.
2538 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2539 GA->getOffset(), GA->getTargetFlags());
2540 return true;
2543 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2544 SDValue &Base, SDValue &Scale,
2545 SDValue &Index, SDValue &Disp,
2546 SDValue &Segment) {
2547 if (!ISD::isNON_EXTLoad(N.getNode()) ||
2548 !IsProfitableToFold(N, P, Root) ||
2549 !IsLegalToFold(N, P, Root, OptLevel))
2550 return false;
2552 return selectAddr(N.getNode(),
2553 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2556 /// Return an SDNode that returns the value of the global base register.
2557 /// Output instructions required to initialize the global base register,
2558 /// if necessary.
2559 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2560 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2561 auto &DL = MF->getDataLayout();
2562 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2565 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2566 if (N->getOpcode() == ISD::TRUNCATE)
2567 N = N->getOperand(0).getNode();
2568 if (N->getOpcode() != X86ISD::Wrapper)
2569 return false;
2571 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2572 if (!GA)
2573 return false;
2575 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2576 return CR && CR->getSignedMin().sge(-1ull << Width) &&
2577 CR->getSignedMax().slt(1ull << Width);
2580 static X86::CondCode getCondFromNode(SDNode *N) {
2581 assert(N->isMachineOpcode() && "Unexpected node");
2582 X86::CondCode CC = X86::COND_INVALID;
2583 unsigned Opc = N->getMachineOpcode();
2584 if (Opc == X86::JCC_1)
2585 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2586 else if (Opc == X86::SETCCr)
2587 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2588 else if (Opc == X86::SETCCm)
2589 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2590 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2591 Opc == X86::CMOV64rr)
2592 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2593 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2594 Opc == X86::CMOV64rm)
2595 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2597 return CC;
2600 /// Return true if every flag user of the given X86ISD::CMP node only reads
2601 /// the zero flag (ZF).
2602 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2603 // Examine each user of the node.
2604 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2605 UI != UE; ++UI) {
2606 // Only check things that use the flags.
2607 if (UI.getUse().getResNo() != Flags.getResNo())
2608 continue;
2609 // Only examine CopyToReg uses that copy to EFLAGS.
2610 if (UI->getOpcode() != ISD::CopyToReg ||
2611 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2612 return false;
2613 // Examine each user of the CopyToReg use.
2614 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2615 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2616 // Only examine the Flag result.
2617 if (FlagUI.getUse().getResNo() != 1) continue;
2618 // Anything unusual: assume conservatively.
2619 if (!FlagUI->isMachineOpcode()) return false;
2620 // Examine the condition code of the user.
2621 X86::CondCode CC = getCondFromNode(*FlagUI);
2623 switch (CC) {
2624 // Comparisons which only use the zero flag.
2625 case X86::COND_E: case X86::COND_NE:
2626 continue;
2627 // Anything else: assume conservatively.
2628 default:
2629 return false;
2633 return true;
2636 /// Return true if the given X86ISD::CMP node has no uses which require the SF
2637 /// flag to be accurate.
2638 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2639 // Examine each user of the node.
2640 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2641 UI != UE; ++UI) {
2642 // Only check things that use the flags.
2643 if (UI.getUse().getResNo() != Flags.getResNo())
2644 continue;
2645 // Only examine CopyToReg uses that copy to EFLAGS.
2646 if (UI->getOpcode() != ISD::CopyToReg ||
2647 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2648 return false;
2649 // Examine each user of the CopyToReg use.
2650 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2651 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2652 // Only examine the Flag result.
2653 if (FlagUI.getUse().getResNo() != 1) continue;
2654 // Anything unusual: assume conservatively.
2655 if (!FlagUI->isMachineOpcode()) return false;
2656 // Examine the condition code of the user.
2657 X86::CondCode CC = getCondFromNode(*FlagUI);
2659 switch (CC) {
2660 // Comparisons which don't examine the SF flag.
2661 case X86::COND_A: case X86::COND_AE:
2662 case X86::COND_B: case X86::COND_BE:
2663 case X86::COND_E: case X86::COND_NE:
2664 case X86::COND_O: case X86::COND_NO:
2665 case X86::COND_P: case X86::COND_NP:
2666 continue;
2667 // Anything else: assume conservatively.
2668 default:
2669 return false;
2673 return true;
2676 static bool mayUseCarryFlag(X86::CondCode CC) {
2677 switch (CC) {
2678 // Comparisons which don't examine the CF flag.
2679 case X86::COND_O: case X86::COND_NO:
2680 case X86::COND_E: case X86::COND_NE:
2681 case X86::COND_S: case X86::COND_NS:
2682 case X86::COND_P: case X86::COND_NP:
2683 case X86::COND_L: case X86::COND_GE:
2684 case X86::COND_G: case X86::COND_LE:
2685 return false;
2686 // Anything else: assume conservatively.
2687 default:
2688 return true;
2692 /// Return true if the given flag-producing node has no uses which require the
2693 /// CF flag to be accurate.
2694 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2695 // Examine each user of the node.
2696 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2697 UI != UE; ++UI) {
2698 // Only check things that use the flags.
2699 if (UI.getUse().getResNo() != Flags.getResNo())
2700 continue;
2702 unsigned UIOpc = UI->getOpcode();
2704 if (UIOpc == ISD::CopyToReg) {
2705 // Only examine CopyToReg uses that copy to EFLAGS.
2706 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2707 return false;
2708 // Examine each user of the CopyToReg use.
2709 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2710 FlagUI != FlagUE; ++FlagUI) {
2711 // Only examine the Flag result.
2712 if (FlagUI.getUse().getResNo() != 1)
2713 continue;
2714 // Anything unusual: assume conservatively.
2715 if (!FlagUI->isMachineOpcode())
2716 return false;
2717 // Examine the condition code of the user.
2718 X86::CondCode CC = getCondFromNode(*FlagUI);
2720 if (mayUseCarryFlag(CC))
2721 return false;
2724 // This CopyToReg is ok. Move on to the next user.
2725 continue;
2728 // This might be an unselected node. So look for the pre-isel opcodes that
2729 // use flags.
2730 unsigned CCOpNo;
2731 switch (UIOpc) {
2732 default:
2733 // Something unusual. Be conservative.
2734 return false;
2735 case X86ISD::SETCC: CCOpNo = 0; break;
2736 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2737 case X86ISD::CMOV: CCOpNo = 2; break;
2738 case X86ISD::BRCOND: CCOpNo = 2; break;
2741 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2742 if (mayUseCarryFlag(CC))
2743 return false;
2745 return true;
2748 /// Check whether or not the chain ending in StoreNode is suitable for the
2749 /// {load; op; store} to memory-operand (read-modify-write) transformation.
2750 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2751 SDValue StoredVal, SelectionDAG *CurDAG,
2752 unsigned LoadOpNo,
2753 LoadSDNode *&LoadNode,
2754 SDValue &InputChain) {
2755 // Is the stored value result 0 of the operation?
2756 if (StoredVal.getResNo() != 0) return false;
2758 // Are there other uses of the operation other than the store?
2759 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2761 // Is the store non-extending and non-indexed?
2762 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2763 return false;
2765 SDValue Load = StoredVal->getOperand(LoadOpNo);
2766 // Is the stored value a non-extending and non-indexed load?
2767 if (!ISD::isNormalLoad(Load.getNode())) return false;
2769 // Return LoadNode by reference.
2770 LoadNode = cast<LoadSDNode>(Load);
2772 // Is store the only read of the loaded value?
2773 if (!Load.hasOneUse())
2774 return false;
2776 // Is the address of the store the same as the load?
2777 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2778 LoadNode->getOffset() != StoreNode->getOffset())
2779 return false;
2781 bool FoundLoad = false;
2782 SmallVector<SDValue, 4> ChainOps;
2783 SmallVector<const SDNode *, 4> LoopWorklist;
2784 SmallPtrSet<const SDNode *, 16> Visited;
2785 const unsigned int Max = 1024;
2787 // Visualization of Load-Op-Store fusion:
2788 // -------------------------
2789 // Legend:
2790 // *-lines = Chain operand dependencies.
2791 // |-lines = Normal operand dependencies.
2792 // Dependencies flow down and right. n-suffix references multiple nodes.
2794 // C Xn C
2795 // * * *
2796 // * * *
2797 // Xn A-LD Yn TF Yn
2798 // * * \ | * |
2799 // * * \ | * |
2800 // * * \ | => A--LD_OP_ST
2801 // * * \| \
2802 // TF OP \
2803 // * | \ Zn
2804 // * | \
2805 // A-ST Zn
2808 // This merge induces dependencies from: #1: Xn -> LD, OP, Zn
2809 // #2: Yn -> LD
2810 // #3: ST -> Zn
2812 // Ensure the transform is safe by checking for the dual
2813 // dependencies to make sure we do not induce a loop.
2815 // As LD is a predecessor to both OP and ST we can do this by checking:
2816 // a). if LD is a predecessor to a member of Xn or Yn.
2817 // b). if a Zn is a predecessor to ST.
2819 // However, (b) can only occur through being a chain predecessor to
2820 // ST, which is the same as Zn being a member or predecessor of Xn,
2821 // which is a subset of LD being a predecessor of Xn. So it's
2822 // subsumed by check (a).
2824 SDValue Chain = StoreNode->getChain();
2826 // Gather X elements in ChainOps.
2827 if (Chain == Load.getValue(1)) {
2828 FoundLoad = true;
2829 ChainOps.push_back(Load.getOperand(0));
2830 } else if (Chain.getOpcode() == ISD::TokenFactor) {
2831 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2832 SDValue Op = Chain.getOperand(i);
2833 if (Op == Load.getValue(1)) {
2834 FoundLoad = true;
2835 // Drop Load, but keep its chain. No cycle check necessary.
2836 ChainOps.push_back(Load.getOperand(0));
2837 continue;
2839 LoopWorklist.push_back(Op.getNode());
2840 ChainOps.push_back(Op);
2844 if (!FoundLoad)
2845 return false;
2847 // Worklist is currently Xn. Add Yn to worklist.
2848 for (SDValue Op : StoredVal->ops())
2849 if (Op.getNode() != LoadNode)
2850 LoopWorklist.push_back(Op.getNode());
2852 // Check (a) if Load is a predecessor to Xn + Yn
2853 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2854 true))
2855 return false;
2857 InputChain =
2858 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2859 return true;
2862 // Change a chain of {load; op; store} of the same value into a simple op
2863 // through memory of that value, if the uses of the modified value and its
2864 // address are suitable.
2866 // The tablegen memory-operand pattern is currently not able to match the case
2867 // where the EFLAGS from the original operation are used.
2869 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2870 // be transferred from a node in the pattern to the result node, probably with
2871 // a new keyword. For example, we have this
2872 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2873 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2874 // (implicit EFLAGS)]>;
2875 // but we may need something like this:
2876 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2877 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2878 // (transferrable EFLAGS)]>;
2880 // Until then, we manually fold these and instruction select the operation
2881 // here.
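// For example: a DAG of the shape (store (X86ISD::ADD (load [p]), 1), [p]),
// where nothing else reads the ADD's carry flag, can be folded here into a
// single "incq (p)" (or "addq $1, (p)" on targets where inc/dec is slow and we
// are not optimizing for size).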
2882 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2883 StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2884 SDValue StoredVal = StoreNode->getOperand(1);
2885 unsigned Opc = StoredVal->getOpcode();
2887 // Before we try to select anything, make sure this is memory operand size
2888 // and opcode we can handle. Note that this must match the code below that
2889 // actually lowers the opcodes.
2890 EVT MemVT = StoreNode->getMemoryVT();
2891 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2892 MemVT != MVT::i8)
2893 return false;
2895 bool IsCommutable = false;
2896 bool IsNegate = false;
2897 switch (Opc) {
2898 default:
2899 return false;
2900 case X86ISD::SUB:
2901 IsNegate = isNullConstant(StoredVal.getOperand(0));
2902 break;
2903 case X86ISD::SBB:
2904 break;
2905 case X86ISD::ADD:
2906 case X86ISD::ADC:
2907 case X86ISD::AND:
2908 case X86ISD::OR:
2909 case X86ISD::XOR:
2910 IsCommutable = true;
2911 break;
2914 unsigned LoadOpNo = IsNegate ? 1 : 0;
2915 LoadSDNode *LoadNode = nullptr;
2916 SDValue InputChain;
2917 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2918 LoadNode, InputChain)) {
2919 if (!IsCommutable)
2920 return false;
2922 // This operation is commutable, try the other operand.
2923 LoadOpNo = 1;
2924 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2925 LoadNode, InputChain))
2926 return false;
2929 SDValue Base, Scale, Index, Disp, Segment;
2930 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2931 Segment))
2932 return false;
2934 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2935 unsigned Opc8) {
2936 switch (MemVT.getSimpleVT().SimpleTy) {
2937 case MVT::i64:
2938 return Opc64;
2939 case MVT::i32:
2940 return Opc32;
2941 case MVT::i16:
2942 return Opc16;
2943 case MVT::i8:
2944 return Opc8;
2945 default:
2946 llvm_unreachable("Invalid size!");
2950 MachineSDNode *Result;
2951 switch (Opc) {
2952 case X86ISD::SUB:
2953 // Handle negate.
2954 if (IsNegate) {
2955 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
2956 X86::NEG8m);
2957 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2958 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2959 MVT::Other, Ops);
2960 break;
2962 LLVM_FALLTHROUGH;
2963 case X86ISD::ADD:
2964 // Try to match inc/dec.
2965 if (!Subtarget->slowIncDec() || OptForSize) {
2966 bool IsOne = isOneConstant(StoredVal.getOperand(1));
2967 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2968 // An ADD/SUB of 1/-1 whose carry flag is not used can be selected as inc/dec.
2969 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
2970 unsigned NewOpc =
2971 ((Opc == X86ISD::ADD) == IsOne)
2972 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
2973 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
2974 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2975 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2976 MVT::Other, Ops);
2977 break;
2980 LLVM_FALLTHROUGH;
2981 case X86ISD::ADC:
2982 case X86ISD::SBB:
2983 case X86ISD::AND:
2984 case X86ISD::OR:
2985 case X86ISD::XOR: {
2986 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
2987 switch (Opc) {
2988 case X86ISD::ADD:
2989 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
2990 X86::ADD8mr);
2991 case X86ISD::ADC:
2992 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
2993 X86::ADC8mr);
2994 case X86ISD::SUB:
2995 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
2996 X86::SUB8mr);
2997 case X86ISD::SBB:
2998 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
2999 X86::SBB8mr);
3000 case X86ISD::AND:
3001 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3002 X86::AND8mr);
3003 case X86ISD::OR:
3004 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3005 case X86ISD::XOR:
3006 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3007 X86::XOR8mr);
3008 default:
3009 llvm_unreachable("Invalid opcode!");
3012 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3013 switch (Opc) {
3014 case X86ISD::ADD:
3015 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3016 case X86ISD::ADC:
3017 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3018 case X86ISD::SUB:
3019 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3020 case X86ISD::SBB:
3021 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3022 case X86ISD::AND:
3023 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3024 case X86ISD::OR:
3025 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3026 case X86ISD::XOR:
3027 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3028 default:
3029 llvm_unreachable("Invalid opcode!");
3032 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3033 switch (Opc) {
3034 case X86ISD::ADD:
3035 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3036 X86::ADD8mi);
3037 case X86ISD::ADC:
3038 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3039 X86::ADC8mi);
3040 case X86ISD::SUB:
3041 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3042 X86::SUB8mi);
3043 case X86ISD::SBB:
3044 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3045 X86::SBB8mi);
3046 case X86ISD::AND:
3047 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3048 X86::AND8mi);
3049 case X86ISD::OR:
3050 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3051 X86::OR8mi);
3052 case X86ISD::XOR:
3053 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3054 X86::XOR8mi);
3055 default:
3056 llvm_unreachable("Invalid opcode!");
3060 unsigned NewOpc = SelectRegOpcode(Opc);
3061 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3063 // See if the operand is a constant that we can fold into an immediate
3064 // operand.
3065 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3066 int64_t OperandV = OperandC->getSExtValue();
3068 // Check if we can shrink the operand enough to fit in an immediate (or
3069 // fit into a smaller immediate) by negating it and switching the
3070 // operation.
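// For example: "add $128" does not fit in an 8-bit immediate, but its negation
// does, so it can be rewritten as "sub $-128" and encoded with the imm8 form
// (provided nothing reads the carry flag, since ADD and SUB set it differently).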
3071 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3072 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3073 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3074 isInt<32>(-OperandV))) &&
3075 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3076 OperandV = -OperandV;
3077 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3080 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3081 // the larger immediate operand.
3082 if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3083 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3084 NewOpc = SelectImm8Opcode(Opc);
3085 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3086 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3087 NewOpc = SelectImmOpcode(Opc);
3091 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3092 SDValue CopyTo =
3093 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3094 StoredVal.getOperand(2), SDValue());
3096 const SDValue Ops[] = {Base, Scale, Index, Disp,
3097 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3098 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3099 Ops);
3100 } else {
3101 const SDValue Ops[] = {Base, Scale, Index, Disp,
3102 Segment, Operand, InputChain};
3103 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3104 Ops);
3106 break;
3108 default:
3109 llvm_unreachable("Invalid opcode!");
3112 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3113 LoadNode->getMemOperand()};
3114 CurDAG->setNodeMemRefs(Result, MemOps);
3116 // Update Load Chain uses as well.
3117 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3118 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3119 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3120 CurDAG->RemoveDeadNode(Node);
3121 return true;
3124 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3125 // Where Mask is one of the following patterns:
3126 // a) x & (1 << nbits) - 1
3127 // b) x & ~(-1 << nbits)
3128 // c) x & (-1 >> (32 - y))
3129 // d) x << (32 - y) >> (32 - y)
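// For example: for pattern (a) with a variable nbits in %cl, "x & ((1 << cl) - 1)"
// extracts the low cl bits of x and is expected to select to BZHI on BMI2
// targets (or be shifted into a BEXTR-compatible form when only BMI is available).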
3130 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3131 assert(
3132 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3133 "Should be either an and-mask, or right-shift after clearing high bits.");
3135 // BEXTR is a BMI instruction and BZHI is a BMI2 instruction. We need at least one.
3136 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3137 return false;
3139 MVT NVT = Node->getSimpleValueType(0);
3141 // Only supported for 32 and 64 bits.
3142 if (NVT != MVT::i32 && NVT != MVT::i64)
3143 return false;
3145 SDValue NBits;
3147 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3148 // Else, if we only have BMI1's BEXTR, we require one-use.
3149 const bool CanHaveExtraUses = Subtarget->hasBMI2();
3150 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3151 return CanHaveExtraUses ||
3152 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3154 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3155 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3157 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3158 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3159 assert(V.getSimpleValueType() == MVT::i32 &&
3160 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3161 "Expected i64 -> i32 truncation");
3162 V = V.getOperand(0);
3164 return V;
3167 // a) x & ((1 << nbits) + (-1))
3168 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3169 &NBits](SDValue Mask) -> bool {
3170 // Match `add`. Must only have one use!
3171 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3172 return false;
3173 // We should be adding all-ones constant (i.e. subtracting one.)
3174 if (!isAllOnesConstant(Mask->getOperand(1)))
3175 return false;
3176 // Match `1 << nbits`. Might be truncated. Must only have one use!
3177 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3178 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3179 return false;
3180 if (!isOneConstant(M0->getOperand(0)))
3181 return false;
3182 NBits = M0->getOperand(1);
3183 return true;
3186 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3187 V = peekThroughOneUseTruncation(V);
3188 return CurDAG->MaskedValueIsAllOnes(
3189 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3190 NVT.getSizeInBits()));
3193 // b) x & ~(-1 << nbits)
3194 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3195 &NBits](SDValue Mask) -> bool {
3196 // Match `~()`. Must only have one use!
3197 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3198 return false;
3199 // The -1 only has to be all-ones for the final Node's NVT.
3200 if (!isAllOnes(Mask->getOperand(1)))
3201 return false;
3202 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3203 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3204 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3205 return false;
3206 // The -1 only has to be all-ones for the final Node's NVT.
3207 if (!isAllOnes(M0->getOperand(0)))
3208 return false;
3209 NBits = M0->getOperand(1);
3210 return true;
3213 // Match potentially-truncated (bitwidth - y)
3214 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3215 unsigned Bitwidth) {
3216 // Skip over a truncate of the shift amount.
3217 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3218 ShiftAmt = ShiftAmt.getOperand(0);
3219 // The trunc should have been the only user of the real shift amount.
3220 if (!checkOneUse(ShiftAmt))
3221 return false;
3223 // Match the shift amount as: (bitwidth - y). It should go away, too.
3224 if (ShiftAmt.getOpcode() != ISD::SUB)
3225 return false;
3226 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3227 if (!V0 || V0->getZExtValue() != Bitwidth)
3228 return false;
3229 NBits = ShiftAmt.getOperand(1);
3230 return true;
3233 // c) x & (-1 >> (32 - y))
3234 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3235 matchShiftAmt](SDValue Mask) -> bool {
3236 // The mask itself may be truncated.
3237 Mask = peekThroughOneUseTruncation(Mask);
3238 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3239 // Match `l>>`. Must only have one use!
3240 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3241 return false;
3242 // We should be shifting a truly all-ones constant.
3243 if (!isAllOnesConstant(Mask.getOperand(0)))
3244 return false;
3245 SDValue M1 = Mask.getOperand(1);
3246 // The shift amount should not be used externally.
3247 if (!checkOneUse(M1))
3248 return false;
3249 return matchShiftAmt(M1, Bitwidth);
3252 SDValue X;
3254 // d) x << (32 - y) >> (32 - y)
3255 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3256 &X](SDNode *Node) -> bool {
3257 if (Node->getOpcode() != ISD::SRL)
3258 return false;
3259 SDValue N0 = Node->getOperand(0);
3260 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3261 return false;
3262 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3263 SDValue N1 = Node->getOperand(1);
3264 SDValue N01 = N0->getOperand(1);
3265 // Both of the shifts must be by the exact same value.
3266 // There should not be any uses of the shift amount outside of the pattern.
3267 if (N1 != N01 || !checkTwoUse(N1))
3268 return false;
3269 if (!matchShiftAmt(N1, Bitwidth))
3270 return false;
3271 X = N0->getOperand(0);
3272 return true;
3275 auto matchLowBitMask = [matchPatternA, matchPatternB,
3276 matchPatternC](SDValue Mask) -> bool {
3277 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3280 if (Node->getOpcode() == ISD::AND) {
3281 X = Node->getOperand(0);
3282 SDValue Mask = Node->getOperand(1);
3284 if (matchLowBitMask(Mask)) {
3285 // Great.
3286 } else {
3287 std::swap(X, Mask);
3288 if (!matchLowBitMask(Mask))
3289 return false;
3291 } else if (!matchPatternD(Node))
3292 return false;
3294 SDLoc DL(Node);
3296 // Truncate the shift amount.
3297 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3298 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3300 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3301 // All the other bits are undefined, we do not care about them.
3302 SDValue ImplDef = SDValue(
3303 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3304 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3305 NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
3306 NBits);
3307 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3309 if (Subtarget->hasBMI2()) {
3310 // Great, just emit the BZHI.
3311 if (NVT != MVT::i32) {
3312 // But have to place the bit count into the wide-enough register first.
3313 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3314 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3317 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3318 ReplaceNode(Node, Extract.getNode());
3319 SelectCode(Extract.getNode());
3320 return true;
3323 // Else, if we do *NOT* have BMI2, let's find out if 'X' is
3324 // *logically* shifted (potentially with a one-use trunc in between),
3325 // and the truncation was the only use of the shift,
3326 // and if so look past the one-use truncation.
3328 SDValue RealX = peekThroughOneUseTruncation(X);
3329 // FIXME: only if the shift is one-use?
3330 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3331 X = RealX;
3334 MVT XVT = X.getSimpleValueType();
3336 // Else, emitting BEXTR requires one more step.
3337 // The 'control' of BEXTR has the pattern of:
3338 // [15...8 bit][ 7...0 bit] location
3339 // [ bit count][ shift] name
3340 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
3342 // Shift NBits left by 8 bits, thus producing 'control'.
3343 // This leaves the low 8 bits zero.
3344 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3345 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3346 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3348 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3349 // FIXME: only if the shift is one-use?
3350 if (X.getOpcode() == ISD::SRL) {
3351 SDValue ShiftAmt = X.getOperand(1);
3352 X = X.getOperand(0);
3354 assert(ShiftAmt.getValueType() == MVT::i8 &&
3355 "Expected shift amount to be i8");
3357 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3358 // We could zext to i16 in some form, but we intentionally don't do that.
3359 SDValue OrigShiftAmt = ShiftAmt;
3360 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3361 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3363 // And now 'or' these low 8 bits of shift amount into the 'control'.
3364 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3365 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3368 // But have to place the 'control' into the wide-enough register first.
3369 if (XVT != MVT::i32) {
3370 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3371 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3374 // And finally, form the BEXTR itself.
3375 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3377 // The 'X' was originally truncated. Do that now.
3378 if (XVT != NVT) {
3379 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3380 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3383 ReplaceNode(Node, Extract.getNode());
3384 SelectCode(Extract.getNode());
3386 return true;
3389 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3390 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3391 MVT NVT = Node->getSimpleValueType(0);
3392 SDLoc dl(Node);
3394 SDValue N0 = Node->getOperand(0);
3395 SDValue N1 = Node->getOperand(1);
3397 // If we have TBM we can use an immediate for the control. If we have BMI
3398 // we should only do this if the BEXTR instruction is implemented well.
3399 // Otherwise moving the control into a register makes this more costly.
3400 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3401 // hoisting the move immediate would make it worthwhile with a less optimal
3402 // BEXTR?
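// For example, with TBM the control can be encoded directly as BEXTRI's
// immediate, whereas plain BMI BEXTR takes the control in a register and
// therefore needs an extra MOV32ri to materialize it (see below).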
3403 if (!Subtarget->hasTBM() &&
3404 !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
3405 return nullptr;
3407 // Must have a shift right.
3408 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3409 return nullptr;
3411 // Shift can't have additional users.
3412 if (!N0->hasOneUse())
3413 return nullptr;
3415 // Only supported for 32 and 64 bits.
3416 if (NVT != MVT::i32 && NVT != MVT::i64)
3417 return nullptr;
3419 // Shift amount and RHS of and must be constant.
3420 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3421 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3422 if (!MaskCst || !ShiftCst)
3423 return nullptr;
3425 // And RHS must be a mask.
3426 uint64_t Mask = MaskCst->getZExtValue();
3427 if (!isMask_64(Mask))
3428 return nullptr;
3430 uint64_t Shift = ShiftCst->getZExtValue();
3431 uint64_t MaskSize = countPopulation(Mask);
3433 // Don't interfere with something that can be handled by extracting AH.
3434 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3435 if (Shift == 8 && MaskSize == 8)
3436 return nullptr;
3438 // Make sure we are only using bits that were in the original value, not
3439 // shifted in.
3440 if (Shift + MaskSize > NVT.getSizeInBits())
3441 return nullptr;
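// For example, (x >> 4) & 0xFF gives Shift = 4 and MaskSize = 8, so the
// control immediate is 0x0804 (length in bits 15:8, start in bits 7:0).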
3443 SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3444 unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3445 unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3447 // BMI requires the immediate to be placed in a register.
3448 if (!Subtarget->hasTBM()) {
3449 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3450 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3451 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3452 New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
3455 MachineSDNode *NewNode;
3456 SDValue Input = N0->getOperand(0);
3457 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3458 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3459 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
3460 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3461 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3462 // Update the chain.
3463 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3464 // Record the mem-refs
3465 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3466 } else {
3467 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
3470 return NewNode;
3473 // Emit a PCMPISTR(I/M) instruction.
3474 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3475 bool MayFoldLoad, const SDLoc &dl,
3476 MVT VT, SDNode *Node) {
3477 SDValue N0 = Node->getOperand(0);
3478 SDValue N1 = Node->getOperand(1);
3479 SDValue Imm = Node->getOperand(2);
3480 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3481 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3483 // Try to fold a load. No need to check alignment.
3484 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3485 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3486 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3487 N1.getOperand(0) };
3488 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3489 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3490 // Update the chain.
3491 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3492 // Record the mem-refs
3493 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3494 return CNode;
3497 SDValue Ops[] = { N0, N1, Imm };
3498 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3499 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3500 return CNode;
3503 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3504 // to emit a second instruction after this one. This is needed since we have two
3505 // copyToReg nodes glued before this and we need to continue that glue through.
3506 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3507 bool MayFoldLoad, const SDLoc &dl,
3508 MVT VT, SDNode *Node,
3509 SDValue &InFlag) {
3510 SDValue N0 = Node->getOperand(0);
3511 SDValue N2 = Node->getOperand(2);
3512 SDValue Imm = Node->getOperand(4);
3513 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3514 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3516 // Try to fold a load. No need to check alignment.
3517 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3518 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3519 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3520 N2.getOperand(0), InFlag };
3521 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3522 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3523 InFlag = SDValue(CNode, 3);
3524 // Update the chain.
3525 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3526 // Record the mem-refs
3527 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3528 return CNode;
3531 SDValue Ops[] = { N0, N2, Imm, InFlag };
3532 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3533 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3534 InFlag = SDValue(CNode, 2);
3535 return CNode;
3538 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3539 EVT VT = N->getValueType(0);
3541 // Only handle scalar shifts.
3542 if (VT.isVector())
3543 return false;
3545 // Narrower shifts only mask to 5 bits in hardware.
3546 unsigned Size = VT == MVT::i64 ? 64 : 32;
3548 SDValue OrigShiftAmt = N->getOperand(1);
3549 SDValue ShiftAmt = OrigShiftAmt;
3550 SDLoc DL(N);
3552 // Skip over a truncate of the shift amount.
3553 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3554 ShiftAmt = ShiftAmt->getOperand(0);
3556 // This function is called after X86DAGToDAGISel::matchBitExtract(),
3557 // so we are not worried about breaking a BZHI/BEXTR pattern here.
3559 SDValue NewShiftAmt;
3560 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3561 SDValue Add0 = ShiftAmt->getOperand(0);
3562 SDValue Add1 = ShiftAmt->getOperand(1);
3563 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3564 // to avoid the ADD/SUB.
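// E.g. on i32, (shl x, (add y, 32)) can simply become (shl x, y) because
// the hardware masks the shift amount with Size - 1 anyway.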
3565 if (isa<ConstantSDNode>(Add1) &&
3566 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3567 NewShiftAmt = Add0;
3568 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3569 // generate a NEG instead of a SUB of a constant.
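// E.g. on i32, (shl x, (sub 32, y)) becomes (shl x, (sub 0, y)): 32 is
// congruent to 0 modulo the hardware's shift-amount mask.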
3570 } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3571 isa<ConstantSDNode>(Add0) &&
3572 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3573 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3574 // Insert a negate op.
3575 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3576 // that uses it that's not a shift.
3577 EVT SubVT = ShiftAmt.getValueType();
3578 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3579 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3580 NewShiftAmt = Neg;
3582 // Insert these operands into a valid topological order so they can
3583 // get selected independently.
3584 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3585 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3586 } else
3587 return false;
3588 } else
3589 return false;
3591 if (NewShiftAmt.getValueType() != MVT::i8) {
3592 // Need to truncate the shift amount.
3593 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3594 // Add to a correct topological ordering.
3595 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3598 // Insert a new mask to keep the shift amount legal. This should be removed
3599 // by isel patterns.
3600 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3601 CurDAG->getConstant(Size - 1, DL, MVT::i8));
3602 // Place in a correct topological ordering.
3603 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3605 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3606 NewShiftAmt);
3607 if (UpdatedNode != N) {
3608 // If we found an existing node, we should replace ourselves with that node
3609 // and wait for it to be selected after its other users.
3610 ReplaceNode(N, UpdatedNode);
3611 return true;
3614 // If the original shift amount is now dead, delete it so that we don't run
3615 // it through isel.
3616 if (OrigShiftAmt.getNode()->use_empty())
3617 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3619 // Now that we've optimized the shift amount, defer to normal isel to get
3620 // load folding and legacy vs BMI2 selection without repeating it here.
3621 SelectCode(N);
3622 return true;
3625 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3626 MVT NVT = N->getSimpleValueType(0);
3627 unsigned Opcode = N->getOpcode();
3628 SDLoc dl(N);
3630 // For operations of the form (x << C1) op C2, check if we can use a smaller
3631 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
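// For example, on i32 ((x << 8) ^ 0x3F00) can become ((x ^ 0x3F) << 8),
// letting the XOR use a sign-extended imm8 instead of a full imm32.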
3632 SDValue Shift = N->getOperand(0);
3633 SDValue N1 = N->getOperand(1);
3635 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3636 if (!Cst)
3637 return false;
3639 int64_t Val = Cst->getSExtValue();
3641 // If we have an any_extend feeding the AND, look through it to see if there
3642 // is a shift behind it. But only if the AND doesn't use the extended bits.
3643 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3644 bool FoundAnyExtend = false;
3645 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3646 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3647 isUInt<32>(Val)) {
3648 FoundAnyExtend = true;
3649 Shift = Shift.getOperand(0);
3652 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3653 return false;
3655 // i8 is unshrinkable, i16 should be promoted to i32.
3656 if (NVT != MVT::i32 && NVT != MVT::i64)
3657 return false;
3659 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3660 if (!ShlCst)
3661 return false;
3663 uint64_t ShAmt = ShlCst->getZExtValue();
3665 // Make sure that we don't change the operation by removing bits.
3666 // This only matters for OR and XOR, AND is unaffected.
3667 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3668 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3669 return false;
3671 // Check the minimum bitwidth for the new constant.
3672 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3673 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3674 if (Opcode == ISD::AND) {
3675 // AND32ri is the same as AND64ri32 with zext imm.
3676 // Try this before sign extended immediates below.
3677 ShiftedVal = (uint64_t)Val >> ShAmt;
3678 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3679 return true;
3680 // Also swap order when the AND can become MOVZX.
3681 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3682 return true;
3684 ShiftedVal = Val >> ShAmt;
3685 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3686 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3687 return true;
3688 if (Opcode != ISD::AND) {
3689 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3690 ShiftedVal = (uint64_t)Val >> ShAmt;
3691 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3692 return true;
3694 return false;
3697 int64_t ShiftedVal;
3698 if (!CanShrinkImmediate(ShiftedVal))
3699 return false;
3701 // Ok, we can reorder to get a smaller immediate.
3703 // But it's possible the original immediate allowed an AND to become MOVZX.
3704 // We do this check late so that the MaskedValueIsZero call happens as late
3705 // as possible.
3706 if (Opcode == ISD::AND) {
3707 // Find the smallest zext this could possibly be.
3708 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3709 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3711 // Figure out which bits need to be zero to achieve that mask.
3712 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3713 ZExtWidth);
3714 NeededMask &= ~Cst->getAPIntValue();
3716 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3717 return false;
3720 SDValue X = Shift.getOperand(0);
3721 if (FoundAnyExtend) {
3722 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3723 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3724 X = NewX;
3727 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3728 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3729 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3730 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3731 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3732 Shift.getOperand(1));
3733 ReplaceNode(N, NewSHL.getNode());
3734 SelectCode(NewSHL.getNode());
3735 return true;
3738 /// If the high bits of an 'and' operand are known zero, try setting the
3739 /// high bits of an 'and' constant operand to produce a smaller encoding by
3740 /// creating a small, sign-extended negative immediate rather than a large
3741 /// positive one. This reverses a transform in SimplifyDemandedBits that
3742 /// shrinks mask constants by clearing bits. There is also a possibility that
3743 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3744 /// case, just replace the 'and'. Return 'true' if the node is replaced.
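/// For example, with a 32-bit 'and' by 0x0000FFF0 whose other operand has its
/// upper 16 bits known zero, using the mask 0xFFFFFFF0 (-16) instead allows a
/// sign-extended imm8 encoding rather than a full imm32.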
3745 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3746 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3747 // have immediate operands.
3748 MVT VT = And->getSimpleValueType(0);
3749 if (VT != MVT::i32 && VT != MVT::i64)
3750 return false;
3752 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3753 if (!And1C)
3754 return false;
3756 // Bail out if the mask constant is already negative. It can't shrink any further.
3757 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3758 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3759 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3760 // are negative too.
3761 APInt MaskVal = And1C->getAPIntValue();
3762 unsigned MaskLZ = MaskVal.countLeadingZeros();
3763 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3764 return false;
3766 // Don't extend into the upper 32 bits of a 64 bit mask.
3767 if (VT == MVT::i64 && MaskLZ >= 32) {
3768 MaskLZ -= 32;
3769 MaskVal = MaskVal.trunc(32);
3772 SDValue And0 = And->getOperand(0);
3773 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3774 APInt NegMaskVal = MaskVal | HighZeros;
3776 // If a negative constant would not allow a smaller encoding, there's no need
3777 // to continue. Only change the constant when we know it's a win.
3778 unsigned MinWidth = NegMaskVal.getMinSignedBits();
3779 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3780 return false;
3782 // Extend masks if we truncated above.
3783 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3784 NegMaskVal = NegMaskVal.zext(64);
3785 HighZeros = HighZeros.zext(64);
3788 // The variable operand must be all zeros in the top bits to allow using the
3789 // new, negative constant as the mask.
3790 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3791 return false;
3793 // Check if the mask is -1. In that case, this is an unnecessary instruction
3794 // that escaped earlier analysis.
3795 if (NegMaskVal.isAllOnesValue()) {
3796 ReplaceNode(And, And0.getNode());
3797 return true;
3800 // A negative mask allows a smaller encoding. Create a new 'and' node.
3801 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3802 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3803 ReplaceNode(And, NewAnd.getNode());
3804 SelectCode(NewAnd.getNode());
3805 return true;
3808 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3809 bool FoldedBCast, bool Masked) {
3810 if (Masked) {
3811 if (FoldedLoad) {
3812 switch (TestVT.SimpleTy) {
3813 default: llvm_unreachable("Unexpected VT!");
3814 case MVT::v16i8:
3815 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3816 case MVT::v8i16:
3817 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3818 case MVT::v4i32:
3819 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3820 case MVT::v2i64:
3821 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3822 case MVT::v32i8:
3823 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3824 case MVT::v16i16:
3825 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3826 case MVT::v8i32:
3827 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3828 case MVT::v4i64:
3829 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3830 case MVT::v64i8:
3831 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3832 case MVT::v32i16:
3833 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3834 case MVT::v16i32:
3835 return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3836 case MVT::v8i64:
3837 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3841 if (FoldedBCast) {
3842 switch (TestVT.SimpleTy) {
3843 default: llvm_unreachable("Unexpected VT!");
3844 case MVT::v4i32:
3845 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3846 case MVT::v2i64:
3847 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3848 case MVT::v8i32:
3849 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3850 case MVT::v4i64:
3851 return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3852 case MVT::v16i32:
3853 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3854 case MVT::v8i64:
3855 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
3859 switch (TestVT.SimpleTy) {
3860 default: llvm_unreachable("Unexpected VT!");
3861 case MVT::v16i8:
3862 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
3863 case MVT::v8i16:
3864 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
3865 case MVT::v4i32:
3866 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
3867 case MVT::v2i64:
3868 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
3869 case MVT::v32i8:
3870 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
3871 case MVT::v16i16:
3872 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
3873 case MVT::v8i32:
3874 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
3875 case MVT::v4i64:
3876 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
3877 case MVT::v64i8:
3878 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
3879 case MVT::v32i16:
3880 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
3881 case MVT::v16i32:
3882 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
3883 case MVT::v8i64:
3884 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
3888 if (FoldedLoad) {
3889 switch (TestVT.SimpleTy) {
3890 default: llvm_unreachable("Unexpected VT!");
3891 case MVT::v16i8:
3892 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
3893 case MVT::v8i16:
3894 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
3895 case MVT::v4i32:
3896 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
3897 case MVT::v2i64:
3898 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
3899 case MVT::v32i8:
3900 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
3901 case MVT::v16i16:
3902 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
3903 case MVT::v8i32:
3904 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
3905 case MVT::v4i64:
3906 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
3907 case MVT::v64i8:
3908 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
3909 case MVT::v32i16:
3910 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
3911 case MVT::v16i32:
3912 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
3913 case MVT::v8i64:
3914 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
3918 if (FoldedBCast) {
3919 switch (TestVT.SimpleTy) {
3920 default: llvm_unreachable("Unexpected VT!");
3921 case MVT::v4i32:
3922 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
3923 case MVT::v2i64:
3924 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
3925 case MVT::v8i32:
3926 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
3927 case MVT::v4i64:
3928 return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
3929 case MVT::v16i32:
3930 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
3931 case MVT::v8i64:
3932 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
3936 switch (TestVT.SimpleTy) {
3937 default: llvm_unreachable("Unexpected VT!");
3938 case MVT::v16i8:
3939 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
3940 case MVT::v8i16:
3941 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
3942 case MVT::v4i32:
3943 return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
3944 case MVT::v2i64:
3945 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
3946 case MVT::v32i8:
3947 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
3948 case MVT::v16i16:
3949 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
3950 case MVT::v8i32:
3951 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
3952 case MVT::v4i64:
3953 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
3954 case MVT::v64i8:
3955 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
3956 case MVT::v32i16:
3957 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
3958 case MVT::v16i32:
3959 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
3960 case MVT::v8i64:
3961 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
3965 // Try to create VPTESTM instruction. If InMask is not null, it will be used
3966 // to form a masked operation.
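// For example, a v4i1 (setne (and X, Y), 0) over v4i32 inputs becomes
// VPTESTMD, while the seteq form becomes VPTESTNMD; a surrounding i1-vector
// AND can be folded in as the instruction's mask operand.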
3967 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
3968 SDValue InMask) {
3969 assert(Subtarget->hasAVX512() && "Expected AVX512!");
3970 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
3971 "Unexpected VT!");
3973 // Look for equal and not equal compares.
3974 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
3975 if (CC != ISD::SETEQ && CC != ISD::SETNE)
3976 return false;
3978 // See if we're comparing against zero. This should have been canonicalized
3979 // to RHS during lowering.
3980 if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
3981 return false;
3983 SDValue N0 = Setcc.getOperand(0);
3985 MVT CmpVT = N0.getSimpleValueType();
3986 MVT CmpSVT = CmpVT.getVectorElementType();
3988 // Start with both operands the same. We'll try to refine this.
3989 SDValue Src0 = N0;
3990 SDValue Src1 = N0;
3993 // Look through single use bitcasts.
3994 SDValue N0Temp = N0;
3995 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
3996 N0Temp = N0.getOperand(0);
3998 // Look for single use AND.
3999 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4000 Src0 = N0Temp.getOperand(0);
4001 Src1 = N0Temp.getOperand(1);
4005 // Without VLX we need to widen the compare to 512 bits.
4006 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4008 // We can only fold loads if the sources are unique.
4009 bool CanFoldLoads = Src0 != Src1;
4011 // Try to fold loads unless we need to widen.
4012 bool FoldedLoad = false;
4013 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4014 if (!Widen && CanFoldLoads) {
4015 Load = Src1;
4016 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4017 Tmp4);
4018 if (!FoldedLoad) {
4019 // AND is commutative.
4020 Load = Src0;
4021 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4022 Tmp3, Tmp4);
4023 if (FoldedLoad)
4024 std::swap(Src0, Src1);
4028 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4029 // Look through single use bitcasts.
4030 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
4031 Src = Src.getOperand(0);
4033 if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
4034 Parent = Src.getNode();
4035 Src = Src.getOperand(0);
4036 if (Src.getSimpleValueType() == CmpSVT)
4037 return Src;
4040 return SDValue();
4043 // If we didn't fold a load, try to match broadcast. No widening limitation
4044 // for this. But only 32 and 64 bit types are supported.
4045 bool FoldedBCast = false;
4046 if (!FoldedLoad && CanFoldLoads &&
4047 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4048 SDNode *ParentNode = nullptr;
4049 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4050 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4051 Tmp1, Tmp2, Tmp3, Tmp4);
4054 // Try the other operand.
4055 if (!FoldedBCast) {
4056 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4057 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4058 Tmp1, Tmp2, Tmp3, Tmp4);
4059 if (FoldedBCast)
4060 std::swap(Src0, Src1);
4065 auto getMaskRC = [](MVT MaskVT) {
4066 switch (MaskVT.SimpleTy) {
4067 default: llvm_unreachable("Unexpected VT!");
4068 case MVT::v2i1: return X86::VK2RegClassID;
4069 case MVT::v4i1: return X86::VK4RegClassID;
4070 case MVT::v8i1: return X86::VK8RegClassID;
4071 case MVT::v16i1: return X86::VK16RegClassID;
4072 case MVT::v32i1: return X86::VK32RegClassID;
4073 case MVT::v64i1: return X86::VK64RegClassID;
4077 bool IsMasked = InMask.getNode() != nullptr;
4079 SDLoc dl(Root);
4081 MVT ResVT = Setcc.getSimpleValueType();
4082 MVT MaskVT = ResVT;
4083 if (Widen) {
4084 // Widen the inputs using insert_subreg or copy_to_regclass.
4085 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4086 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4087 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4088 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4089 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4090 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4091 CmpVT), 0);
4092 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4094 assert(!FoldedLoad && "Shouldn't have folded the load");
4095 if (!FoldedBCast)
4096 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4098 if (IsMasked) {
4099 // Widen the mask.
4100 unsigned RegClass = getMaskRC(MaskVT);
4101 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4102 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4103 dl, MaskVT, InMask, RC), 0);
4107 bool IsTestN = CC == ISD::SETEQ;
4108 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4109 IsMasked);
4111 MachineSDNode *CNode;
4112 if (FoldedLoad || FoldedBCast) {
4113 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4115 if (IsMasked) {
4116 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4117 Load.getOperand(0) };
4118 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4119 } else {
4120 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4121 Load.getOperand(0) };
4122 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4125 // Update the chain.
4126 ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4127 // Record the mem-refs
4128 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
4129 } else {
4130 if (IsMasked)
4131 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4132 else
4133 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4136 // If we widened, we need to shrink the mask VT.
4137 if (Widen) {
4138 unsigned RegClass = getMaskRC(ResVT);
4139 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4140 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4141 dl, ResVT, SDValue(CNode, 0), RC);
4144 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4145 CurDAG->RemoveDeadNode(Root);
4146 return true;
4149 void X86DAGToDAGISel::Select(SDNode *Node) {
4150 MVT NVT = Node->getSimpleValueType(0);
4151 unsigned Opcode = Node->getOpcode();
4152 SDLoc dl(Node);
4154 if (Node->isMachineOpcode()) {
4155 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4156 Node->setNodeId(-1);
4157 return; // Already selected.
4160 switch (Opcode) {
4161 default: break;
4162 case ISD::INTRINSIC_VOID: {
4163 unsigned IntNo = Node->getConstantOperandVal(1);
4164 switch (IntNo) {
4165 default: break;
4166 case Intrinsic::x86_sse3_monitor:
4167 case Intrinsic::x86_monitorx:
4168 case Intrinsic::x86_clzero: {
4169 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4171 unsigned Opc = 0;
4172 switch (IntNo) {
4173 case Intrinsic::x86_sse3_monitor:
4174 if (!Subtarget->hasSSE3())
4175 break;
4176 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4177 break;
4178 case Intrinsic::x86_monitorx:
4179 if (!Subtarget->hasMWAITX())
4180 break;
4181 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4182 break;
4183 case Intrinsic::x86_clzero:
4184 if (!Subtarget->hasCLZERO())
4185 break;
4186 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4187 break;
4190 if (Opc) {
4191 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4192 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4193 Node->getOperand(2), SDValue());
4194 SDValue InFlag = Chain.getValue(1);
4196 if (IntNo == Intrinsic::x86_sse3_monitor ||
4197 IntNo == Intrinsic::x86_monitorx) {
4198 // Copy the other two operands to ECX and EDX.
4199 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4200 InFlag);
4201 InFlag = Chain.getValue(1);
4202 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4203 InFlag);
4204 InFlag = Chain.getValue(1);
4207 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4208 { Chain, InFlag});
4209 ReplaceNode(Node, CNode);
4210 return;
4215 break;
4217 case ISD::BRIND: {
4218 if (Subtarget->isTargetNaCl())
4219 // NaCl has its own pass where jmp %r32 instructions are converted to
4220 // jmp %r64. We leave the instruction alone.
4221 break;
4222 if (Subtarget->isTarget64BitILP32()) {
4223 // Converts a 32-bit register to a 64-bit, zero-extended version of
4224 // it. This is needed because x86-64 can do many things, but jmp %r32
4225 // ain't one of them.
4226 const SDValue &Target = Node->getOperand(1);
4227 assert(Target.getSimpleValueType() == llvm::MVT::i32);
4228 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4229 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4230 Node->getOperand(0), ZextTarget);
4231 ReplaceNode(Node, Brind.getNode());
4232 SelectCode(ZextTarget.getNode());
4233 SelectCode(Brind.getNode());
4234 return;
4236 break;
4238 case X86ISD::GlobalBaseReg:
4239 ReplaceNode(Node, getGlobalBaseReg());
4240 return;
4242 case ISD::BITCAST:
4243 // Just drop all 128/256/512-bit bitcasts.
4244 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4245 NVT == MVT::f128) {
4246 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4247 CurDAG->RemoveDeadNode(Node);
4248 return;
4250 break;
4252 case ISD::VSELECT: {
4253 // Replace VSELECT with non-mask conditions with BLENDV.
4254 if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4255 break;
4257 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4258 SDValue Blendv = CurDAG->getNode(
4259 X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4260 Node->getOperand(1), Node->getOperand(2));
4261 ReplaceNode(Node, Blendv.getNode());
4262 SelectCode(Blendv.getNode());
4263 // We already called ReplaceUses.
4264 return;
4267 case ISD::SRL:
4268 if (matchBitExtract(Node))
4269 return;
4270 LLVM_FALLTHROUGH;
4271 case ISD::SRA:
4272 case ISD::SHL:
4273 if (tryShiftAmountMod(Node))
4274 return;
4275 break;
4277 case ISD::AND:
4278 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4279 // Try to form a masked VPTESTM. Operands can be in either order.
4280 SDValue N0 = Node->getOperand(0);
4281 SDValue N1 = Node->getOperand(1);
4282 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4283 tryVPTESTM(Node, N0, N1))
4284 return;
4285 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4286 tryVPTESTM(Node, N1, N0))
4287 return;
4290 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4291 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4292 CurDAG->RemoveDeadNode(Node);
4293 return;
4295 if (matchBitExtract(Node))
4296 return;
4297 if (AndImmShrink && shrinkAndImmediate(Node))
4298 return;
4300 LLVM_FALLTHROUGH;
4301 case ISD::OR:
4302 case ISD::XOR:
4303 if (tryShrinkShlLogicImm(Node))
4304 return;
4306 LLVM_FALLTHROUGH;
4307 case ISD::ADD:
4308 case ISD::SUB: {
4309 // Try to avoid folding immediates with multiple uses for optsize.
4310 // This code tries to select to register form directly to avoid going
4311 // through the isel table which might fold the immediate. We can't change
4312 // the add/sub/and/or/xor-with-immediate patterns in the tablegen files
4313 // to check the immediate use count without making the patterns
4314 // unavailable to the fast-isel table.
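// For example, when the same imm32 is added to several values under optsize,
// materializing it once in a register and using ADD32rr for each use is
// typically smaller than repeating the 4-byte immediate in each ADD32ri.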
4315 if (!OptForSize)
4316 break;
4318 // Only handle i8/i16/i32/i64.
4319 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4320 break;
4322 SDValue N0 = Node->getOperand(0);
4323 SDValue N1 = Node->getOperand(1);
4325 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4326 if (!Cst)
4327 break;
4329 int64_t Val = Cst->getSExtValue();
4331 // Make sure it's an immediate that is considered foldable.
4332 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4333 if (!isInt<8>(Val) && !isInt<32>(Val))
4334 break;
4336 // Check if we should avoid folding this immediate.
4337 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4338 break;
4340 // We should not fold the immediate. So we need a register form instead.
4341 unsigned ROpc, MOpc;
4342 switch (NVT.SimpleTy) {
4343 default: llvm_unreachable("Unexpected VT!");
4344 case MVT::i8:
4345 switch (Opcode) {
4346 default: llvm_unreachable("Unexpected opcode!");
4347 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4348 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4349 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4350 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
4351 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4353 break;
4354 case MVT::i16:
4355 switch (Opcode) {
4356 default: llvm_unreachable("Unexpected opcode!");
4357 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4358 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4359 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4360 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
4361 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4363 break;
4364 case MVT::i32:
4365 switch (Opcode) {
4366 default: llvm_unreachable("Unexpected opcode!");
4367 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4368 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4369 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4370 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
4371 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4373 break;
4374 case MVT::i64:
4375 switch (Opcode) {
4376 default: llvm_unreachable("Unexpected opcode!");
4377 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4378 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4379 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4380 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
4381 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
4383 break;
4386 // Ok, this is an AND/OR/XOR/ADD/SUB with a constant.
4388 // If this is not a subtract, we can still try to fold a load.
4389 if (Opcode != ISD::SUB) {
4390 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4391 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4392 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4393 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4394 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4395 // Update the chain.
4396 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4397 // Record the mem-refs
4398 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4399 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4400 CurDAG->RemoveDeadNode(Node);
4401 return;
4405 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4406 return;
4409 case X86ISD::SMUL:
4410 // i16/i32/i64 are handled with isel patterns.
4411 if (NVT != MVT::i8)
4412 break;
4413 LLVM_FALLTHROUGH;
4414 case X86ISD::UMUL: {
4415 SDValue N0 = Node->getOperand(0);
4416 SDValue N1 = Node->getOperand(1);
4418 unsigned LoReg, ROpc, MOpc;
4419 switch (NVT.SimpleTy) {
4420 default: llvm_unreachable("Unsupported VT!");
4421 case MVT::i8:
4422 LoReg = X86::AL;
4423 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4424 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4425 break;
4426 case MVT::i16:
4427 LoReg = X86::AX;
4428 ROpc = X86::MUL16r;
4429 MOpc = X86::MUL16m;
4430 break;
4431 case MVT::i32:
4432 LoReg = X86::EAX;
4433 ROpc = X86::MUL32r;
4434 MOpc = X86::MUL32m;
4435 break;
4436 case MVT::i64:
4437 LoReg = X86::RAX;
4438 ROpc = X86::MUL64r;
4439 MOpc = X86::MUL64m;
4440 break;
4443 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4444 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4445 // Multiply is commutative.
4446 if (!FoldedLoad) {
4447 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4448 if (FoldedLoad)
4449 std::swap(N0, N1);
4452 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4453 N0, SDValue()).getValue(1);
4455 MachineSDNode *CNode;
4456 if (FoldedLoad) {
4457 // i16/i32/i64 use an instruction that produces a low and high result even
4458 // though only the low result is used.
4459 SDVTList VTs;
4460 if (NVT == MVT::i8)
4461 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4462 else
4463 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4465 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4466 InFlag };
4467 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4469 // Update the chain.
4470 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4471 // Record the mem-refs
4472 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4473 } else {
4474 // i16/i32/i64 use an instruction that produces a low and high result even
4475 // though only the low result is used.
4476 SDVTList VTs;
4477 if (NVT == MVT::i8)
4478 VTs = CurDAG->getVTList(NVT, MVT::i32);
4479 else
4480 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4482 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4485 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4486 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4487 CurDAG->RemoveDeadNode(Node);
4488 return;
4491 case ISD::SMUL_LOHI:
4492 case ISD::UMUL_LOHI: {
4493 SDValue N0 = Node->getOperand(0);
4494 SDValue N1 = Node->getOperand(1);
4496 unsigned Opc, MOpc;
4497 bool isSigned = Opcode == ISD::SMUL_LOHI;
4498 if (!isSigned) {
4499 switch (NVT.SimpleTy) {
4500 default: llvm_unreachable("Unsupported VT!");
4501 case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4502 case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4504 } else {
4505 switch (NVT.SimpleTy) {
4506 default: llvm_unreachable("Unsupported VT!");
4507 case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4508 case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4512 unsigned SrcReg, LoReg, HiReg;
4513 switch (Opc) {
4514 default: llvm_unreachable("Unknown MUL opcode!");
4515 case X86::IMUL32r:
4516 case X86::MUL32r:
4517 SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4518 break;
4519 case X86::IMUL64r:
4520 case X86::MUL64r:
4521 SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4522 break;
4525 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4526 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4527 // Multiply is commutative.
4528 if (!foldedLoad) {
4529 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4530 if (foldedLoad)
4531 std::swap(N0, N1);
4534 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4535 N0, SDValue()).getValue(1);
4536 if (foldedLoad) {
4537 SDValue Chain;
4538 MachineSDNode *CNode = nullptr;
4539 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4540 InFlag };
4541 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4542 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4543 Chain = SDValue(CNode, 0);
4544 InFlag = SDValue(CNode, 1);
4546 // Update the chain.
4547 ReplaceUses(N1.getValue(1), Chain);
4548 // Record the mem-refs
4549 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4550 } else {
4551 SDValue Ops[] = { N1, InFlag };
4552 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4553 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4554 InFlag = SDValue(CNode, 0);
4557 // Copy the low half of the result, if it is needed.
4558 if (!SDValue(Node, 0).use_empty()) {
4559 assert(LoReg && "Register for low half is not defined!");
4560 SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4561 NVT, InFlag);
4562 InFlag = ResLo.getValue(2);
4563 ReplaceUses(SDValue(Node, 0), ResLo);
4564 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4565 dbgs() << '\n');
4567 // Copy the high half of the result, if it is needed.
4568 if (!SDValue(Node, 1).use_empty()) {
4569 assert(HiReg && "Register for high half is not defined!");
4570 SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4571 NVT, InFlag);
4572 InFlag = ResHi.getValue(2);
4573 ReplaceUses(SDValue(Node, 1), ResHi);
4574 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4575 dbgs() << '\n');
4578 CurDAG->RemoveDeadNode(Node);
4579 return;
4582 case ISD::SDIVREM:
4583 case ISD::UDIVREM: {
4584 SDValue N0 = Node->getOperand(0);
4585 SDValue N1 = Node->getOperand(1);
4587 unsigned Opc, MOpc;
4588 bool isSigned = Opcode == ISD::SDIVREM;
4589 if (!isSigned) {
4590 switch (NVT.SimpleTy) {
4591 default: llvm_unreachable("Unsupported VT!");
4592 case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4593 case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4594 case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4595 case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4597 } else {
4598 switch (NVT.SimpleTy) {
4599 default: llvm_unreachable("Unsupported VT!");
4600 case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4601 case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4602 case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4603 case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4607 unsigned LoReg, HiReg, ClrReg;
4608 unsigned SExtOpcode;
4609 switch (NVT.SimpleTy) {
4610 default: llvm_unreachable("Unsupported VT!");
4611 case MVT::i8:
4612 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4613 SExtOpcode = X86::CBW;
4614 break;
4615 case MVT::i16:
4616 LoReg = X86::AX; HiReg = X86::DX;
4617 ClrReg = X86::DX;
4618 SExtOpcode = X86::CWD;
4619 break;
4620 case MVT::i32:
4621 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4622 SExtOpcode = X86::CDQ;
4623 break;
4624 case MVT::i64:
4625 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4626 SExtOpcode = X86::CQO;
4627 break;
4630 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4631 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4632 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4634 SDValue InFlag;
4635 if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
4636 // Special case for div8, just use a move with zero extension to AX to
4637 // clear the upper 8 bits (AH).
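// E.g. an unsigned i8 divide roughly becomes movzbl + div r/m8, with the
// quotient left in AL and the remainder in AH.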
4638 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4639 MachineSDNode *Move;
4640 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4641 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4642 Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
4643 MVT::Other, Ops);
4644 Chain = SDValue(Move, 1);
4645 ReplaceUses(N0.getValue(1), Chain);
4646 // Record the mem-refs
4647 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4648 } else {
4649 Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
4650 Chain = CurDAG->getEntryNode();
4652 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
4653 SDValue());
4654 InFlag = Chain.getValue(1);
4655 } else {
4656 InFlag =
4657 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4658 LoReg, N0, SDValue()).getValue(1);
4659 if (isSigned && !signBitIsZero) {
4660 // Sign extend the low part into the high part.
4661 InFlag =
4662 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4663 } else {
4664 // Zero out the high part, effectively zero extending the input.
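// E.g. for an unsigned 32-bit divide this materializes a zero (typically as
// xorl %edx, %edx) in EDX before the DIV.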
4665 SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4666 switch (NVT.SimpleTy) {
4667 case MVT::i16:
4668 ClrNode =
4669 SDValue(CurDAG->getMachineNode(
4670 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4671 CurDAG->getTargetConstant(X86::sub_16bit, dl,
4672 MVT::i32)),
4674 break;
4675 case MVT::i32:
4676 break;
4677 case MVT::i64:
4678 ClrNode =
4679 SDValue(CurDAG->getMachineNode(
4680 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4681 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4682 CurDAG->getTargetConstant(X86::sub_32bit, dl,
4683 MVT::i32)),
4685 break;
4686 default:
4687 llvm_unreachable("Unexpected division source");
4690 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4691 ClrNode, InFlag).getValue(1);
4695 if (foldedLoad) {
4696 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4697 InFlag };
4698 MachineSDNode *CNode =
4699 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4700 InFlag = SDValue(CNode, 1);
4701 // Update the chain.
4702 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4703 // Record the mem-refs
4704 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4705 } else {
4706 InFlag =
4707 SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4710 // Prevent use of AH in a REX instruction by explicitly copying it to
4711 // an ABCD_L register.
4713 // The current assumption of the register allocator is that isel
4714 // won't generate explicit references to the GR8_ABCD_H registers. If
4715 // the allocator and/or the backend get enhanced to be more robust in
4716 // that regard, this can be, and should be, removed.
4717 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4718 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4719 unsigned AHExtOpcode =
4720 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4722 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4723 MVT::Glue, AHCopy, InFlag);
4724 SDValue Result(RNode, 0);
4725 InFlag = SDValue(RNode, 1);
4727 Result =
4728 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4730 ReplaceUses(SDValue(Node, 1), Result);
4731 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4732 dbgs() << '\n');
4734 // Copy the division (low) result, if it is needed.
4735 if (!SDValue(Node, 0).use_empty()) {
4736 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4737 LoReg, NVT, InFlag);
4738 InFlag = Result.getValue(2);
4739 ReplaceUses(SDValue(Node, 0), Result);
4740 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4741 dbgs() << '\n');
4743 // Copy the remainder (high) result, if it is needed.
4744 if (!SDValue(Node, 1).use_empty()) {
4745 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4746 HiReg, NVT, InFlag);
4747 InFlag = Result.getValue(2);
4748 ReplaceUses(SDValue(Node, 1), Result);
4749 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4750 dbgs() << '\n');
4752 CurDAG->RemoveDeadNode(Node);
4753 return;
4756 case X86ISD::CMP: {
4757 SDValue N0 = Node->getOperand(0);
4758 SDValue N1 = Node->getOperand(1);
4760 // Optimizations for TEST compares.
4761 if (!isNullConstant(N1))
4762 break;
4764 // Save the original VT of the compare.
4765 MVT CmpVT = N0.getSimpleValueType();
4767 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4768 // by a test instruction. The test should be removed later by
4769 // analyzeCompare if we are using only the zero flag.
4770 // TODO: Should we check the users and use the BEXTR flags directly?
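// E.g. ((x >> 3) & 0xFF) == 0 can become a BEXTR with control 0x0803
// followed by a register-register TEST that produces the zero flag.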
4771 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4772 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4773 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4774 : X86::TEST32rr;
4775 SDValue BEXTR = SDValue(NewNode, 0);
4776 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4777 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4778 CurDAG->RemoveDeadNode(Node);
4779 return;
4780 }
4781 }
4783 // We can peek through truncates, but we need to be careful below.
4784 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
4785 N0 = N0.getOperand(0);
4787 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
4788 // use a smaller encoding.
4789 // Look past the truncate if CMP is the only use of it.
4790 if (N0.getOpcode() == ISD::AND &&
4791 N0.getNode()->hasOneUse() &&
4792 N0.getValueType() != MVT::i8) {
4793 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
4794 if (!C) break;
4795 uint64_t Mask = C->getZExtValue();
4797 // Check if we can replace AND+IMM64 with a shift. This is possible for
4798 // masks like 0xFFFFFFFF00000000 or 0x00FFFFFFFFFFFFFF and if we care only
4799 // about the zero flag.
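// Concretely: with Mask = 0xFFFFFFFF00000000, (X & Mask) == 0 is equivalent
// to (X >> 32) == 0, and with Mask = 0x00FFFFFFFFFFFFFF it is equivalent to
// (X << 8) == 0, so the 64-bit immediate (which would otherwise have to be
// materialized in a register) becomes a single shift feeding TEST64rr.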
4800 if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
4801 onlyUsesZeroFlag(SDValue(Node, 0))) {
4802 if (isMask_64(~Mask)) {
4803 unsigned TrailingZeros = countTrailingZeros(Mask);
4804 SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
4805 SDValue Shift =
4806 SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
4807 N0.getOperand(0), Imm), 0);
4808 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4809 MVT::i32, Shift, Shift);
4810 ReplaceNode(Node, Test);
4811 return;
4812 }
4813 if (isMask_64(Mask)) {
4814 unsigned LeadingZeros = countLeadingZeros(Mask);
4815 SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
4816 SDValue Shift =
4817 SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
4818 N0.getOperand(0), Imm), 0);
4819 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4820 MVT::i32, Shift, Shift);
4821 ReplaceNode(Node, Test);
4822 return;
4823 }
4824 }
4826 MVT VT;
4827 int SubRegOp;
4828 unsigned ROpc, MOpc;
4830 // For each of these checks we need to be careful if the sign flag is
4831 // being used. It is only safe to use the sign flag in two conditions,
4832 // either the sign bit in the shrunken mask is zero or the final test
4833 // size is equal to the original compare size.
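// For instance, shrinking "testl $0x80, %eax" to "testb $0x80, %al" keeps
// ZF (and CF/PF) identical, but SF then reflects bit 7 of the result rather
// than bit 31; hence the narrowed mask's sign bit must be clear, the widths
// must match, or SF must have no users.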
4835 if (isUInt<8>(Mask) &&
4836 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
4837 hasNoSignFlagUses(SDValue(Node, 0)))) {
4838 // For example, convert "testl %eax, $8" to "testb %al, $8"
4839 VT = MVT::i8;
4840 SubRegOp = X86::sub_8bit;
4841 ROpc = X86::TEST8ri;
4842 MOpc = X86::TEST8mi;
4843 } else if (OptForMinSize && isUInt<16>(Mask) &&
4844 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
4845 hasNoSignFlagUses(SDValue(Node, 0)))) {
4846 // For example, "testl %eax, $32776" to "testw %ax, $32776".
4847 // NOTE: We only want to form TESTW instructions if optimizing for
4848 // min size. Otherwise we only save one byte and possibly get a length
4849 // changing prefix penalty in the decoders.
4850 VT = MVT::i16;
4851 SubRegOp = X86::sub_16bit;
4852 ROpc = X86::TEST16ri;
4853 MOpc = X86::TEST16mi;
4854 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
4855 ((!(Mask & 0x80000000) &&
4856 // Without minsize, 16-bit compares can get here, so we need to
4857 // be sure we calculate the correct sign flag if needed.
4858 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
4859 CmpVT == MVT::i32 ||
4860 hasNoSignFlagUses(SDValue(Node, 0)))) {
4861 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
4862 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
4863 // Otherwise, we find ourselves in a position where we have to do
4864 // promotion. If previous passes did not promote the and, we assume
4865 // they had a good reason not to and do not promote here.
4866 VT = MVT::i32;
4867 SubRegOp = X86::sub_32bit;
4868 ROpc = X86::TEST32ri;
4869 MOpc = X86::TEST32mi;
4870 } else {
4871 // No eligible transformation was found.
4872 break;
4873 }
4875 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
4876 SDValue Reg = N0.getOperand(0);
4878 // Emit a testl or testw.
4879 MachineSDNode *NewNode;
4880 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4881 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4882 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4883 Reg.getOperand(0) };
4884 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
4885 // Update the chain.
4886 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
4887 // Record the mem-refs
4888 CurDAG->setNodeMemRefs(NewNode,
4889 {cast<LoadSDNode>(Reg)->getMemOperand()});
4890 } else {
4891 // Extract the subregister if necessary.
4892 if (N0.getValueType() != VT)
4893 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
4895 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
4896 }
4897 // Replace CMP with TEST.
4898 ReplaceNode(Node, NewNode);
4899 return;
4900 }
4901 break;
4902 }
4903 case X86ISD::PCMPISTR: {
4904 if (!Subtarget->hasSSE42())
4905 break;
4907 bool NeedIndex = !SDValue(Node, 0).use_empty();
4908 bool NeedMask = !SDValue(Node, 1).use_empty();
4909 // We can't fold a load if we are going to make two instructions.
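// (With both results live, the source operand feeds two machine instructions
// and must be available in a register anyway, so folding the load into only
// one of them would not eliminate it.)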
4910 bool MayFoldLoad = !NeedIndex || !NeedMask;
4912 MachineSDNode *CNode;
4913 if (NeedMask) {
4914 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
4915 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
4916 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
4917 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4918 }
4919 if (NeedIndex || !NeedMask) {
4920 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
4921 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
4922 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
4923 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4924 }
4926 // Connect the flag usage to the last instruction created.
4927 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4928 CurDAG->RemoveDeadNode(Node);
4929 return;
4930 }
4931 case X86ISD::PCMPESTR: {
4932 if (!Subtarget->hasSSE42())
4933 break;
4935 // Copy the two implicit register inputs.
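// (PCMPESTRI/PCMPESTRM read the two explicit string lengths from EAX and EDX.)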
4936 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
4937 Node->getOperand(1),
4938 SDValue()).getValue(1);
4939 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
4940 Node->getOperand(3), InFlag).getValue(1);
4942 bool NeedIndex = !SDValue(Node, 0).use_empty();
4943 bool NeedMask = !SDValue(Node, 1).use_empty();
4944 // We can't fold a load if we are going to make two instructions.
4945 bool MayFoldLoad = !NeedIndex || !NeedMask;
4947 MachineSDNode *CNode;
4948 if (NeedMask) {
4949 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
4950 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
4951 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
4952 InFlag);
4953 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4954 }
4955 if (NeedIndex || !NeedMask) {
4956 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
4957 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
4958 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
4959 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4960 }
4961 // Connect the flag usage to the last instruction created.
4962 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4963 CurDAG->RemoveDeadNode(Node);
4964 return;
4965 }
4967 case ISD::SETCC: {
4968 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
4969 return;
4971 break;
4972 }
4974 case ISD::STORE:
4975 if (foldLoadStoreIntoMemOperand(Node))
4976 return;
4977 break;
4978 case ISD::FCEIL:
4979 case ISD::FFLOOR:
4980 case ISD::FTRUNC:
4981 case ISD::FNEARBYINT:
4982 case ISD::FRINT: {
4983 // Replace the fp rounding nodes with their X86-specific equivalents so
4984 // we don't need two sets of patterns.
4985 // FIXME: This can only happen when the nodes started as STRICT_* and have
4986 // been mutated into their non-STRICT equivalents. Eventually this
4987 // mutation will be removed and we should switch the STRICT_ nodes to a
4988 // strict version of RNDSCALE in PreProcessISelDAG.
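// The immediates below follow the ROUNDPS/VRNDSCALE control encoding:
// bits [1:0] select the rounding mode (00 nearest, 01 down, 10 up,
// 11 truncate), bit 2 switches to MXCSR.RC instead, and bit 3 suppresses
// precision (inexact) exceptions. Hence 0x9 = floor, 0xA = ceil, 0xB = trunc,
// 0xC = nearbyint (MXCSR mode, exceptions suppressed) and 0x4 = rint
// (MXCSR mode, inexact may be raised).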
4989 unsigned Imm;
4990 switch (Node->getOpcode()) {
4991 default: llvm_unreachable("Unexpected opcode!");
4992 case ISD::FCEIL: Imm = 0xA; break;
4993 case ISD::FFLOOR: Imm = 0x9; break;
4994 case ISD::FTRUNC: Imm = 0xB; break;
4995 case ISD::FNEARBYINT: Imm = 0xC; break;
4996 case ISD::FRINT: Imm = 0x4; break;
4997 }
4998 SDLoc dl(Node);
4999 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
5000 Node->getValueType(0),
5001 Node->getOperand(0),
5002 CurDAG->getConstant(Imm, dl, MVT::i8));
5003 ReplaceNode(Node, Res.getNode());
5004 SelectCode(Res.getNode());
5005 return;
5006 }
5007 }
5009 SelectCode(Node);
5010 }
5012 bool X86DAGToDAGISel::
5013 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5014 std::vector<SDValue> &OutOps) {
5015 SDValue Op0, Op1, Op2, Op3, Op4;
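// selectAddr below produces the standard 5-operand x86 memory reference:
// base, scale, index, displacement and segment.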
5016 switch (ConstraintID) {
5017 default:
5018 llvm_unreachable("Unexpected asm memory constraint");
5019 case InlineAsm::Constraint_i:
5020 // FIXME: It seems strange that 'i' is needed here since it's supposed to
5021 // be an immediate and not a memory constraint.
5022 LLVM_FALLTHROUGH;
5023 case InlineAsm::Constraint_o: // offsetable ??
5024 case InlineAsm::Constraint_v: // not offsetable ??
5025 case InlineAsm::Constraint_m: // memory
5026 case InlineAsm::Constraint_X:
5027 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5028 return true;
5029 break;
5030 }
5032 OutOps.push_back(Op0);
5033 OutOps.push_back(Op1);
5034 OutOps.push_back(Op2);
5035 OutOps.push_back(Op3);
5036 OutOps.push_back(Op4);
5037 return false;
5038 }
5040 /// This pass converts a legalized DAG into a X86-specific DAG,
5041 /// ready for instruction scheduling.
5042 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5043 CodeGenOpt::Level OptLevel) {
5044 return new X86DAGToDAGISel(TM, OptLevel);
5045 }