1 //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a DAG pattern matching instruction selector for X86,
10 // converting from a legalized DAG to an X86 DAG.
12 //===----------------------------------------------------------------------===//
14 #include "X86ISelDAGToDAG.h"
15 #include "X86.h"
16 #include "X86MachineFunctionInfo.h"
17 #include "X86Subtarget.h"
18 #include "X86TargetMachine.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/MachineModuleInfo.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/Config/llvm-config.h"
23 #include "llvm/IR/ConstantRange.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/IR/Module.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/Debug.h"
31 #include "llvm/Support/ErrorHandling.h"
32 #include "llvm/Support/KnownBits.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cstdint>
36 using namespace llvm;
38 #define DEBUG_TYPE "x86-isel"
39 #define PASS_NAME "X86 DAG->DAG Instruction Selection"
41 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
43 static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44 cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45 cl::Hidden);
47 static cl::opt<bool> EnablePromoteAnyextLoad(
48 "x86-promote-anyext-load", cl::init(true),
49 cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
51 extern cl::opt<bool> IndirectBranchTracking;
53 //===----------------------------------------------------------------------===//
54 // Pattern Matcher Implementation
55 //===----------------------------------------------------------------------===//
57 namespace {
58 /// This corresponds to X86AddressMode, but uses SDValue's instead of register
59 /// numbers for the leaves of the matched tree.
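/// (Illustrative, hypothetical values: the x86 memory operand
///   gs:[rbx + 4*rcx + 16]
/// would be captured as Base_Reg = RBX, Scale = 4, IndexReg = RCX, Disp = 16
/// and Segment = GS, with the remaining leaves left at their defaults.)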
60 struct X86ISelAddressMode {
61 enum {
62 RegBase,
63 FrameIndexBase
64 } BaseType = RegBase;
66 // This is really a union, discriminated by BaseType!
67 SDValue Base_Reg;
68 int Base_FrameIndex = 0;
70 unsigned Scale = 1;
71 SDValue IndexReg;
72 int32_t Disp = 0;
73 SDValue Segment;
74 const GlobalValue *GV = nullptr;
75 const Constant *CP = nullptr;
76 const BlockAddress *BlockAddr = nullptr;
77 const char *ES = nullptr;
78 MCSymbol *MCSym = nullptr;
79 int JT = -1;
80 Align Alignment; // CP alignment.
81 unsigned char SymbolFlags = X86II::MO_NO_FLAG; // X86II::MO_*
82 bool NegateIndex = false;
84 X86ISelAddressMode() = default;
86 bool hasSymbolicDisplacement() const {
87 return GV != nullptr || CP != nullptr || ES != nullptr ||
88 MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
91 bool hasBaseOrIndexReg() const {
92 return BaseType == FrameIndexBase ||
93 IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
96 /// Return true if this addressing mode is already RIP-relative.
97 bool isRIPRelative() const {
98 if (BaseType != RegBase) return false;
99 if (RegisterSDNode *RegNode =
100 dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
101 return RegNode->getReg() == X86::RIP;
102 return false;
105 void setBaseReg(SDValue Reg) {
106 BaseType = RegBase;
107 Base_Reg = Reg;
110 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
111 void dump(SelectionDAG *DAG = nullptr) {
112 dbgs() << "X86ISelAddressMode " << this << '\n';
113 dbgs() << "Base_Reg ";
114 if (Base_Reg.getNode())
115 Base_Reg.getNode()->dump(DAG);
116 else
117 dbgs() << "nul\n";
118 if (BaseType == FrameIndexBase)
119 dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
120 dbgs() << " Scale " << Scale << '\n'
121 << "IndexReg ";
122 if (NegateIndex)
123 dbgs() << "negate ";
124 if (IndexReg.getNode())
125 IndexReg.getNode()->dump(DAG);
126 else
127 dbgs() << "nul\n";
128 dbgs() << " Disp " << Disp << '\n'
129 << "GV ";
130 if (GV)
131 GV->dump();
132 else
133 dbgs() << "nul";
134 dbgs() << " CP ";
135 if (CP)
136 CP->dump();
137 else
138 dbgs() << "nul";
139 dbgs() << '\n'
140 << "ES ";
141 if (ES)
142 dbgs() << ES;
143 else
144 dbgs() << "nul";
145 dbgs() << " MCSym ";
146 if (MCSym)
147 dbgs() << MCSym;
148 else
149 dbgs() << "nul";
150 dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
152 #endif
156 namespace {
157 //===--------------------------------------------------------------------===//
158 /// ISel - X86-specific code to select X86 machine instructions for
159 /// SelectionDAG operations.
161 class X86DAGToDAGISel final : public SelectionDAGISel {
162 /// Keep a pointer to the X86Subtarget around so that we can
163 /// make the right decision when generating code for different targets.
164 const X86Subtarget *Subtarget;
166 /// If true, selector should try to optimize for minimum code size.
167 bool OptForMinSize;
169 /// Disable direct TLS access through segment registers.
170 bool IndirectTlsSegRefs;
172 public:
173 X86DAGToDAGISel() = delete;
175 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOptLevel OptLevel)
176 : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
177 OptForMinSize(false), IndirectTlsSegRefs(false) {}
179 bool runOnMachineFunction(MachineFunction &MF) override {
180 // Reset the subtarget each time through.
181 Subtarget = &MF.getSubtarget<X86Subtarget>();
182 IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
183 "indirect-tls-seg-refs");
185 // OptFor[Min]Size are used in pattern predicates that isel is matching.
186 OptForMinSize = MF.getFunction().hasMinSize();
187 assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
188 "OptForMinSize implies OptForSize");
189 return SelectionDAGISel::runOnMachineFunction(MF);
192 void emitFunctionEntryCode() override;
194 bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
196 void PreprocessISelDAG() override;
197 void PostprocessISelDAG() override;
199 // Include the pieces autogenerated from the target description.
200 #include "X86GenDAGISel.inc"
202 private:
203 void Select(SDNode *N) override;
205 bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
206 bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
207 bool AllowSegmentRegForX32 = false);
208 bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
209 bool matchAddress(SDValue N, X86ISelAddressMode &AM);
210 bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
211 bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
212 SDValue matchIndexRecursively(SDValue N, X86ISelAddressMode &AM,
213 unsigned Depth);
214 bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
215 unsigned Depth);
216 bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
217 unsigned Depth);
218 bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
219 bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
220 SDValue &Scale, SDValue &Index, SDValue &Disp,
221 SDValue &Segment);
222 bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
223 SDValue ScaleOp, SDValue &Base, SDValue &Scale,
224 SDValue &Index, SDValue &Disp, SDValue &Segment);
225 bool selectMOV64Imm32(SDValue N, SDValue &Imm);
226 bool selectLEAAddr(SDValue N, SDValue &Base,
227 SDValue &Scale, SDValue &Index, SDValue &Disp,
228 SDValue &Segment);
229 bool selectLEA64_32Addr(SDValue N, SDValue &Base,
230 SDValue &Scale, SDValue &Index, SDValue &Disp,
231 SDValue &Segment);
232 bool selectTLSADDRAddr(SDValue N, SDValue &Base,
233 SDValue &Scale, SDValue &Index, SDValue &Disp,
234 SDValue &Segment);
235 bool selectRelocImm(SDValue N, SDValue &Op);
237 bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
238 SDValue &Base, SDValue &Scale,
239 SDValue &Index, SDValue &Disp,
240 SDValue &Segment);
242 // Convenience method where P is also root.
243 bool tryFoldLoad(SDNode *P, SDValue N,
244 SDValue &Base, SDValue &Scale,
245 SDValue &Index, SDValue &Disp,
246 SDValue &Segment) {
247 return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
250 bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
251 SDValue &Base, SDValue &Scale,
252 SDValue &Index, SDValue &Disp,
253 SDValue &Segment);
255 bool isProfitableToFormMaskedOp(SDNode *N) const;
257 /// Implement addressing mode selection for inline asm expressions.
258 bool SelectInlineAsmMemoryOperand(const SDValue &Op,
259 InlineAsm::ConstraintCode ConstraintID,
260 std::vector<SDValue> &OutOps) override;
262 void emitSpecialCodeForMain();
264 inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
265 MVT VT, SDValue &Base, SDValue &Scale,
266 SDValue &Index, SDValue &Disp,
267 SDValue &Segment) {
268 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
269 Base = CurDAG->getTargetFrameIndex(
270 AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
271 else if (AM.Base_Reg.getNode())
272 Base = AM.Base_Reg;
273 else
274 Base = CurDAG->getRegister(0, VT);
276 Scale = getI8Imm(AM.Scale, DL);
278 #define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
279 // Negate the index if needed.
280 if (AM.NegateIndex) {
281 unsigned NegOpc = VT == MVT::i64 ? GET_ND_IF_ENABLED(X86::NEG64r)
282 : GET_ND_IF_ENABLED(X86::NEG32r);
283 SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
284 AM.IndexReg), 0);
285 AM.IndexReg = Neg;
288 if (AM.IndexReg.getNode())
289 Index = AM.IndexReg;
290 else
291 Index = CurDAG->getRegister(0, VT);
293 // These are 32-bit even in 64-bit mode since RIP-relative offset
294 // is 32-bit.
295 if (AM.GV)
296 Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
297 MVT::i32, AM.Disp,
298 AM.SymbolFlags);
299 else if (AM.CP)
300 Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
301 AM.Disp, AM.SymbolFlags);
302 else if (AM.ES) {
303 assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
304 Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
305 } else if (AM.MCSym) {
306 assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
307 assert(AM.SymbolFlags == 0 && "oo");
308 Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
309 } else if (AM.JT != -1) {
310 assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
311 Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
312 } else if (AM.BlockAddr)
313 Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
314 AM.SymbolFlags);
315 else
316 Disp = CurDAG->getSignedTargetConstant(AM.Disp, DL, MVT::i32);
318 if (AM.Segment.getNode())
319 Segment = AM.Segment;
320 else
321 Segment = CurDAG->getRegister(0, MVT::i16);
324 // Utility function to determine whether N is an AMX SDNode right after
325 // lowering but before ISEL.
326 bool isAMXSDNode(SDNode *N) const {
327 // Check if N is AMX SDNode:
328 // 1. check specific opcode since these carry MVT::Untyped instead of
329 // x86amx_type;
330 // 2. check result type;
331 // 3. check operand type;
332 switch (N->getOpcode()) {
333 default:
334 break;
335 case X86::PT2RPNTLVWZ0V:
336 case X86::PT2RPNTLVWZ0T1V:
337 case X86::PT2RPNTLVWZ1V:
338 case X86::PT2RPNTLVWZ1T1V:
339 case X86::PT2RPNTLVWZ0RSV:
340 case X86::PT2RPNTLVWZ0RST1V:
341 case X86::PT2RPNTLVWZ1RSV:
342 case X86::PT2RPNTLVWZ1RST1V:
343 return true;
345 for (unsigned Idx = 0, E = N->getNumValues(); Idx != E; ++Idx) {
346 if (N->getValueType(Idx) == MVT::x86amx)
347 return true;
349 for (unsigned Idx = 0, E = N->getNumOperands(); Idx != E; ++Idx) {
350 SDValue Op = N->getOperand(Idx);
351 if (Op.getValueType() == MVT::x86amx)
352 return true;
354 return false;
357 // Utility function to determine whether we should avoid selecting
358 // immediate forms of instructions for better code size or not.
359 // At a high level, we'd like to avoid such instructions when
360 // we have similar constants used within the same basic block
361 // that can be kept in a register.
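// (Illustrative, assuming minsize: if the same 32-bit constant feeds several
// ALU instructions in one block, materializing it once in a register and
// reusing it avoids re-encoding a 4-byte immediate in every instruction.)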
363 bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
364 uint32_t UseCount = 0;
366 // Do not want to hoist if we're not optimizing for size.
367 // TODO: We'd like to remove this restriction.
368 // See the comment in X86InstrInfo.td for more info.
369 if (!CurDAG->shouldOptForSize())
370 return false;
372 // Walk all the users of the immediate.
373 for (const SDNode *User : N->users()) {
374 if (UseCount >= 2)
375 break;
377 // This user is already selected. Count it as a legitimate use and
378 // move on.
379 if (User->isMachineOpcode()) {
380 UseCount++;
381 continue;
384 // We want to count stores of immediates as real uses.
385 if (User->getOpcode() == ISD::STORE &&
386 User->getOperand(1).getNode() == N) {
387 UseCount++;
388 continue;
391 // We don't currently match users that have > 2 operands (except
392 // for stores, which are handled above).
393 // Those instructions won't match in ISEL, for now, and would
394 // be counted incorrectly.
395 // This may change in the future as we add additional instruction
396 // types.
397 if (User->getNumOperands() != 2)
398 continue;
400 // If this is a sign-extended 8-bit integer immediate used in an ALU
401 // instruction, there is probably an opcode encoding to save space.
402 auto *C = dyn_cast<ConstantSDNode>(N);
403 if (C && isInt<8>(C->getSExtValue()))
404 continue;
406 // Immediates that are used for offsets as part of stack
407 // manipulation should be left alone. These are typically
408 // used to indicate SP offsets for argument passing and
409 // will get pulled into stores/pushes (implicitly).
410 if (User->getOpcode() == X86ISD::ADD ||
411 User->getOpcode() == ISD::ADD ||
412 User->getOpcode() == X86ISD::SUB ||
413 User->getOpcode() == ISD::SUB) {
415 // Find the other operand of the add/sub.
416 SDValue OtherOp = User->getOperand(0);
417 if (OtherOp.getNode() == N)
418 OtherOp = User->getOperand(1);
420 // Don't count if the other operand is SP.
421 RegisterSDNode *RegNode;
422 if (OtherOp->getOpcode() == ISD::CopyFromReg &&
423 (RegNode = dyn_cast_or_null<RegisterSDNode>(
424 OtherOp->getOperand(1).getNode())))
425 if ((RegNode->getReg() == X86::ESP) ||
426 (RegNode->getReg() == X86::RSP))
427 continue;
430 // ... otherwise, count this and move on.
431 UseCount++;
434 // If we have more than 1 use, then recommend for hoisting.
435 return (UseCount > 1);
438 /// Return a target constant with the specified value of type i8.
439 inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
440 return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
443 /// Return a target constant with the specified value, of type i32.
444 inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
445 return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
448 /// Return a target constant with the specified value, of type i64.
449 inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
450 return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
453 SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
454 const SDLoc &DL) {
455 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
456 uint64_t Index = N->getConstantOperandVal(1);
457 MVT VecVT = N->getOperand(0).getSimpleValueType();
458 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
461 SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
462 const SDLoc &DL) {
463 assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
464 uint64_t Index = N->getConstantOperandVal(2);
465 MVT VecVT = N->getSimpleValueType(0);
466 return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
469 SDValue getPermuteVINSERTCommutedImmediate(SDNode *N, unsigned VecWidth,
470 const SDLoc &DL) {
471 assert(VecWidth == 128 && "Unexpected vector width");
472 uint64_t Index = N->getConstantOperandVal(2);
473 MVT VecVT = N->getSimpleValueType(0);
474 uint64_t InsertIdx = (Index * VecVT.getScalarSizeInBits()) / VecWidth;
475 assert((InsertIdx == 0 || InsertIdx == 1) && "Bad insertf128 index");
476 // vinsert(0,sub,vec) -> [sub0][vec1] -> vperm2x128(0x30,vec,sub)
477 // vinsert(1,sub,vec) -> [vec0][sub0] -> vperm2x128(0x02,vec,sub)
478 return getI8Imm(InsertIdx ? 0x02 : 0x30, DL);
481 SDValue getSBBZero(SDNode *N) {
482 SDLoc dl(N);
483 MVT VT = N->getSimpleValueType(0);
485 // Create zero.
486 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
487 SDValue Zero =
488 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
489 if (VT == MVT::i64) {
490 Zero = SDValue(
491 CurDAG->getMachineNode(
492 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
493 CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
494 CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
498 // Copy flags to the EFLAGS register and glue it to next node.
499 unsigned Opcode = N->getOpcode();
500 assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
501 "Unexpected opcode for SBB materialization");
502 unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
503 SDValue EFLAGS =
504 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
505 N->getOperand(FlagOpIndex), SDValue());
507 // Create a 64-bit instruction if the result is 64 bits; otherwise use the
508 // 32-bit version.
509 unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
510 MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
511 VTs = CurDAG->getVTList(SBBVT, MVT::i32);
512 return SDValue(
513 CurDAG->getMachineNode(Opc, dl, VTs,
514 {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}),
518 // Helper to detect unneeded 'and' instructions on shift amounts. Called
519 // from PatFrags in tablegen.
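// (Illustrative: for a 64-bit shift only the low 6 bits of the amount are
// used, so with Width == 6 a pattern like (srl X, (and Y, 63)) can drop the
// 'and' -- 63 has countr_one() == 6 >= Width.)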
520 bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
521 assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
522 const APInt &Val = N->getConstantOperandAPInt(1);
524 if (Val.countr_one() >= Width)
525 return true;
527 APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
528 return Mask.countr_one() >= Width;
531 /// Return an SDNode that returns the value of the global base register.
532 /// Output instructions required to initialize the global base register,
533 /// if necessary.
534 SDNode *getGlobalBaseReg();
536 /// Return a reference to the TargetMachine, casted to the target-specific
537 /// type.
538 const X86TargetMachine &getTargetMachine() const {
539 return static_cast<const X86TargetMachine &>(TM);
542 /// Return a reference to the TargetInstrInfo, casted to the target-specific
543 /// type.
544 const X86InstrInfo *getInstrInfo() const {
545 return Subtarget->getInstrInfo();
548 /// Return the condition code of the given SDNode.
549 X86::CondCode getCondFromNode(SDNode *N) const;
551 /// Address-mode matching performs shift-of-and to and-of-shift
552 /// reassociation in order to expose more scaled addressing
553 /// opportunities.
554 bool ComplexPatternFuncMutatesDAG() const override {
555 return true;
558 bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
560 // Indicates we should prefer to use a non-temporal load for this load.
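// (For reference: the size checks below line up with non-temporal load
// availability -- MOVNTDQA needs SSE4.1 for 128-bit, VMOVNTDQA needs AVX2
// for 256-bit, and 512-bit loads need AVX-512.)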
561 bool useNonTemporalLoad(LoadSDNode *N) const {
562 if (!N->isNonTemporal())
563 return false;
565 unsigned StoreSize = N->getMemoryVT().getStoreSize();
567 if (N->getAlign().value() < StoreSize)
568 return false;
570 switch (StoreSize) {
571 default: llvm_unreachable("Unsupported store size");
572 case 4:
573 case 8:
574 return false;
575 case 16:
576 return Subtarget->hasSSE41();
577 case 32:
578 return Subtarget->hasAVX2();
579 case 64:
580 return Subtarget->hasAVX512();
584 bool foldLoadStoreIntoMemOperand(SDNode *Node);
585 MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
586 bool matchBitExtract(SDNode *Node);
587 bool shrinkAndImmediate(SDNode *N);
588 bool isMaskZeroExtended(SDNode *N) const;
589 bool tryShiftAmountMod(SDNode *N);
590 bool tryShrinkShlLogicImm(SDNode *N);
591 bool tryVPTERNLOG(SDNode *N);
592 bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentB,
593 SDNode *ParentC, SDValue A, SDValue B, SDValue C,
594 uint8_t Imm);
595 bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
596 bool tryMatchBitSelect(SDNode *N);
598 MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
599 const SDLoc &dl, MVT VT, SDNode *Node);
600 MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
601 const SDLoc &dl, MVT VT, SDNode *Node,
602 SDValue &InGlue);
604 bool tryOptimizeRem8Extend(SDNode *N);
606 bool onlyUsesZeroFlag(SDValue Flags) const;
607 bool hasNoSignFlagUses(SDValue Flags) const;
608 bool hasNoCarryFlagUses(SDValue Flags) const;
611 class X86DAGToDAGISelLegacy : public SelectionDAGISelLegacy {
612 public:
613 static char ID;
614 explicit X86DAGToDAGISelLegacy(X86TargetMachine &tm,
615 CodeGenOptLevel OptLevel)
616 : SelectionDAGISelLegacy(
617 ID, std::make_unique<X86DAGToDAGISel>(tm, OptLevel)) {}
621 char X86DAGToDAGISelLegacy::ID = 0;
623 INITIALIZE_PASS(X86DAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)
625 // Returns true if this masked compare can be implemented legally with this
626 // type.
627 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
628 unsigned Opcode = N->getOpcode();
629 if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM ||
630 Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC ||
631 Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) {
632 // We can get 256-bit 8 element types here without VLX being enabled. When
633 // this happens we will use 512-bit operations and the mask will not be
634 // zero extended.
635 EVT OpVT = N->getOperand(0).getValueType();
636 // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
637 // second operand.
638 if (Opcode == X86ISD::STRICT_CMPM)
639 OpVT = N->getOperand(1).getValueType();
640 if (OpVT.is256BitVector() || OpVT.is128BitVector())
641 return Subtarget->hasVLX();
643 return true;
645 // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
646 if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
647 Opcode == X86ISD::FSETCCM_SAE)
648 return true;
650 return false;
653 // Returns true if we can assume the writer of the mask has zero extended it
654 // for us.
655 bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
656 // If this is an AND, check if we have a compare on either side. As long as
657 // one side guarantees the mask is zero extended, the AND will preserve those
658 // zeros.
659 if (N->getOpcode() == ISD::AND)
660 return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
661 isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
663 return isLegalMaskCompare(N, Subtarget);
666 bool
667 X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
668 if (OptLevel == CodeGenOptLevel::None)
669 return false;
671 if (!N.hasOneUse())
672 return false;
674 if (N.getOpcode() != ISD::LOAD)
675 return true;
677 // Don't fold non-temporal loads if we have an instruction for them.
678 if (useNonTemporalLoad(cast<LoadSDNode>(N)))
679 return false;
681 // If N is a load, do additional profitability checks.
682 if (U == Root) {
683 switch (U->getOpcode()) {
684 default: break;
685 case X86ISD::ADD:
686 case X86ISD::ADC:
687 case X86ISD::SUB:
688 case X86ISD::SBB:
689 case X86ISD::AND:
690 case X86ISD::XOR:
691 case X86ISD::OR:
692 case ISD::ADD:
693 case ISD::UADDO_CARRY:
694 case ISD::AND:
695 case ISD::OR:
696 case ISD::XOR: {
697 SDValue Op1 = U->getOperand(1);
699 // If the other operand is an 8-bit immediate we should fold the immediate
700 // instead. This reduces code size.
701 // e.g.
702 // movl 4(%esp), %eax
703 // addl $4, %eax
704 // vs.
705 // movl $4, %eax
706 // addl 4(%esp), %eax
707 // The former is 2 bytes shorter. In the case where the increment is 1,
708 // the saving can be 4 bytes (by using incl %eax).
709 if (auto *Imm = dyn_cast<ConstantSDNode>(Op1)) {
710 if (Imm->getAPIntValue().isSignedIntN(8))
711 return false;
713 // If this is a 64-bit AND with an immediate that fits in 32-bits,
714 // prefer using the smaller and over folding the load. This is needed to
715 // make sure immediates created by shrinkAndImmediate are always folded.
716 // Ideally we would narrow the load during DAG combine and get the
717 // best of both worlds.
718 if (U->getOpcode() == ISD::AND &&
719 Imm->getAPIntValue().getBitWidth() == 64 &&
720 Imm->getAPIntValue().isIntN(32))
721 return false;
723 // If this is really a zext_inreg that can be represented with a movzx
724 // instruction, prefer that.
725 // TODO: We could shrink the load and fold if it is non-volatile.
726 if (U->getOpcode() == ISD::AND &&
727 (Imm->getAPIntValue() == UINT8_MAX ||
728 Imm->getAPIntValue() == UINT16_MAX ||
729 Imm->getAPIntValue() == UINT32_MAX))
730 return false;
732 // For ADD/SUB we can negate the immediate and use the opposite operation
733 // to fit 128 into a sign-extended 8-bit immediate.
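// (Illustrative: "add eax, 128" needs an imm32 encoding, but "sub eax, -128"
// fits a sign-extended imm8, so we prefer keeping the immediate over folding
// the load.)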
734 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
735 (-Imm->getAPIntValue()).isSignedIntN(8))
736 return false;
738 if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
739 (-Imm->getAPIntValue()).isSignedIntN(8) &&
740 hasNoCarryFlagUses(SDValue(U, 1)))
741 return false;
744 // If the other operand is a TLS address, we should fold it instead.
745 // This produces
746 // movl %gs:0, %eax
747 // leal i@NTPOFF(%eax), %eax
748 // instead of
749 // movl $i@NTPOFF, %eax
750 // addl %gs:0, %eax
751 // if the block also has an access to a second TLS address this will save
752 // a load.
753 // FIXME: This is probably also true for non-TLS addresses.
754 if (Op1.getOpcode() == X86ISD::Wrapper) {
755 SDValue Val = Op1.getOperand(0);
756 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
757 return false;
760 // Don't fold load if this matches the BTS/BTR/BTC patterns.
761 // BTS: (or X, (shl 1, n))
762 // BTR: (and X, (rotl -2, n))
763 // BTC: (xor X, (shl 1, n))
764 if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
765 if (U->getOperand(0).getOpcode() == ISD::SHL &&
766 isOneConstant(U->getOperand(0).getOperand(0)))
767 return false;
769 if (U->getOperand(1).getOpcode() == ISD::SHL &&
770 isOneConstant(U->getOperand(1).getOperand(0)))
771 return false;
773 if (U->getOpcode() == ISD::AND) {
774 SDValue U0 = U->getOperand(0);
775 SDValue U1 = U->getOperand(1);
776 if (U0.getOpcode() == ISD::ROTL) {
777 auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
778 if (C && C->getSExtValue() == -2)
779 return false;
782 if (U1.getOpcode() == ISD::ROTL) {
783 auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
784 if (C && C->getSExtValue() == -2)
785 return false;
789 break;
791 case ISD::SHL:
792 case ISD::SRA:
793 case ISD::SRL:
794 // Don't fold a load into a shift by immediate. The BMI2 instructions
795 // support folding a load, but not an immediate. The legacy instructions
796 // support folding an immediate, but can't fold a load. Folding an
797 // immediate is preferable to folding a load.
798 if (isa<ConstantSDNode>(U->getOperand(1)))
799 return false;
801 break;
805 // Prevent folding a load if this can be implemented with an insert_subreg or
806 // a move that implicitly zeroes.
807 if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
808 isNullConstant(Root->getOperand(2)) &&
809 (Root->getOperand(0).isUndef() ||
810 ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
811 return false;
813 return true;
816 // Indicates it is profitable to form an AVX512 masked operation. Returning
817 // false will favor a masked register-register move or vblendm, and the
818 // operation will be selected separately.
819 bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
820 assert(
821 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
822 "Unexpected opcode!");
824 // If the operation has additional users, the operation will be duplicated.
825 // Check the use count to prevent that.
826 // FIXME: Are there cheap opcodes we might want to duplicate?
827 return N->getOperand(1).hasOneUse();
830 /// Replace the original chain operand of the call with the
831 /// load's chain operand and move the load below the call's chain operand.
832 static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
833 SDValue Call, SDValue OrigChain) {
834 SmallVector<SDValue, 8> Ops;
835 SDValue Chain = OrigChain.getOperand(0);
836 if (Chain.getNode() == Load.getNode())
837 Ops.push_back(Load.getOperand(0));
838 else {
839 assert(Chain.getOpcode() == ISD::TokenFactor &&
840 "Unexpected chain operand");
841 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
842 if (Chain.getOperand(i).getNode() == Load.getNode())
843 Ops.push_back(Load.getOperand(0));
844 else
845 Ops.push_back(Chain.getOperand(i));
846 SDValue NewChain =
847 CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
848 Ops.clear();
849 Ops.push_back(NewChain);
851 Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
852 CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
853 CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
854 Load.getOperand(1), Load.getOperand(2));
856 Ops.clear();
857 Ops.push_back(SDValue(Load.getNode(), 1));
858 Ops.append(Call->op_begin() + 1, Call->op_end());
859 CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
862 /// Return true if call address is a load and it can be
863 /// moved below CALLSEQ_START and the chains leading up to the call.
864 /// Return the CALLSEQ_START by reference as a second output.
865 /// In the case of a tail call, there isn't a callseq node between the call
866 /// chain and the load.
867 static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
868 // The transformation is somewhat dangerous if the call's chain was glued to
869 // the call. After moveBelowOrigChain the load is moved between the call and
870 // the chain; this can create a cycle if the load is not folded. So it is
871 // *really* important that we are sure the load will be folded.
872 if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
873 return false;
874 auto *LD = dyn_cast<LoadSDNode>(Callee.getNode());
875 if (!LD ||
876 !LD->isSimple() ||
877 LD->getAddressingMode() != ISD::UNINDEXED ||
878 LD->getExtensionType() != ISD::NON_EXTLOAD)
879 return false;
881 // Now let's find the callseq_start.
882 while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
883 if (!Chain.hasOneUse())
884 return false;
885 Chain = Chain.getOperand(0);
888 if (!Chain.getNumOperands())
889 return false;
890 // Since we are not checking for AA here, conservatively abort if the chain
891 // writes to memory. It's not safe to move the callee (a load) across a store.
892 if (isa<MemSDNode>(Chain.getNode()) &&
893 cast<MemSDNode>(Chain.getNode())->writeMem())
894 return false;
895 if (Chain.getOperand(0).getNode() == Callee.getNode())
896 return true;
897 if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
898 Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
899 Callee.getValue(1).hasOneUse())
900 return true;
901 return false;
904 static bool isEndbrImm64(uint64_t Imm) {
905 // There may be some other prefix bytes between 0xF3 and 0x0F1EFA,
906 // e.g. 0xF3660F1EFA, 0xF3670F1EFA.
907 if ((Imm & 0x00FFFFFF) != 0x0F1EFA)
908 return false;
910 uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64,
911 0x65, 0x66, 0x67, 0xf0, 0xf2};
912 int i = 24; // The low 24 bits (0x0F1EFA) have already matched.
913 while (i < 64) {
914 uint8_t Byte = (Imm >> i) & 0xFF;
915 if (Byte == 0xF3)
916 return true;
917 if (!llvm::is_contained(OptionalPrefixBytes, Byte))
918 return false;
919 i += 8;
922 return false;
925 static bool needBWI(MVT VT) {
926 return (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v64i8);
929 void X86DAGToDAGISel::PreprocessISelDAG() {
930 bool MadeChange = false;
931 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
932 E = CurDAG->allnodes_end(); I != E; ) {
933 SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
935 // This is for CET enhancement.
937 // ENDBR32 and ENDBR64 have specific opcodes:
938 // ENDBR32: F3 0F 1E FB
939 // ENDBR64: F3 0F 1E FA
940 // We want to make sure attackers won't find unintended ENDBR32/64
941 // opcode matches in the binary.
942 // Here's an example:
943 // If the compiler had to generate asm for the following code:
944 // a = 0xF30F1EFA
945 // it could, for example, generate:
946 // mov dword ptr [a], 0xF30F1EFA
947 // In such a case, the binary would include a gadget that starts
948 // with a fake ENDBR64 opcode. Therefore, we split such constants
949 // into multiple operations so the pattern does not show up in the binary.
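// (Illustrative result; exact codegen may differ: the complemented value
// 0x0CF0E105 is materialized and then inverted, e.g.
//   mov eax, 0x0CF0E105
//   not eax
// so the ENDBR byte pattern never appears literally in the encoding.)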
950 if (N->getOpcode() == ISD::Constant) {
951 MVT VT = N->getSimpleValueType(0);
952 int64_t Imm = cast<ConstantSDNode>(N)->getSExtValue();
953 int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB;
954 if (Imm == EndbrImm || isEndbrImm64(Imm)) {
955 // Check that the cf-protection-branch is enabled.
956 Metadata *CFProtectionBranch =
957 MF->getFunction().getParent()->getModuleFlag(
958 "cf-protection-branch");
959 if (CFProtectionBranch || IndirectBranchTracking) {
960 SDLoc dl(N);
961 SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true);
962 Complement = CurDAG->getNOT(dl, Complement, VT);
963 --I;
964 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement);
965 ++I;
966 MadeChange = true;
967 continue;
972 // If this is a target specific AND node with no flag usages, turn it back
973 // into ISD::AND to enable test instruction matching.
974 if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
975 SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
976 N->getOperand(0), N->getOperand(1));
977 --I;
978 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
979 ++I;
980 MadeChange = true;
981 continue;
984 // Convert vector increment or decrement to sub/add with an all-ones
985 // constant:
986 // add X, <1, 1...> --> sub X, <-1, -1...>
987 // sub X, <1, 1...> --> add X, <-1, -1...>
988 // The all-ones vector constant can be materialized using a pcmpeq
989 // instruction that is commonly recognized as an idiom (has no register
990 // dependency), so that's better/smaller than loading a splat 1 constant.
992 // But don't do this if it would inhibit a potentially profitable load
993 // folding opportunity for the other operand. That only occurs with the
994 // intersection of:
995 // (1) The other operand (op0) is load foldable.
996 // (2) The op is an add (otherwise, we are *creating* an add and can still
997 // load fold the other op).
998 // (3) The target has AVX (otherwise, we have a destructive add and can't
999 // load fold the other op without killing the constant op).
1000 // (4) The constant 1 vector has multiple uses (so it is profitable to load
1001 // into a register anyway).
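// (Illustrative, assuming SSE2: "add X, <1,1,1,1>" can be emitted as
//   pcmpeqd %xmm1, %xmm1   ; all-ones without touching memory
//   psubd   %xmm1, %xmm0   ; X - (-1) == X + 1
// instead of loading a splat-1 constant.)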
1002 auto mayPreventLoadFold = [&]() {
1003 return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
1004 N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
1005 !N->getOperand(1).hasOneUse();
1007 if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
1008 N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
1009 APInt SplatVal;
1010 if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
1011 SplatVal.isOne()) {
1012 SDLoc DL(N);
1014 MVT VT = N->getSimpleValueType(0);
1015 unsigned NumElts = VT.getSizeInBits() / 32;
1016 SDValue AllOnes =
1017 CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
1018 AllOnes = CurDAG->getBitcast(VT, AllOnes);
1020 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
1021 SDValue Res =
1022 CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
1023 --I;
1024 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1025 ++I;
1026 MadeChange = true;
1027 continue;
1031 switch (N->getOpcode()) {
1032 case X86ISD::VBROADCAST: {
1033 MVT VT = N->getSimpleValueType(0);
1034 // Emulate v32i16/v64i8 broadcast without BWI.
1035 if (!Subtarget->hasBWI() && needBWI(VT)) {
1036 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1037 SDLoc dl(N);
1038 SDValue NarrowBCast =
1039 CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
1040 SDValue Res =
1041 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1042 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1043 unsigned Index = NarrowVT.getVectorMinNumElements();
1044 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1045 CurDAG->getIntPtrConstant(Index, dl));
1047 --I;
1048 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1049 ++I;
1050 MadeChange = true;
1051 continue;
1054 break;
1056 case X86ISD::VBROADCAST_LOAD: {
1057 MVT VT = N->getSimpleValueType(0);
1058 // Emulate v32i16/v64i8 broadcast without BWI.
1059 if (!Subtarget->hasBWI() && needBWI(VT)) {
1060 MVT NarrowVT = VT.getHalfNumVectorElementsVT();
1061 auto *MemNode = cast<MemSDNode>(N);
1062 SDLoc dl(N);
1063 SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
1064 SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
1065 SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
1066 X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
1067 MemNode->getMemOperand());
1068 SDValue Res =
1069 CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
1070 NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
1071 unsigned Index = NarrowVT.getVectorMinNumElements();
1072 Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
1073 CurDAG->getIntPtrConstant(Index, dl));
1075 --I;
1076 SDValue To[] = {Res, NarrowBCast.getValue(1)};
1077 CurDAG->ReplaceAllUsesWith(N, To);
1078 ++I;
1079 MadeChange = true;
1080 continue;
1083 break;
1085 case ISD::LOAD: {
1086 // If this is an XMM/YMM load of the same lower bits as another YMM/ZMM
1087 // load, then just extract the lower subvector and avoid the second load.
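// (Illustrative: a v4f32 load and a v16f32 load from the same pointer and
// chain can share one memory access -- the narrow value becomes an
// extract_subvector of the wider load's result.)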
1088 auto *Ld = cast<LoadSDNode>(N);
1089 MVT VT = N->getSimpleValueType(0);
1090 if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1091 !(VT.is128BitVector() || VT.is256BitVector()))
1092 break;
1094 MVT MaxVT = VT;
1095 SDNode *MaxLd = nullptr;
1096 SDValue Ptr = Ld->getBasePtr();
1097 SDValue Chain = Ld->getChain();
1098 for (SDNode *User : Ptr->users()) {
1099 auto *UserLd = dyn_cast<LoadSDNode>(User);
1100 MVT UserVT = User->getSimpleValueType(0);
1101 if (User != N && UserLd && ISD::isNormalLoad(User) &&
1102 UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1103 !User->hasAnyUseOfValue(1) &&
1104 (UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1105 UserVT.getSizeInBits() > VT.getSizeInBits() &&
1106 (!MaxLd || UserVT.getSizeInBits() > MaxVT.getSizeInBits())) {
1107 MaxLd = User;
1108 MaxVT = UserVT;
1111 if (MaxLd) {
1112 SDLoc dl(N);
1113 unsigned NumSubElts = VT.getSizeInBits() / MaxVT.getScalarSizeInBits();
1114 MVT SubVT = MVT::getVectorVT(MaxVT.getScalarType(), NumSubElts);
1115 SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1116 SDValue(MaxLd, 0),
1117 CurDAG->getIntPtrConstant(0, dl));
1118 SDValue Res = CurDAG->getBitcast(VT, Extract);
1120 --I;
1121 SDValue To[] = {Res, SDValue(MaxLd, 1)};
1122 CurDAG->ReplaceAllUsesWith(N, To);
1123 ++I;
1124 MadeChange = true;
1125 continue;
1127 break;
1129 case ISD::VSELECT: {
1130 // Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1131 EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1132 if (EleVT == MVT::i1)
1133 break;
1135 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1136 assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1137 "We can't replace VSELECT with BLENDV in vXi16!");
1138 SDValue R;
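// (Note: the VPTERNLOG immediate 0xCA below is the truth table for
// "A ? B : C", i.e. a bitwise select of the two data operands under the
// all-sign-bits condition.)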
1139 if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1140 EleVT.getSizeInBits()) {
1141 R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1142 N->getOperand(0), N->getOperand(1), N->getOperand(2),
1143 CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1144 } else {
1145 R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1146 N->getOperand(0), N->getOperand(1),
1147 N->getOperand(2));
1149 --I;
1150 CurDAG->ReplaceAllUsesWith(N, R.getNode());
1151 ++I;
1152 MadeChange = true;
1153 continue;
1155 case ISD::FP_ROUND:
1156 case ISD::STRICT_FP_ROUND:
1157 case ISD::FP_TO_SINT:
1158 case ISD::FP_TO_UINT:
1159 case ISD::STRICT_FP_TO_SINT:
1160 case ISD::STRICT_FP_TO_UINT: {
1161 // Replace vector fp_to_s/uint with their X86 specific equivalent so we
1162 // don't need 2 sets of patterns.
1163 if (!N->getSimpleValueType(0).isVector())
1164 break;
1166 unsigned NewOpc;
1167 switch (N->getOpcode()) {
1168 default: llvm_unreachable("Unexpected opcode!");
1169 case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
1170 case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
1171 case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
1172 case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
1173 case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
1174 case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
1176 SDValue Res;
1177 if (N->isStrictFPOpcode())
1178 Res =
1179 CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
1180 {N->getOperand(0), N->getOperand(1)});
1181 else
1182 Res =
1183 CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1184 N->getOperand(0));
1185 --I;
1186 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1187 ++I;
1188 MadeChange = true;
1189 continue;
1191 case ISD::SHL:
1192 case ISD::SRA:
1193 case ISD::SRL: {
1194 // Replace vector shifts with their X86 specific equivalent so we don't
1195 // need 2 sets of patterns.
1196 if (!N->getValueType(0).isVector())
1197 break;
1199 unsigned NewOpc;
1200 switch (N->getOpcode()) {
1201 default: llvm_unreachable("Unexpected opcode!");
1202 case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
1203 case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
1204 case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
1206 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1207 N->getOperand(0), N->getOperand(1));
1208 --I;
1209 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1210 ++I;
1211 MadeChange = true;
1212 continue;
1214 case ISD::ANY_EXTEND:
1215 case ISD::ANY_EXTEND_VECTOR_INREG: {
1216 // Replace vector any extend with the zero extend equivalents so we don't
1217 // need 2 sets of patterns. Ignore vXi1 extensions.
1218 if (!N->getValueType(0).isVector())
1219 break;
1221 unsigned NewOpc;
1222 if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
1223 assert(N->getOpcode() == ISD::ANY_EXTEND &&
1224 "Unexpected opcode for mask vector!");
1225 NewOpc = ISD::SIGN_EXTEND;
1226 } else {
1227 NewOpc = N->getOpcode() == ISD::ANY_EXTEND
1228 ? ISD::ZERO_EXTEND
1229 : ISD::ZERO_EXTEND_VECTOR_INREG;
1232 SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
1233 N->getOperand(0));
1234 --I;
1235 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1236 ++I;
1237 MadeChange = true;
1238 continue;
1240 case ISD::FCEIL:
1241 case ISD::STRICT_FCEIL:
1242 case ISD::FFLOOR:
1243 case ISD::STRICT_FFLOOR:
1244 case ISD::FTRUNC:
1245 case ISD::STRICT_FTRUNC:
1246 case ISD::FROUNDEVEN:
1247 case ISD::STRICT_FROUNDEVEN:
1248 case ISD::FNEARBYINT:
1249 case ISD::STRICT_FNEARBYINT:
1250 case ISD::FRINT:
1251 case ISD::STRICT_FRINT: {
1252 // Replace fp rounding with their X86 specific equivalent so we don't
1253 // need 2 sets of patterns.
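// (For reference: these immediates follow the SSE4.1 ROUND*/VRNDSCALE
// control encoding -- the low two bits pick the rounding mode, 0x4 selects
// the mode from MXCSR, and the 0x8 bit suppresses precision exceptions,
// which is why only FRINT uses an immediate without it.)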
1254 unsigned Imm;
1255 switch (N->getOpcode()) {
1256 default: llvm_unreachable("Unexpected opcode!");
1257 case ISD::STRICT_FCEIL:
1258 case ISD::FCEIL: Imm = 0xA; break;
1259 case ISD::STRICT_FFLOOR:
1260 case ISD::FFLOOR: Imm = 0x9; break;
1261 case ISD::STRICT_FTRUNC:
1262 case ISD::FTRUNC: Imm = 0xB; break;
1263 case ISD::STRICT_FROUNDEVEN:
1264 case ISD::FROUNDEVEN: Imm = 0x8; break;
1265 case ISD::STRICT_FNEARBYINT:
1266 case ISD::FNEARBYINT: Imm = 0xC; break;
1267 case ISD::STRICT_FRINT:
1268 case ISD::FRINT: Imm = 0x4; break;
1270 SDLoc dl(N);
1271 bool IsStrict = N->isStrictFPOpcode();
1272 SDValue Res;
1273 if (IsStrict)
1274 Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
1275 {N->getValueType(0), MVT::Other},
1276 {N->getOperand(0), N->getOperand(1),
1277 CurDAG->getTargetConstant(Imm, dl, MVT::i32)});
1278 else
1279 Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
1280 N->getOperand(0),
1281 CurDAG->getTargetConstant(Imm, dl, MVT::i32));
1282 --I;
1283 CurDAG->ReplaceAllUsesWith(N, Res.getNode());
1284 ++I;
1285 MadeChange = true;
1286 continue;
1288 case X86ISD::FANDN:
1289 case X86ISD::FAND:
1290 case X86ISD::FOR:
1291 case X86ISD::FXOR: {
1292 // Widen scalar fp logic ops to vector to reduce isel patterns.
1293 // FIXME: Can we do this during lowering/combine.
1294 MVT VT = N->getSimpleValueType(0);
1295 if (VT.isVector() || VT == MVT::f128)
1296 break;
1298 MVT VecVT = VT == MVT::f64 ? MVT::v2f64
1299 : VT == MVT::f32 ? MVT::v4f32
1300 : MVT::v8f16;
1302 SDLoc dl(N);
1303 SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1304 N->getOperand(0));
1305 SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
1306 N->getOperand(1));
1308 SDValue Res;
1309 if (Subtarget->hasSSE2()) {
1310 EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
1311 Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
1312 Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
1313 unsigned Opc;
1314 switch (N->getOpcode()) {
1315 default: llvm_unreachable("Unexpected opcode!");
1316 case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
1317 case X86ISD::FAND: Opc = ISD::AND; break;
1318 case X86ISD::FOR: Opc = ISD::OR; break;
1319 case X86ISD::FXOR: Opc = ISD::XOR; break;
1321 Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
1322 Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
1323 } else {
1324 Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
1326 Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
1327 CurDAG->getIntPtrConstant(0, dl));
1328 --I;
1329 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
1330 ++I;
1331 MadeChange = true;
1332 continue;
1336 if (OptLevel != CodeGenOptLevel::None &&
1337 // Only do this when the target can fold the load into the call or
1338 // jmp.
1339 !Subtarget->useIndirectThunkCalls() &&
1340 ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
1341 (N->getOpcode() == X86ISD::TC_RETURN &&
1342 (Subtarget->is64Bit() ||
1343 !getTargetMachine().isPositionIndependent())))) {
1344 /// Also try moving call address load from outside callseq_start to just
1345 /// before the call to allow it to be folded.
1347 /// [Load chain]
1348 /// ^
1349 /// |
1350 /// [Load]
1351 /// ^ ^
1352 /// | |
1353 /// / \--
1354 /// / |
1355 ///[CALLSEQ_START] |
1356 /// ^ |
1357 /// | |
1358 /// [LOAD/C2Reg] |
1359 /// | |
1360 /// \ /
1361 /// \ /
1362 /// [CALL]
1363 bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
1364 SDValue Chain = N->getOperand(0);
1365 SDValue Load = N->getOperand(1);
1366 if (!isCalleeLoad(Load, Chain, HasCallSeq))
1367 continue;
1368 moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
1369 ++NumLoadMoved;
1370 MadeChange = true;
1371 continue;
1374 // Lower fpround and fpextend nodes that target the FP stack to a store and
1375 // load through the stack. This is a gross hack. We would like to simply mark
1376 // these as being illegal, but when we do that, legalize produces these when
1377 // it expands calls, then expands these in the same legalize pass. We would
1378 // like dag combine to be able to hack on these between the call expansion
1379 // and the node legalization. As such this pass basically does "really
1380 // late" legalization of these inline with the X86 isel pass.
1381 // FIXME: This should only happen when not compiled with -O0.
1382 switch (N->getOpcode()) {
1383 default: continue;
1384 case ISD::FP_ROUND:
1385 case ISD::FP_EXTEND:
1387 MVT SrcVT = N->getOperand(0).getSimpleValueType();
1388 MVT DstVT = N->getSimpleValueType(0);
1390 // If any of the sources are vectors, no fp stack involved.
1391 if (SrcVT.isVector() || DstVT.isVector())
1392 continue;
1394 // If the source and destination are SSE registers, then this is a legal
1395 // conversion that should not be lowered.
1396 const X86TargetLowering *X86Lowering =
1397 static_cast<const X86TargetLowering *>(TLI);
1398 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1399 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1400 if (SrcIsSSE && DstIsSSE)
1401 continue;
1403 if (!SrcIsSSE && !DstIsSSE) {
1404 // If this is an FPStack extension, it is a noop.
1405 if (N->getOpcode() == ISD::FP_EXTEND)
1406 continue;
1407 // If this is a value-preserving FPStack truncation, it is a noop.
1408 if (N->getConstantOperandVal(1))
1409 continue;
1412 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1413 // FPStack has extload and truncstore. SSE can fold direct loads into other
1414 // operations. Based on this, decide what we want to do.
1415 MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
1416 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1417 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1418 MachinePointerInfo MPI =
1419 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1420 SDLoc dl(N);
1422 // FIXME: optimize the case where the src/dest is a load or store?
1424 SDValue Store = CurDAG->getTruncStore(
1425 CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
1426 SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
1427 MemTmp, MPI, MemVT);
1429 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1430 // extload we created. This will cause general havoc on the DAG because
1431 // anything below the conversion could be folded into other existing nodes.
1432 // To avoid invalidating 'I', back it up to the convert node.
1433 --I;
1434 CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1435 break;
1438 // The sequence of events for lowering STRICT_FP versions of these nodes requires
1439 // dealing with the chain differently, as there is already a preexisting chain.
1440 case ISD::STRICT_FP_ROUND:
1441 case ISD::STRICT_FP_EXTEND:
1443 MVT SrcVT = N->getOperand(1).getSimpleValueType();
1444 MVT DstVT = N->getSimpleValueType(0);
1446 // If any of the sources are vectors, no fp stack involved.
1447 if (SrcVT.isVector() || DstVT.isVector())
1448 continue;
1450 // If the source and destination are SSE registers, then this is a legal
1451 // conversion that should not be lowered.
1452 const X86TargetLowering *X86Lowering =
1453 static_cast<const X86TargetLowering *>(TLI);
1454 bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1455 bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1456 if (SrcIsSSE && DstIsSSE)
1457 continue;
1459 if (!SrcIsSSE && !DstIsSSE) {
1460 // If this is an FPStack extension, it is a noop.
1461 if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1462 continue;
1463 // If this is a value-preserving FPStack truncation, it is a noop.
1464 if (N->getConstantOperandVal(2))
1465 continue;
1468 // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1469 // FPStack has extload and truncstore. SSE can fold direct loads into other
1470 // operations. Based on this, decide what we want to do.
1471 MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
1472 SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1473 int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
1474 MachinePointerInfo MPI =
1475 MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
1476 SDLoc dl(N);
1478 // FIXME: optimize the case where the src/dest is a load or store?
1480 // Since the operation is StrictFP, use the preexisting chain.
1481 SDValue Store, Result;
1482 if (!SrcIsSSE) {
1483 SDVTList VTs = CurDAG->getVTList(MVT::Other);
1484 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
1485 Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
1486 MPI, /*Align*/ std::nullopt,
1487 MachineMemOperand::MOStore);
1488 if (N->getFlags().hasNoFPExcept()) {
1489 SDNodeFlags Flags = Store->getFlags();
1490 Flags.setNoFPExcept(true);
1491 Store->setFlags(Flags);
1493 } else {
1494 assert(SrcVT == MemVT && "Unexpected VT!");
1495 Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
1496 MPI);
1499 if (!DstIsSSE) {
1500 SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
1501 SDValue Ops[] = {Store, MemTmp};
1502 Result = CurDAG->getMemIntrinsicNode(
1503 X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
1504 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
1505 if (N->getFlags().hasNoFPExcept()) {
1506 SDNodeFlags Flags = Result->getFlags();
1507 Flags.setNoFPExcept(true);
1508 Result->setFlags(Flags);
1510 } else {
1511 assert(DstVT == MemVT && "Unexpected VT!");
1512 Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
1515 // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1517 // extload we created. This will cause general havoc on the DAG because
1517 // anything below the conversion could be folded into other existing nodes.
1518 // To avoid invalidating 'I', back it up to the convert node.
1519 --I;
1520 CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1521 break;
1526 // Now that we did that, the node is dead. Increment the iterator to the
1527 // next node to process, then delete N.
1528 ++I;
1529 MadeChange = true;
1532 // Remove any dead nodes that may have been left behind.
1533 if (MadeChange)
1534 CurDAG->RemoveDeadNodes();
1537 // Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1538 bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1539 unsigned Opc = N->getMachineOpcode();
1540 if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
1541 Opc != X86::MOVSX64rr8)
1542 return false;
1544 SDValue N0 = N->getOperand(0);
1546 // We need to be extracting the low byte (sub_8bit) of an extend.
1547 if (!N0.isMachineOpcode() ||
1548 N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
1549 N0.getConstantOperandVal(1) != X86::sub_8bit)
1550 return false;
1552 // We're looking for either a movsx or movzx to match the original opcode.
1553 unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
1554 : X86::MOVSX32rr8_NOREX;
1555 SDValue N00 = N0.getOperand(0);
1556 if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
1557 return false;
1559 if (Opc == X86::MOVSX64rr8) {
1560 // If we had a sign extend from 8 to 64 bits, we still need to go from 32
1561 // to 64.
1562 MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1563 MVT::i64, N00);
1564 ReplaceUses(N, Extend);
1565 } else {
1566 // Ok we can drop this extend and just use the original extend.
1567 ReplaceUses(N, N00.getNode());
1570 return true;
1573 void X86DAGToDAGISel::PostprocessISelDAG() {
1574 // Skip peepholes at -O0.
1575 if (TM.getOptLevel() == CodeGenOptLevel::None)
1576 return;
1578 SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1580 bool MadeChange = false;
1581 while (Position != CurDAG->allnodes_begin()) {
1582 SDNode *N = &*--Position;
1583 // Skip dead nodes and any non-machine opcodes.
1584 if (N->use_empty() || !N->isMachineOpcode())
1585 continue;
1587 if (tryOptimizeRem8Extend(N)) {
1588 MadeChange = true;
1589 continue;
1592 unsigned Opc = N->getMachineOpcode();
1593 switch (Opc) {
1594 default:
1595 continue;
1596 // ANDrr/rm + TESTrr -> TESTrr/TESTmr
1597 case X86::TEST8rr:
1598 case X86::TEST16rr:
1599 case X86::TEST32rr:
1600 case X86::TEST64rr:
1601 // ANDrr/rm + CTESTrr -> CTESTrr/CTESTmr
1602 case X86::CTEST8rr:
1603 case X86::CTEST16rr:
1604 case X86::CTEST32rr:
1605 case X86::CTEST64rr: {
1606 auto &Op0 = N->getOperand(0);
1607 if (Op0 != N->getOperand(1) || !Op0->hasNUsesOfValue(2, Op0.getResNo()) ||
1608 !Op0.isMachineOpcode())
1609 continue;
1610 SDValue And = N->getOperand(0);
1611 #define CASE_ND(OP) \
1612 case X86::OP: \
1613 case X86::OP##_ND:
1614 switch (And.getMachineOpcode()) {
1615 default:
1616 continue;
1617 CASE_ND(AND8rr)
1618 CASE_ND(AND16rr)
1619 CASE_ND(AND32rr)
1620 CASE_ND(AND64rr) {
1621 if (And->hasAnyUseOfValue(1))
1622 continue;
1623 SmallVector<SDValue> Ops(N->op_values());
1624 Ops[0] = And.getOperand(0);
1625 Ops[1] = And.getOperand(1);
1626 MachineSDNode *Test =
1627 CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, Ops);
1628 ReplaceUses(N, Test);
1629 MadeChange = true;
1630 continue;
1632 CASE_ND(AND8rm)
1633 CASE_ND(AND16rm)
1634 CASE_ND(AND32rm)
1635 CASE_ND(AND64rm) {
1636 if (And->hasAnyUseOfValue(1))
1637 continue;
1638 unsigned NewOpc;
1639 bool IsCTESTCC = X86::isCTESTCC(Opc);
1640 #define FROM_TO(A, B) \
1641 CASE_ND(A) NewOpc = IsCTESTCC ? X86::C##B : X86::B; \
1642 break;
1643 switch (And.getMachineOpcode()) {
1644 FROM_TO(AND8rm, TEST8mr);
1645 FROM_TO(AND16rm, TEST16mr);
1646 FROM_TO(AND32rm, TEST32mr);
1647 FROM_TO(AND64rm, TEST64mr);
1649 #undef FROM_TO
1650 #undef CASE_ND
1651 // Need to swap the memory and register operands.
1652 SmallVector<SDValue> Ops = {And.getOperand(1), And.getOperand(2),
1653 And.getOperand(3), And.getOperand(4),
1654 And.getOperand(5), And.getOperand(0)};
1655 // CC, Cflags.
1656 if (IsCTESTCC) {
1657 Ops.push_back(N->getOperand(2));
1658 Ops.push_back(N->getOperand(3));
1660 // Chain of memory load
1661 Ops.push_back(And.getOperand(6));
1662 // Glue
1663 if (IsCTESTCC)
1664 Ops.push_back(N->getOperand(4));
1666 MachineSDNode *Test = CurDAG->getMachineNode(
1667 NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops);
1668 CurDAG->setNodeMemRefs(
1669 Test, cast<MachineSDNode>(And.getNode())->memoperands());
1670 ReplaceUses(And.getValue(2), SDValue(Test, 1));
1671 ReplaceUses(SDValue(N, 0), SDValue(Test, 0));
1672 MadeChange = true;
1673 continue;
1677 // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1678 // used. We're doing this late so we can prefer to fold the AND into masked
1679 // comparisons. Doing that can be better for the live range of the mask
1680 // register.
1681 case X86::KORTESTBkk:
1682 case X86::KORTESTWkk:
1683 case X86::KORTESTDkk:
1684 case X86::KORTESTQkk: {
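// Sketch of the rewrite (hypothetical mask registers): when only ZF is read,
//   %k0 = KAND %k1, %k2; KORTEST %k0, %k0
// becomes
//   KTEST %k1, %k2
// since KTEST sets ZF exactly when the AND of its operands is zero.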
1685 SDValue Op0 = N->getOperand(0);
1686 if (Op0 != N->getOperand(1) || !N->isOnlyUserOf(Op0.getNode()) ||
1687 !Op0.isMachineOpcode() || !onlyUsesZeroFlag(SDValue(N, 0)))
1688 continue;
1689 #define CASE(A) \
1690 case X86::A: \
1691 break;
1692 switch (Op0.getMachineOpcode()) {
1693 default:
1694 continue;
1695 CASE(KANDBkk)
1696 CASE(KANDWkk)
1697 CASE(KANDDkk)
1698 CASE(KANDQkk)
1700 unsigned NewOpc;
1701 #define FROM_TO(A, B) \
1702 case X86::A: \
1703 NewOpc = X86::B; \
1704 break;
1705 switch (Opc) {
1706 FROM_TO(KORTESTBkk, KTESTBkk)
1707 FROM_TO(KORTESTWkk, KTESTWkk)
1708 FROM_TO(KORTESTDkk, KTESTDkk)
1709 FROM_TO(KORTESTQkk, KTESTQkk)
1711 // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1712 // KAND instructions and KTEST use the same ISA feature.
1713 if (NewOpc == X86::KTESTWkk && !Subtarget->hasDQI())
1714 continue;
1715 #undef FROM_TO
1716 MachineSDNode *KTest = CurDAG->getMachineNode(
1717 NewOpc, SDLoc(N), MVT::i32, Op0.getOperand(0), Op0.getOperand(1));
1718 ReplaceUses(N, KTest);
1719 MadeChange = true;
1720 continue;
1722 // Attempt to remove vector moves that were inserted to zero the upper bits.
1723 case TargetOpcode::SUBREG_TO_REG: {
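// Sketch of the pattern being cleaned up (hypothetical values):
//   %x = <VEX/EVEX/XOP-encoded vector op> ...
//   %m = VMOV*rr %x
//   SUBREG_TO_REG 0, %m, sub_xmm
// The VEX/EVEX producer already zeroed the upper bits, so SUBREG_TO_REG can
// use %x directly and the move can be dropped.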
1724 unsigned SubRegIdx = N->getConstantOperandVal(2);
1725 if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
1726 continue;
1728 SDValue Move = N->getOperand(1);
1729 if (!Move.isMachineOpcode())
1730 continue;
1732 // Make sure it's one of the move opcodes we recognize.
1733 switch (Move.getMachineOpcode()) {
1734 default:
1735 continue;
1736 CASE(VMOVAPDrr) CASE(VMOVUPDrr)
1737 CASE(VMOVAPSrr) CASE(VMOVUPSrr)
1738 CASE(VMOVDQArr) CASE(VMOVDQUrr)
1739 CASE(VMOVAPDYrr) CASE(VMOVUPDYrr)
1740 CASE(VMOVAPSYrr) CASE(VMOVUPSYrr)
1741 CASE(VMOVDQAYrr) CASE(VMOVDQUYrr)
1742 CASE(VMOVAPDZ128rr) CASE(VMOVUPDZ128rr)
1743 CASE(VMOVAPSZ128rr) CASE(VMOVUPSZ128rr)
1744 CASE(VMOVDQA32Z128rr) CASE(VMOVDQU32Z128rr)
1745 CASE(VMOVDQA64Z128rr) CASE(VMOVDQU64Z128rr)
1746 CASE(VMOVAPDZ256rr) CASE(VMOVUPDZ256rr)
1747 CASE(VMOVAPSZ256rr) CASE(VMOVUPSZ256rr)
1748 CASE(VMOVDQA32Z256rr) CASE(VMOVDQU32Z256rr)
1749 CASE(VMOVDQA64Z256rr) CASE(VMOVDQU64Z256rr)
1751 #undef CASE
1753 SDValue In = Move.getOperand(0);
1754 if (!In.isMachineOpcode() ||
1755 In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
1756 continue;
1758 // Make sure the instruction has a VEX, XOP, or EVEX prefix. This excludes
1759 // legacy-encoded instructions such as SHA, which do not zero the upper bits.
1760 uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1761 if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1762 (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
1763 (TSFlags & X86II::EncodingMask) != X86II::XOP)
1764 continue;
1766 // The producing instruction is another vector instruction, so we can drop
1767 // the move.
1768 CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1769 MadeChange = true;
1774 if (MadeChange)
1775 CurDAG->RemoveDeadNodes();
1779 /// Emit any code that needs to be executed only in the main function.
1780 void X86DAGToDAGISel::emitSpecialCodeForMain() {
1781 if (Subtarget->isTargetCygMing()) {
1782 TargetLowering::ArgListTy Args;
1783 auto &DL = CurDAG->getDataLayout();
1785 TargetLowering::CallLoweringInfo CLI(*CurDAG);
1786 CLI.setChain(CurDAG->getRoot())
1787 .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1788 CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1789 std::move(Args));
1790 const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1791 std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1792 CurDAG->setRoot(Result.second);
1796 void X86DAGToDAGISel::emitFunctionEntryCode() {
1797 // If this is main, emit special code for main.
1798 const Function &F = MF->getFunction();
1799 if (F.hasExternalLinkage() && F.getName() == "main")
1800 emitSpecialCodeForMain();
1803 static bool isDispSafeForFrameIndex(int64_t Val) {
1804 // On 64-bit platforms, we can run into an issue where a frame index
1805 // includes a displacement that, when added to the explicit displacement,
1806 // will overflow the displacement field. Assuming that the frame index
1807 // displacement fits into a 31-bit integer (which is only slightly more
1808 // aggressive than the current fundamental assumption that it fits into
1809 // a 32-bit integer), a 31-bit disp should always be safe.
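// For example, a combined displacement of (1 << 30) - 1 is accepted here,
// while (1 << 30) itself is rejected, leaving headroom for whatever the
// frame index later contributes.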
1810 return isInt<31>(Val);
1813 bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1814 X86ISelAddressMode &AM) {
1815 // We may have already matched a displacement and the caller just added the
1816 // symbolic displacement. So we still need to do the checks even if Offset
1817 // is zero.
1819 int64_t Val = AM.Disp + Offset;
1821 // Cannot combine ExternalSymbol displacements with integer offsets.
1822 if (Val != 0 && (AM.ES || AM.MCSym))
1823 return true;
1825 CodeModel::Model M = TM.getCodeModel();
1826 if (Subtarget->is64Bit()) {
1827 if (Val != 0 &&
1828 !X86::isOffsetSuitableForCodeModel(Val, M,
1829 AM.hasSymbolicDisplacement()))
1830 return true;
1831 // In addition to the checks required for a register base, check that
1832 // we do not try to use an unsafe Disp with a frame index.
1833 if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1834 !isDispSafeForFrameIndex(Val))
1835 return true;
1836 // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
1837 // 64 bits. Instructions with 32-bit register addresses perform this zero
1838 // extension for us and we can safely ignore the high bits of Offset.
1839 // Instructions with only a 32-bit immediate address do not, though: they
1840 // sign extend instead. This means only the low 2GB of the address space is
1841 // directly addressable; we need indirect addressing for the high 2GB of
1842 // address space.
1843 // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
1844 // implicit zero extension of instructions would cover up any problem.
1845 // However, we have asserts elsewhere that get triggered if we do, so keep
1846 // the checks for now.
1847 // TODO: We would actually be able to accept these, as well as the same
1848 // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
1849 // to get an address size override to be emitted. However, this
1850 // pseudo-register is not part of any register class and therefore causes
1851 // MIR verification to fail.
1852 if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
1853 !AM.hasBaseOrIndexReg())
1854 return true;
1856 AM.Disp = Val;
1857 return false;
1860 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
1861 bool AllowSegmentRegForX32) {
1862 SDValue Address = N->getOperand(1);
1864 // load gs:0 -> GS segment register.
1865 // load fs:0 -> FS segment register.
1867 // This optimization is generally valid because the GNU TLS model defines that
1868 // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode
1869 // with 32-bit registers, as we get in ILP32 mode, those registers are first
1870 // zero-extended to 64 bits and then added to the base address, which gives
1871 // unwanted results when the register holds a negative value.
1872 // For more information see http://people.redhat.com/drepper/tls.pdf
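// Illustrative local-exec sequence on x86-64 ELF (exact relocations may
// differ): the address of a TLS variable x is commonly formed as
//   movq %fs:0, %rax
//   leaq x@tpoff(%rax), %rax
// Folding the first load into a segment register lets the access collapse
// into a single %fs-relative memory operand.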
1873 if (isNullConstant(Address) && AM.Segment.getNode() == nullptr &&
1874 !IndirectTlsSegRefs &&
1875 (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
1876 Subtarget->isTargetFuchsia())) {
1877 if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32)
1878 return true;
1879 switch (N->getPointerInfo().getAddrSpace()) {
1880 case X86AS::GS:
1881 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1882 return false;
1883 case X86AS::FS:
1884 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1885 return false;
1886 // Address space X86AS::SS is not handled here, because it is not used to
1887 // address TLS areas.
1891 return true;
1894 /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1895 /// mode. These wrap things that will resolve down into a symbol reference.
1896 /// If no match is possible, this returns true, otherwise it returns false.
1897 bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1898 // If the addressing mode already has a symbol as the displacement, we can
1899 // never match another symbol.
1900 if (AM.hasSymbolicDisplacement())
1901 return true;
1903 bool IsRIPRelTLS = false;
1904 bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1905 if (IsRIPRel) {
1906 SDValue Val = N.getOperand(0);
1907 if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1908 IsRIPRelTLS = true;
1911 // We can't use an addressing mode in the 64-bit large code model.
1912 // Global TLS addressing is an exception. In the medium code model,
1913 // we can use a mode when RIP wrappers are present.
1914 // That signifies access to globals that are known to be "near",
1915 // such as the GOT itself.
1916 CodeModel::Model M = TM.getCodeModel();
1917 if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS)
1918 return true;
1920 // Base and index reg must be 0 in order to use %rip as base.
1921 if (IsRIPRel && AM.hasBaseOrIndexReg())
1922 return true;
1924 // Make a local copy in case we can't do this fold.
1925 X86ISelAddressMode Backup = AM;
1927 int64_t Offset = 0;
1928 SDValue N0 = N.getOperand(0);
1929 if (auto *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1930 AM.GV = G->getGlobal();
1931 AM.SymbolFlags = G->getTargetFlags();
1932 Offset = G->getOffset();
1933 } else if (auto *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
1934 AM.CP = CP->getConstVal();
1935 AM.Alignment = CP->getAlign();
1936 AM.SymbolFlags = CP->getTargetFlags();
1937 Offset = CP->getOffset();
1938 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
1939 AM.ES = S->getSymbol();
1940 AM.SymbolFlags = S->getTargetFlags();
1941 } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1942 AM.MCSym = S->getMCSymbol();
1943 } else if (auto *J = dyn_cast<JumpTableSDNode>(N0)) {
1944 AM.JT = J->getIndex();
1945 AM.SymbolFlags = J->getTargetFlags();
1946 } else if (auto *BA = dyn_cast<BlockAddressSDNode>(N0)) {
1947 AM.BlockAddr = BA->getBlockAddress();
1948 AM.SymbolFlags = BA->getTargetFlags();
1949 Offset = BA->getOffset();
1950 } else
1951 llvm_unreachable("Unhandled symbol reference node.");
1953 // Can't use an addressing mode with large globals.
1954 if (Subtarget->is64Bit() && !IsRIPRel && AM.GV &&
1955 TM.isLargeGlobalValue(AM.GV)) {
1956 AM = Backup;
1957 return true;
1960 if (foldOffsetIntoAddress(Offset, AM)) {
1961 AM = Backup;
1962 return true;
1965 if (IsRIPRel)
1966 AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1968 // Commit the changes now that we know this fold is safe.
1969 return false;
1972 /// Add the specified node to the specified addressing mode, returning true if
1973 /// it cannot be done. This just pattern matches for the addressing mode.
1974 bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1975 if (matchAddressRecursively(N, AM, 0))
1976 return true;
1978 // Post-processing: Make a second attempt to fold a load, if we now know
1979 // that there will not be any other register. This is only performed for
1980 // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded
1981 // any foldable load the first time.
1982 if (Subtarget->isTarget64BitILP32() &&
1983 AM.BaseType == X86ISelAddressMode::RegBase &&
1984 AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) {
1985 SDValue Save_Base_Reg = AM.Base_Reg;
1986 if (auto *LoadN = dyn_cast<LoadSDNode>(Save_Base_Reg)) {
1987 AM.Base_Reg = SDValue();
1988 if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true))
1989 AM.Base_Reg = Save_Base_Reg;
1993 // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1994 // a smaller encoding and avoids a scaled-index.
1995 if (AM.Scale == 2 &&
1996 AM.BaseType == X86ISelAddressMode::RegBase &&
1997 AM.Base_Reg.getNode() == nullptr) {
1998 AM.Base_Reg = AM.IndexReg;
1999 AM.Scale = 1;
2002 // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
2003 // because it has a smaller encoding.
2004 if (TM.getCodeModel() != CodeModel::Large &&
2005 (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() &&
2006 AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase &&
2007 AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr &&
2008 AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) {
2009 // However, when GV is a local function symbol and in the same section as
2010 // the current instruction, and AM.Disp is negative and near INT32_MIN,
2011 // referencing GV+Disp generates a relocation referencing the section symbol
2012 // with an even smaller offset, which might underflow. We should bail out if
2013 // the negative offset is too close to INT32_MIN. Actually, we are more
2014 // conservative here, using a smaller magic number also used by
2015 // isOffsetSuitableForCodeModel.
2016 if (isa_and_nonnull<Function>(AM.GV) && AM.Disp < -16 * 1024 * 1024)
2017 return true;
2019 AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
2022 return false;
2025 bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
2026 unsigned Depth) {
2027 // Add an artificial use to this node so that we can keep track of
2028 // it if it gets CSE'd with a different node.
2029 HandleSDNode Handle(N);
2031 X86ISelAddressMode Backup = AM;
2032 if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
2033 !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
2034 return false;
2035 AM = Backup;
2037 // Try again after commuting the operands.
2038 if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
2039 Depth + 1) &&
2040 !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
2041 return false;
2042 AM = Backup;
2044 // If we couldn't fold both operands into the address at the same time,
2045 // see if we can just put each operand into a register and fold at least
2046 // the add.
2047 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2048 !AM.Base_Reg.getNode() &&
2049 !AM.IndexReg.getNode()) {
2050 N = Handle.getValue();
2051 AM.Base_Reg = N.getOperand(0);
2052 AM.IndexReg = N.getOperand(1);
2053 AM.Scale = 1;
2054 return false;
2056 N = Handle.getValue();
2057 return true;
2060 // Insert a node into the DAG at least before the Pos node's position. This
2061 // will reposition the node as needed, and will assign it a node ID that is <=
2062 // the Pos node's ID. Note that this does *not* preserve the uniqueness of node
2063 // IDs! The selection DAG must no longer depend on their uniqueness when this
2064 // is used.
2065 static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
2066 if (N->getNodeId() == -1 ||
2067 (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
2068 SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
2069 DAG.RepositionNode(Pos->getIterator(), N.getNode());
2070 // Mark Node as invalid for pruning as after this it may be a successor to a
2071 // selected node but otherwise be in the same position as Pos.
2072 // Conservatively mark it with the same -abs(Id) to assure node id
2073 // invariant is preserved.
2074 N->setNodeId(Pos->getNodeId());
2075 SelectionDAGISel::InvalidateNodeId(N.getNode());
2079 // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
2080 // safe. This allows us to convert the shift and the AND into an h-register
2081 // extract and a scaled index. Returns false if the simplification is
2082 // performed.
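// Worked example (assuming C1 == 2): "(X >> 6) & 0x3fc" becomes
// "((X >> 8) & 0xff) << 2", i.e. the h-register extract (X >> 8) & 0xff
// lands in the index register and the "<< 2" becomes a scale of 4.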
2083 static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
2084 uint64_t Mask,
2085 SDValue Shift, SDValue X,
2086 X86ISelAddressMode &AM) {
2087 if (Shift.getOpcode() != ISD::SRL ||
2088 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2089 !Shift.hasOneUse())
2090 return true;
2092 int ScaleLog = 8 - Shift.getConstantOperandVal(1);
2093 if (ScaleLog <= 0 || ScaleLog >= 4 ||
2094 Mask != (0xffu << ScaleLog))
2095 return true;
2097 MVT XVT = X.getSimpleValueType();
2098 MVT VT = N.getSimpleValueType();
2099 SDLoc DL(N);
2100 SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
2101 SDValue NewMask = DAG.getConstant(0xff, DL, XVT);
2102 SDValue Srl = DAG.getNode(ISD::SRL, DL, XVT, X, Eight);
2103 SDValue And = DAG.getNode(ISD::AND, DL, XVT, Srl, NewMask);
2104 SDValue Ext = DAG.getZExtOrTrunc(And, DL, VT);
2105 SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
2106 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Ext, ShlCount);
2108 // Insert the new nodes into the topological ordering. We must do this in
2109 // a valid topological ordering as nothing is going to go back and re-sort
2110 // these nodes. We continually insert before 'N' in sequence as this is
2111 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2112 // hierarchy left to express.
2113 insertDAGNode(DAG, N, Eight);
2114 insertDAGNode(DAG, N, NewMask);
2115 insertDAGNode(DAG, N, Srl);
2116 insertDAGNode(DAG, N, And);
2117 insertDAGNode(DAG, N, Ext);
2118 insertDAGNode(DAG, N, ShlCount);
2119 insertDAGNode(DAG, N, Shl);
2120 DAG.ReplaceAllUsesWith(N, Shl);
2121 DAG.RemoveDeadNode(N.getNode());
2122 AM.IndexReg = Ext;
2123 AM.Scale = (1 << ScaleLog);
2124 return false;
2127 // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
2128 // allows us to fold the shift into this addressing mode. Returns false if the
2129 // transform succeeded.
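// Worked example (assuming C1 == 3, C2 == 0xf8): "(X << 3) & 0xf8" becomes
// "(X & 0x1f) << 3", so the AND stays on the index register and the "<< 3"
// is absorbed as a scale of 8.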
2130 static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
2131 X86ISelAddressMode &AM) {
2132 SDValue Shift = N.getOperand(0);
2134 // Use a signed mask so that shifting right will insert sign bits. These
2135 // bits will be removed when we shift the result left so it doesn't matter
2136 // what we use. This might allow a smaller immediate encoding.
2137 int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
2139 // If we have an any_extend feeding the AND, look through it to see if there
2140 // is a shift behind it. But only if the AND doesn't use the extended bits.
2141 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
2142 bool FoundAnyExtend = false;
2143 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
2144 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
2145 isUInt<32>(Mask)) {
2146 FoundAnyExtend = true;
2147 Shift = Shift.getOperand(0);
2150 if (Shift.getOpcode() != ISD::SHL ||
2151 !isa<ConstantSDNode>(Shift.getOperand(1)))
2152 return true;
2154 SDValue X = Shift.getOperand(0);
2156 // Not likely to be profitable if either the AND or SHIFT node has more
2157 // than one use (unless all uses are for address computation). Besides,
2158 // the isel mechanism requires their node IDs to be reused.
2159 if (!N.hasOneUse() || !Shift.hasOneUse())
2160 return true;
2162 // Verify that the shift amount is something we can fold.
2163 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2164 if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
2165 return true;
2167 MVT VT = N.getSimpleValueType();
2168 SDLoc DL(N);
2169 if (FoundAnyExtend) {
2170 SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
2171 insertDAGNode(DAG, N, NewX);
2172 X = NewX;
2175 SDValue NewMask = DAG.getSignedConstant(Mask >> ShiftAmt, DL, VT);
2176 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
2177 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
2179 // Insert the new nodes into the topological ordering. We must do this in
2180 // a valid topological ordering as nothing is going to go back and re-sort
2181 // these nodes. We continually insert before 'N' in sequence as this is
2182 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2183 // hierarchy left to express.
2184 insertDAGNode(DAG, N, NewMask);
2185 insertDAGNode(DAG, N, NewAnd);
2186 insertDAGNode(DAG, N, NewShift);
2187 DAG.ReplaceAllUsesWith(N, NewShift);
2188 DAG.RemoveDeadNode(N.getNode());
2190 AM.Scale = 1 << ShiftAmt;
2191 AM.IndexReg = NewAnd;
2192 return false;
2195 // Implement some heroics to detect shifts of masked values where the mask can
2196 // be replaced by extending the shift and undoing that in the addressing mode
2197 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
2198 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
2199 // the addressing mode. This results in code such as:
2201 // int f(short *y, int *lookup_table) {
2202 // ...
2203 // return *y + lookup_table[*y >> 11];
2204 // }
2206 // Turning into:
2207 // movzwl (%rdi), %eax
2208 // movl %eax, %ecx
2209 // shrl $11, %ecx
2210 // addl (%rsi,%rcx,4), %eax
2212 // Instead of:
2213 // movzwl (%rdi), %eax
2214 // movl %eax, %ecx
2215 // shrl $9, %ecx
2216 // andl $124, %ecx
2217 // addl (%rsi,%rcx), %eax
2219 // Note that this function assumes the mask is provided as a mask *after* the
2220 // value is shifted. The input chain may or may not match that, but computing
2221 // such a mask is trivial.
2222 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
2223 uint64_t Mask,
2224 SDValue Shift, SDValue X,
2225 X86ISelAddressMode &AM) {
2226 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
2227 !isa<ConstantSDNode>(Shift.getOperand(1)))
2228 return true;
2231 // We need to ensure that the mask is a contiguous run of bits.
2231 unsigned MaskIdx, MaskLen;
2232 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2233 return true;
2234 unsigned MaskLZ = 64 - (MaskIdx + MaskLen);
2236 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2238 // The amount of shift we're trying to fit into the addressing mode is taken
2239 // from the shifted mask index (number of trailing zeros of the mask).
2240 unsigned AMShiftAmt = MaskIdx;
2242 // There is nothing we can do here unless the mask is removing some bits.
2243 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2244 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2246 // Scale the leading zero count down based on the actual size of the value.
2247 // Also scale it down based on the size of the shift.
2248 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
2249 if (MaskLZ < ScaleDown)
2250 return true;
2251 MaskLZ -= ScaleDown;
2253 // The final check is to ensure that any masked out high bits of X are
2254 // already known to be zero. Otherwise, the mask has a semantic impact
2255 // other than masking out a couple of low bits. Unfortunately, because of
2256 // the mask, zero extensions will be removed from operands in some cases.
2257 // This code works extra hard to look through extensions because we can
2258 // replace them with zero extensions cheaply if necessary.
2259 bool ReplacingAnyExtend = false;
2260 if (X.getOpcode() == ISD::ANY_EXTEND) {
2261 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
2262 X.getOperand(0).getSimpleValueType().getSizeInBits();
2263 // Assume that we'll replace the any-extend with a zero-extend, and
2264 // narrow the search to the extended value.
2265 X = X.getOperand(0);
2266 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
2267 ReplacingAnyExtend = true;
2269 APInt MaskedHighBits =
2270 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
2271 if (!DAG.MaskedValueIsZero(X, MaskedHighBits))
2272 return true;
2274 // We've identified a pattern that can be transformed into a single shift
2275 // and an addressing mode. Make it so.
2276 MVT VT = N.getSimpleValueType();
2277 if (ReplacingAnyExtend) {
2278 assert(X.getValueType() != VT);
2279 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
2280 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
2281 insertDAGNode(DAG, N, NewX);
2282 X = NewX;
2285 MVT XVT = X.getSimpleValueType();
2286 SDLoc DL(N);
2287 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2288 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2289 SDValue NewExt = DAG.getZExtOrTrunc(NewSRL, DL, VT);
2290 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2291 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2293 // Insert the new nodes into the topological ordering. We must do this in
2294 // a valid topological ordering as nothing is going to go back and re-sort
2295 // these nodes. We continually insert before 'N' in sequence as this is
2296 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2297 // hierarchy left to express.
2298 insertDAGNode(DAG, N, NewSRLAmt);
2299 insertDAGNode(DAG, N, NewSRL);
2300 insertDAGNode(DAG, N, NewExt);
2301 insertDAGNode(DAG, N, NewSHLAmt);
2302 insertDAGNode(DAG, N, NewSHL);
2303 DAG.ReplaceAllUsesWith(N, NewSHL);
2304 DAG.RemoveDeadNode(N.getNode());
2306 AM.Scale = 1 << AMShiftAmt;
2307 AM.IndexReg = NewExt;
2308 return false;
2311 // Transform "(X >> SHIFT) & (MASK << C1)" to
2312 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
2313 // matched to a BEXTR later. Returns false if the simplification is performed.
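// Worked example (assuming SHIFT == 5 and a mask of 0x3fc, i.e. C1 == 2):
// "(X >> 5) & 0x3fc" becomes "((X >> 7) & 0xff) << 2"; the SRL+AND ahead of
// the SHL is what can later match as BEXTR, and the "<< 2" becomes a scale
// of 4.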
2314 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
2315 uint64_t Mask,
2316 SDValue Shift, SDValue X,
2317 X86ISelAddressMode &AM,
2318 const X86Subtarget &Subtarget) {
2319 if (Shift.getOpcode() != ISD::SRL ||
2320 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
2321 !Shift.hasOneUse() || !N.hasOneUse())
2322 return true;
2324 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
2325 if (!Subtarget.hasTBM() &&
2326 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
2327 return true;
2330 // We need to ensure that the mask is a contiguous run of bits.
2330 unsigned MaskIdx, MaskLen;
2331 if (!isShiftedMask_64(Mask, MaskIdx, MaskLen))
2332 return true;
2334 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
2336 // The amount of shift we're trying to fit into the addressing mode is taken
2337 // from the shifted mask index (number of trailing zeros of the mask).
2338 unsigned AMShiftAmt = MaskIdx;
2340 // There is nothing we can do here unless the mask is removing some bits.
2341 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
2342 if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
2344 MVT XVT = X.getSimpleValueType();
2345 MVT VT = N.getSimpleValueType();
2346 SDLoc DL(N);
2347 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
2348 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, XVT, X, NewSRLAmt);
2349 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, XVT);
2350 SDValue NewAnd = DAG.getNode(ISD::AND, DL, XVT, NewSRL, NewMask);
2351 SDValue NewExt = DAG.getZExtOrTrunc(NewAnd, DL, VT);
2352 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
2353 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewExt, NewSHLAmt);
2355 // Insert the new nodes into the topological ordering. We must do this in
2356 // a valid topological ordering as nothing is going to go back and re-sort
2357 // these nodes. We continually insert before 'N' in sequence as this is
2358 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
2359 // hierarchy left to express.
2360 insertDAGNode(DAG, N, NewSRLAmt);
2361 insertDAGNode(DAG, N, NewSRL);
2362 insertDAGNode(DAG, N, NewMask);
2363 insertDAGNode(DAG, N, NewAnd);
2364 insertDAGNode(DAG, N, NewExt);
2365 insertDAGNode(DAG, N, NewSHLAmt);
2366 insertDAGNode(DAG, N, NewSHL);
2367 DAG.ReplaceAllUsesWith(N, NewSHL);
2368 DAG.RemoveDeadNode(N.getNode());
2370 AM.Scale = 1 << AMShiftAmt;
2371 AM.IndexReg = NewExt;
2372 return false;
2375 // Attempt to peek further into a scaled index register, collecting additional
2376 // extensions / offsets / etc. Returns \p N if we can't peek any further.
2377 SDValue X86DAGToDAGISel::matchIndexRecursively(SDValue N,
2378 X86ISelAddressMode &AM,
2379 unsigned Depth) {
2380 assert(AM.IndexReg.getNode() == nullptr && "IndexReg already matched");
2381 assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
2382 "Illegal index scale");
2384 // Limit recursion.
2385 if (Depth >= SelectionDAG::MaxRecursionDepth)
2386 return N;
2388 EVT VT = N.getValueType();
2389 unsigned Opc = N.getOpcode();
2391 // index: add(x,c) -> index: x, disp + c
2392 if (CurDAG->isBaseWithConstantOffset(N)) {
2393 auto *AddVal = cast<ConstantSDNode>(N.getOperand(1));
2394 uint64_t Offset = (uint64_t)AddVal->getSExtValue() * AM.Scale;
2395 if (!foldOffsetIntoAddress(Offset, AM))
2396 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2399 // index: add(x,x) -> index: x, scale * 2
2400 if (Opc == ISD::ADD && N.getOperand(0) == N.getOperand(1)) {
2401 if (AM.Scale <= 4) {
2402 AM.Scale *= 2;
2403 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2407 // index: shl(x,i) -> index: x, scale * (1 << i)
2408 if (Opc == X86ISD::VSHLI) {
2409 uint64_t ShiftAmt = N.getConstantOperandVal(1);
2410 uint64_t ScaleAmt = 1ULL << ShiftAmt;
2411 if ((AM.Scale * ScaleAmt) <= 8) {
2412 AM.Scale *= ScaleAmt;
2413 return matchIndexRecursively(N.getOperand(0), AM, Depth + 1);
2417 // index: sext(add_nsw(x,c)) -> index: sext(x), disp + sext(c)
2418 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt sext?
2419 if (Opc == ISD::SIGN_EXTEND && !VT.isVector() && N.hasOneUse()) {
2420 SDValue Src = N.getOperand(0);
2421 if (Src.getOpcode() == ISD::ADD && Src->getFlags().hasNoSignedWrap() &&
2422 Src.hasOneUse()) {
2423 if (CurDAG->isBaseWithConstantOffset(Src)) {
2424 SDValue AddSrc = Src.getOperand(0);
2425 auto *AddVal = cast<ConstantSDNode>(Src.getOperand(1));
2426 int64_t Offset = AddVal->getSExtValue();
2427 if (!foldOffsetIntoAddress((uint64_t)Offset * AM.Scale, AM)) {
2428 SDLoc DL(N);
2429 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2430 SDValue ExtVal = CurDAG->getSignedConstant(Offset, DL, VT);
2431 SDValue ExtAdd = CurDAG->getNode(ISD::ADD, DL, VT, ExtSrc, ExtVal);
2432 insertDAGNode(*CurDAG, N, ExtSrc);
2433 insertDAGNode(*CurDAG, N, ExtVal);
2434 insertDAGNode(*CurDAG, N, ExtAdd);
2435 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2436 CurDAG->RemoveDeadNode(N.getNode());
2437 return ExtSrc;
2443 // index: zext(add_nuw(x,c)) -> index: zext(x), disp + zext(c)
2444 // index: zext(addlike(x,c)) -> index: zext(x), disp + zext(c)
2445 // TODO: call matchIndexRecursively(AddSrc) if we won't corrupt the zext?
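// For example (illustrative types): a zext to i64 of (add nuw i32 %x, 20)
// used under an existing scale of 4 becomes index = (zext %x), with
// 20 * 4 == 80 folded into the displacement.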
2446 if (Opc == ISD::ZERO_EXTEND && !VT.isVector() && N.hasOneUse()) {
2447 SDValue Src = N.getOperand(0);
2448 unsigned SrcOpc = Src.getOpcode();
2449 if (((SrcOpc == ISD::ADD && Src->getFlags().hasNoUnsignedWrap()) ||
2450 CurDAG->isADDLike(Src, /*NoWrap=*/true)) &&
2451 Src.hasOneUse()) {
2452 if (CurDAG->isBaseWithConstantOffset(Src)) {
2453 SDValue AddSrc = Src.getOperand(0);
2454 uint64_t Offset = Src.getConstantOperandVal(1);
2455 if (!foldOffsetIntoAddress(Offset * AM.Scale, AM)) {
2456 SDLoc DL(N);
2457 SDValue Res;
2458 // If we're also scaling, see if we can use that as well.
2459 if (AddSrc.getOpcode() == ISD::SHL &&
2460 isa<ConstantSDNode>(AddSrc.getOperand(1))) {
2461 SDValue ShVal = AddSrc.getOperand(0);
2462 uint64_t ShAmt = AddSrc.getConstantOperandVal(1);
2463 APInt HiBits =
2464 APInt::getHighBitsSet(AddSrc.getScalarValueSizeInBits(), ShAmt);
2465 uint64_t ScaleAmt = 1ULL << ShAmt;
2466 if ((AM.Scale * ScaleAmt) <= 8 &&
2467 (AddSrc->getFlags().hasNoUnsignedWrap() ||
2468 CurDAG->MaskedValueIsZero(ShVal, HiBits))) {
2469 AM.Scale *= ScaleAmt;
2470 SDValue ExtShVal = CurDAG->getNode(Opc, DL, VT, ShVal);
2471 SDValue ExtShift = CurDAG->getNode(ISD::SHL, DL, VT, ExtShVal,
2472 AddSrc.getOperand(1));
2473 insertDAGNode(*CurDAG, N, ExtShVal);
2474 insertDAGNode(*CurDAG, N, ExtShift);
2475 AddSrc = ExtShift;
2476 Res = ExtShVal;
2479 SDValue ExtSrc = CurDAG->getNode(Opc, DL, VT, AddSrc);
2480 SDValue ExtVal = CurDAG->getConstant(Offset, DL, VT);
2481 SDValue ExtAdd = CurDAG->getNode(SrcOpc, DL, VT, ExtSrc, ExtVal);
2482 insertDAGNode(*CurDAG, N, ExtSrc);
2483 insertDAGNode(*CurDAG, N, ExtVal);
2484 insertDAGNode(*CurDAG, N, ExtAdd);
2485 CurDAG->ReplaceAllUsesWith(N, ExtAdd);
2486 CurDAG->RemoveDeadNode(N.getNode());
2487 return Res ? Res : ExtSrc;
2493 // TODO: Handle extensions, shifted masks etc.
2494 return N;
2497 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
2498 unsigned Depth) {
2499 SDLoc dl(N);
2500 LLVM_DEBUG({
2501 dbgs() << "MatchAddress: ";
2502 AM.dump(CurDAG);
2504 // Limit recursion.
2505 if (Depth >= SelectionDAG::MaxRecursionDepth)
2506 return matchAddressBase(N, AM);
2508 // If this is already a %rip relative address, we can only merge immediates
2509 // into it. Instead of handling this in every case, we handle it here.
2510 // RIP relative addressing: %rip + 32-bit displacement!
2511 if (AM.isRIPRelative()) {
2512 // FIXME: JumpTable and ExternalSymbol address currently don't like
2513 // displacements. It isn't very important, but this should be fixed for
2514 // consistency.
2515 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
2516 return true;
2518 if (auto *Cst = dyn_cast<ConstantSDNode>(N))
2519 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
2520 return false;
2521 return true;
2524 switch (N.getOpcode()) {
2525 default: break;
2526 case ISD::LOCAL_RECOVER: {
2527 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
2528 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
2529 // Use the symbol and don't prefix it.
2530 AM.MCSym = ESNode->getMCSymbol();
2531 return false;
2533 break;
2535 case ISD::Constant: {
2536 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2537 if (!foldOffsetIntoAddress(Val, AM))
2538 return false;
2539 break;
2542 case X86ISD::Wrapper:
2543 case X86ISD::WrapperRIP:
2544 if (!matchWrapper(N, AM))
2545 return false;
2546 break;
2548 case ISD::LOAD:
2549 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
2550 return false;
2551 break;
2553 case ISD::FrameIndex:
2554 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2555 AM.Base_Reg.getNode() == nullptr &&
2556 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
2557 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
2558 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
2559 return false;
2561 break;
2563 case ISD::SHL:
2564 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2565 break;
2567 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
2568 unsigned Val = CN->getZExtValue();
2569 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
2570 // that the base operand remains free for further matching. If
2571 // the base doesn't end up getting used, a post-processing step
2572 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
2573 if (Val == 1 || Val == 2 || Val == 3) {
2574 SDValue ShVal = N.getOperand(0);
2575 AM.Scale = 1 << Val;
2576 AM.IndexReg = matchIndexRecursively(ShVal, AM, Depth + 1);
2577 return false;
2580 break;
2582 case ISD::SRL: {
2583 // Scale must not be used already.
2584 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2586 // We only handle up to 64-bit values here as those are what matter for
2587 // addressing mode optimizations.
2588 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2589 "Unexpected value size!");
2591 SDValue And = N.getOperand(0);
2592 if (And.getOpcode() != ISD::AND) break;
2593 SDValue X = And.getOperand(0);
2595 // The mask used for the transform is expected to be post-shift, but we
2596 // found the shift first so just apply the shift to the mask before passing
2597 // it down.
2598 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
2599 !isa<ConstantSDNode>(And.getOperand(1)))
2600 break;
2601 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
2603 // Try to fold the mask and shift into the scale, and return false if we
2604 // succeed.
2605 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
2606 return false;
2607 break;
2610 case ISD::SMUL_LOHI:
2611 case ISD::UMUL_LOHI:
2612 // A mul_lohi where we need the low part can be folded as a plain multiply.
2613 if (N.getResNo() != 0) break;
2614 [[fallthrough]];
2615 case ISD::MUL:
2616 case X86ISD::MUL_IMM:
2617 // X*[3,5,9] -> X+X*[2,4,8]
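// For example (hypothetical register), X*5 can be selected as
// lea (%rdi,%rdi,4), i.e. base = X, index = X, scale = 4.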
2618 if (AM.BaseType == X86ISelAddressMode::RegBase &&
2619 AM.Base_Reg.getNode() == nullptr &&
2620 AM.IndexReg.getNode() == nullptr) {
2621 if (auto *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
2622 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
2623 CN->getZExtValue() == 9) {
2624 AM.Scale = unsigned(CN->getZExtValue())-1;
2626 SDValue MulVal = N.getOperand(0);
2627 SDValue Reg;
2629 // Okay, we know that we have a scale by now. However, if the scaled
2630 // value is an add of something and a constant, we can fold the
2631 // constant into the disp field here.
2632 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
2633 isa<ConstantSDNode>(MulVal.getOperand(1))) {
2634 Reg = MulVal.getOperand(0);
2635 auto *AddVal = cast<ConstantSDNode>(MulVal.getOperand(1));
2636 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
2637 if (foldOffsetIntoAddress(Disp, AM))
2638 Reg = N.getOperand(0);
2639 } else {
2640 Reg = N.getOperand(0);
2643 AM.IndexReg = AM.Base_Reg = Reg;
2644 return false;
2647 break;
2649 case ISD::SUB: {
2650 // Given A-B, if A can be completely folded into the address, leaving the
2651 // index field unused, use -B as the index.
2652 // This is a win if A has multiple parts that can be folded into
2653 // the address. Also, this saves a mov if the base register has
2654 // other uses, since it avoids a two-address sub instruction, however
2655 // it costs an additional mov if the index register has other uses.
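// Sketch (hypothetical registers): for "p - i" where p folds entirely into
// base/displacement, the address can be formed as
//   negq %rcx                       ; i
//   leaq disp(%rbase,%rcx), %rax
// at the cost of the extra neg, which the accounting below weighs against
// the folds this enables.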
2657 // Add an artificial use to this node so that we can keep track of
2658 // it if it gets CSE'd with a different node.
2659 HandleSDNode Handle(N);
2661 // Test if the LHS of the sub can be folded.
2662 X86ISelAddressMode Backup = AM;
2663 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2664 N = Handle.getValue();
2665 AM = Backup;
2666 break;
2668 N = Handle.getValue();
2669 // Test if the index field is free for use.
2670 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2671 AM = Backup;
2672 break;
2675 int Cost = 0;
2676 SDValue RHS = N.getOperand(1);
2677 // If the RHS involves a register with multiple uses, this
2678 // transformation incurs an extra mov, due to the neg instruction
2679 // clobbering its operand.
2680 if (!RHS.getNode()->hasOneUse() ||
2681 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2682 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2683 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2684 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2685 RHS.getOperand(0).getValueType() == MVT::i32))
2686 ++Cost;
2687 // If the base is a register with multiple uses, this
2688 // transformation may save a mov.
2689 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2690 !AM.Base_Reg.getNode()->hasOneUse()) ||
2691 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2692 --Cost;
2693 // If the folded LHS was interesting, this transformation saves
2694 // address arithmetic.
2695 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2696 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2697 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2698 --Cost;
2699 // If it doesn't look like it may be an overall win, don't do it.
2700 if (Cost >= 0) {
2701 AM = Backup;
2702 break;
2705 // Ok, the transformation is legal and appears profitable. Go for it.
2706 // Negation will be emitted later to avoid creating dangling nodes if this
2707 // was an unprofitable LEA.
2708 AM.IndexReg = RHS;
2709 AM.NegateIndex = true;
2710 AM.Scale = 1;
2711 return false;
2714 case ISD::OR:
2715 case ISD::XOR:
2716 // See if we can treat the OR/XOR node as an ADD node.
2717 if (!CurDAG->isADDLike(N))
2718 break;
2719 [[fallthrough]];
2720 case ISD::ADD:
2721 if (!matchAdd(N, AM, Depth))
2722 return false;
2723 break;
2725 case ISD::AND: {
2726 // Perform some heroic transforms on an and of a constant-count shift
2727 // with a constant to enable use of the scaled offset field.
2729 // Scale must not be used already.
2730 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2732 // We only handle up to 64-bit values here as those are what matter for
2733 // addressing mode optimizations.
2734 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2735 "Unexpected value size!");
2737 if (!isa<ConstantSDNode>(N.getOperand(1)))
2738 break;
2740 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2741 SDValue Shift = N.getOperand(0);
2742 SDValue X = Shift.getOperand(0);
2744 uint64_t Mask = N.getConstantOperandVal(1);
2746 // Try to fold the mask and shift into an extract and scale.
2747 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2748 return false;
2750 // Try to fold the mask and shift directly into the scale.
2751 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2752 return false;
2754 // Try to fold the mask and shift into BEXTR and scale.
2755 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2756 return false;
2759 // Try to swap the mask and shift to place shifts which can be done as
2760 // a scale on the outside of the mask.
2761 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2762 return false;
2764 break;
2766 case ISD::ZERO_EXTEND: {
2767 // Try to widen a zexted shift left to the same size as its use, so we can
2768 // match the shift as a scale factor.
2769 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2770 break;
2772 SDValue Src = N.getOperand(0);
2774 // See if we can match a zext(addlike(x,c)).
2775 // TODO: Move more ZERO_EXTEND patterns into matchIndexRecursively.
2776 if (Src.getOpcode() == ISD::ADD || Src.getOpcode() == ISD::OR)
2777 if (SDValue Index = matchIndexRecursively(N, AM, Depth + 1))
2778 if (Index != N) {
2779 AM.IndexReg = Index;
2780 return false;
2783 // Peek through mask: zext(and(shl(x,c1),c2))
2784 APInt Mask = APInt::getAllOnes(Src.getScalarValueSizeInBits());
2785 if (Src.getOpcode() == ISD::AND && Src.hasOneUse())
2786 if (auto *MaskC = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
2787 Mask = MaskC->getAPIntValue();
2788 Src = Src.getOperand(0);
2791 if (Src.getOpcode() == ISD::SHL && Src.hasOneUse() && N->hasOneUse()) {
2792 // Give up if the shift is not a valid scale factor [1,2,3].
2793 SDValue ShlSrc = Src.getOperand(0);
2794 SDValue ShlAmt = Src.getOperand(1);
2795 auto *ShAmtC = dyn_cast<ConstantSDNode>(ShlAmt);
2796 if (!ShAmtC)
2797 break;
2798 unsigned ShAmtV = ShAmtC->getZExtValue();
2799 if (ShAmtV > 3)
2800 break;
2802 // The narrow shift must only shift out zero bits (it must be 'nuw').
2803 // That makes it safe to widen to the destination type.
2804 APInt HighZeros =
2805 APInt::getHighBitsSet(ShlSrc.getValueSizeInBits(), ShAmtV);
2806 if (!Src->getFlags().hasNoUnsignedWrap() &&
2807 !CurDAG->MaskedValueIsZero(ShlSrc, HighZeros & Mask))
2808 break;
2810 // zext (shl nuw i8 %x, C1) to i32
2811 // --> shl (zext i8 %x to i32), (zext C1)
2812 // zext (and (shl nuw i8 %x, C1), C2) to i32
2813 // --> shl (zext i8 (and %x, C2 >> C1) to i32), (zext C1)
2814 MVT SrcVT = ShlSrc.getSimpleValueType();
2815 MVT VT = N.getSimpleValueType();
2816 SDLoc DL(N);
2818 SDValue Res = ShlSrc;
2819 if (!Mask.isAllOnes()) {
2820 Res = CurDAG->getConstant(Mask.lshr(ShAmtV), DL, SrcVT);
2821 insertDAGNode(*CurDAG, N, Res);
2822 Res = CurDAG->getNode(ISD::AND, DL, SrcVT, ShlSrc, Res);
2823 insertDAGNode(*CurDAG, N, Res);
2825 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Res);
2826 insertDAGNode(*CurDAG, N, Zext);
2827 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, ShlAmt);
2828 insertDAGNode(*CurDAG, N, NewShl);
2829 CurDAG->ReplaceAllUsesWith(N, NewShl);
2830 CurDAG->RemoveDeadNode(N.getNode());
2832 // Convert the shift to scale factor.
2833 AM.Scale = 1 << ShAmtV;
2834 // If matchIndexRecursively were not called here, Zext could be replaced by
2835 // other nodes while the stale value is later passed to a builder method, so
2836 // re-match it to pick up the current index.
2837 AM.IndexReg = matchIndexRecursively(Zext, AM, Depth + 1);
2838 return false;
2841 if (Src.getOpcode() == ISD::SRL && !Mask.isAllOnes()) {
2842 // Try to fold the mask and shift into an extract and scale.
2843 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask.getZExtValue(), Src,
2844 Src.getOperand(0), AM))
2845 return false;
2847 // Try to fold the mask and shift directly into the scale.
2848 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask.getZExtValue(), Src,
2849 Src.getOperand(0), AM))
2850 return false;
2852 // Try to fold the mask and shift into BEXTR and scale.
2853 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask.getZExtValue(), Src,
2854 Src.getOperand(0), AM, *Subtarget))
2855 return false;
2858 break;
2862 return matchAddressBase(N, AM);
2865 /// Helper for MatchAddress. Add the specified node to the
2866 /// specified addressing mode without any further recursion.
2867 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2868 // Is the base register already occupied?
2869 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2870 // If so, check to see if the scale index register is set.
2871 if (!AM.IndexReg.getNode()) {
2872 AM.IndexReg = N;
2873 AM.Scale = 1;
2874 return false;
2877 // Otherwise, we cannot select it.
2878 return true;
2881 // Default, generate it as a register.
2882 AM.BaseType = X86ISelAddressMode::RegBase;
2883 AM.Base_Reg = N;
2884 return false;
2887 bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
2888 X86ISelAddressMode &AM,
2889 unsigned Depth) {
2890 SDLoc dl(N);
2891 LLVM_DEBUG({
2892 dbgs() << "MatchVectorAddress: ";
2893 AM.dump(CurDAG);
2895 // Limit recursion.
2896 if (Depth >= SelectionDAG::MaxRecursionDepth)
2897 return matchAddressBase(N, AM);
2899 // TODO: Support other operations.
2900 switch (N.getOpcode()) {
2901 case ISD::Constant: {
2902 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2903 if (!foldOffsetIntoAddress(Val, AM))
2904 return false;
2905 break;
2907 case X86ISD::Wrapper:
2908 if (!matchWrapper(N, AM))
2909 return false;
2910 break;
2911 case ISD::ADD: {
2912 // Add an artificial use to this node so that we can keep track of
2913 // it if it gets CSE'd with a different node.
2914 HandleSDNode Handle(N);
2916 X86ISelAddressMode Backup = AM;
2917 if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
2918 !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2919 Depth + 1))
2920 return false;
2921 AM = Backup;
2923 // Try again after commuting the operands.
2924 if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
2925 Depth + 1) &&
2926 !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
2927 Depth + 1))
2928 return false;
2929 AM = Backup;
2931 N = Handle.getValue();
2932 break;
2936 return matchAddressBase(N, AM);
2939 /// Helper for selectVectorAddr. Handles things that can be folded into a
2940 /// gather/scatter address. The index register and scale should have already
2941 /// been handled.
2942 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2943 return matchVectorAddressRecursively(N, AM, 0);
2946 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
2947 SDValue IndexOp, SDValue ScaleOp,
2948 SDValue &Base, SDValue &Scale,
2949 SDValue &Index, SDValue &Disp,
2950 SDValue &Segment) {
2951 X86ISelAddressMode AM;
2952 AM.Scale = ScaleOp->getAsZExtVal();
2954 // Attempt to match index patterns, as long as we're not relying on implicit
2955 // sign-extension, which is performed BEFORE scale.
2956 if (IndexOp.getScalarValueSizeInBits() == BasePtr.getScalarValueSizeInBits())
2957 AM.IndexReg = matchIndexRecursively(IndexOp, AM, 0);
2958 else
2959 AM.IndexReg = IndexOp;
2961 unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
2962 if (AddrSpace == X86AS::GS)
2963 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2964 if (AddrSpace == X86AS::FS)
2965 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2966 if (AddrSpace == X86AS::SS)
2967 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2969 SDLoc DL(BasePtr);
2970 MVT VT = BasePtr.getSimpleValueType();
2972 // Try to match into the base and displacement fields.
2973 if (matchVectorAddress(BasePtr, AM))
2974 return false;
2976 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2977 return true;
2980 /// Returns true if it is able to pattern match an addressing mode.
2981 /// It returns the operands which make up the maximal addressing mode it can
2982 /// match by reference.
2984 /// Parent is the parent node of the addr operand that is being matched. It
2985 /// is always a load, store, atomic node, or null. It is only null when
2986 /// checking memory operands for inline asm nodes.
2987 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2988 SDValue &Scale, SDValue &Index,
2989 SDValue &Disp, SDValue &Segment) {
2990 X86ISelAddressMode AM;
2992 if (Parent &&
2993 // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2994 // that are not a MemSDNode, and thus don't have proper addrspace info.
2995 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2996 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2997 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2998 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2999 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
3000 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
3001 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
3002 unsigned AddrSpace =
3003 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
3004 if (AddrSpace == X86AS::GS)
3005 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
3006 if (AddrSpace == X86AS::FS)
3007 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
3008 if (AddrSpace == X86AS::SS)
3009 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
3012 // Save the DL and VT before calling matchAddress, it can invalidate N.
3013 SDLoc DL(N);
3014 MVT VT = N.getSimpleValueType();
3016 if (matchAddress(N, AM))
3017 return false;
3019 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3020 return true;
3023 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
3024 // Cannot use 32-bit constants to reference objects in kernel/large code
3025 // model.
3026 if (TM.getCodeModel() == CodeModel::Kernel ||
3027 TM.getCodeModel() == CodeModel::Large)
3028 return false;
3030 // In static codegen with small code model, we can get the address of a label
3031 // into a register with 'movl'
3032 if (N->getOpcode() != X86ISD::Wrapper)
3033 return false;
3035 N = N.getOperand(0);
3037 // At least GNU as does not accept 'movl' for TPOFF relocations.
3038 // FIXME: We could use 'movl' when we know we are targeting MC.
3039 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
3040 return false;
3042 Imm = N;
3043 // Small/medium code model can reference non-TargetGlobalAddress objects with
3044 // 32-bit constants.
3045 if (N->getOpcode() != ISD::TargetGlobalAddress) {
3046 return TM.getCodeModel() == CodeModel::Small ||
3047 TM.getCodeModel() == CodeModel::Medium;
3050 const GlobalValue *GV = cast<GlobalAddressSDNode>(N)->getGlobal();
3051 if (std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange())
3052 return CR->getUnsignedMax().ult(1ull << 32);
3054 return !TM.isLargeGlobalValue(GV);
3057 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
3058 SDValue &Scale, SDValue &Index,
3059 SDValue &Disp, SDValue &Segment) {
3060 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
3061 SDLoc DL(N);
3063 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
3064 return false;
3066 auto *RN = dyn_cast<RegisterSDNode>(Base);
3067 if (RN && RN->getReg() == 0)
3068 Base = CurDAG->getRegister(0, MVT::i64);
3069 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
3070 // Base could already be %rip, particularly in the x32 ABI.
3071 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3072 MVT::i64), 0);
3073 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3074 Base);
3077 RN = dyn_cast<RegisterSDNode>(Index);
3078 if (RN && RN->getReg() == 0)
3079 Index = CurDAG->getRegister(0, MVT::i64);
3080 else {
3081 assert(Index.getValueType() == MVT::i32 &&
3082 "Expect to be extending 32-bit registers for use in LEA");
3083 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
3084 MVT::i64), 0);
3085 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
3086 Index);
3089 return true;
3092 /// Calls SelectAddr and determines if the maximal addressing
3093 /// mode it matches can be cost effectively emitted as an LEA instruction.
3094 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
3095 SDValue &Base, SDValue &Scale,
3096 SDValue &Index, SDValue &Disp,
3097 SDValue &Segment) {
3098 X86ISelAddressMode AM;
3100 // Save the DL and VT before calling matchAddress, it can invalidate N.
3101 SDLoc DL(N);
3102 MVT VT = N.getSimpleValueType();
3104 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
3105 // segments.
3106 SDValue Copy = AM.Segment;
3107 SDValue T = CurDAG->getRegister(0, MVT::i32);
3108 AM.Segment = T;
3109 if (matchAddress(N, AM))
3110 return false;
3111 assert (T == AM.Segment);
3112 AM.Segment = Copy;
3114 unsigned Complexity = 0;
3115 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
3116 Complexity = 1;
3117 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
3118 Complexity = 4;
3120 if (AM.IndexReg.getNode())
3121 Complexity++;
3123 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or to
3124 // use a simple shift.
3125 if (AM.Scale > 1)
3126 Complexity++;
3128 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
3129 // to a LEA. This is determined with some experimentation but is by no means
3130 // optimal (especially for code size consideration). LEA is nice because of
3131 // its three-address nature. Tweak the cost function again when we can run
3132 // convertToThreeAddress() at register allocation time.
3133 if (AM.hasSymbolicDisplacement()) {
3134 // For X86-64, always use LEA to materialize RIP-relative addresses.
3135 if (Subtarget->is64Bit())
3136 Complexity = 4;
3137 else
3138 Complexity += 2;
3141 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
3142 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
3143 // duplicating flag-producing instructions later in the pipeline.
3144 if (N.getOpcode() == ISD::ADD) {
3145 auto isMathWithFlags = [](SDValue V) {
3146 switch (V.getOpcode()) {
3147 case X86ISD::ADD:
3148 case X86ISD::SUB:
3149 case X86ISD::ADC:
3150 case X86ISD::SBB:
3151 case X86ISD::SMUL:
3152 case X86ISD::UMUL:
3153 /* TODO: These opcodes can be added safely, but we may want to justify
3154 their inclusion for different reasons (better for reg-alloc).
3155 case X86ISD::OR:
3156 case X86ISD::XOR:
3157 case X86ISD::AND:
3159 // Value 1 is the flag output of the node - verify it's not dead.
3160 return !SDValue(V.getNode(), 1).use_empty();
3161 default:
3162 return false;
3165 // TODO: We might want to factor in whether there's a load folding
3166 // opportunity for the math op that disappears with LEA.
3167 if (isMathWithFlags(N.getOperand(0)) || isMathWithFlags(N.getOperand(1)))
3168 Complexity++;
3171 if (AM.Disp)
3172 Complexity++;
3174 // If it isn't worth using an LEA, reject it.
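// For example, a plain [base] or [base + constant disp] address scores at
// most 2 and is rejected, while [base + index*scale + disp] scores at least 3
// and a frame-index base alone already scores 4, so those are kept as LEAs.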
3175 if (Complexity <= 2)
3176 return false;
3178 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
3179 return true;
3182 /// This is only run on TargetGlobalTLSAddress nodes.
3183 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
3184 SDValue &Scale, SDValue &Index,
3185 SDValue &Disp, SDValue &Segment) {
3186 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress ||
3187 N.getOpcode() == ISD::TargetExternalSymbol);
3189 X86ISelAddressMode AM;
3190 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N)) {
3191 AM.GV = GA->getGlobal();
3192 AM.Disp += GA->getOffset();
3193 AM.SymbolFlags = GA->getTargetFlags();
3194 } else {
3195 auto *SA = cast<ExternalSymbolSDNode>(N);
3196 AM.ES = SA->getSymbol();
3197 AM.SymbolFlags = SA->getTargetFlags();
3200 if (Subtarget->is32Bit()) {
3201 AM.Scale = 1;
3202 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
3205 MVT VT = N.getSimpleValueType();
3206 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
3207 return true;
3210 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
3211 // Keep track of the original value type and whether this value was
3212 // truncated. If we see a truncation from pointer type to VT that truncates
3213 // bits that are known to be zero, we can use a narrow reference.
3214 EVT VT = N.getValueType();
3215 bool WasTruncated = false;
3216 if (N.getOpcode() == ISD::TRUNCATE) {
3217 WasTruncated = true;
3218 N = N.getOperand(0);
3221 if (N.getOpcode() != X86ISD::Wrapper)
3222 return false;
3224 // We can only use non-GlobalValues as immediates if they were not truncated,
3225 // as we do not have any range information. If we have a GlobalValue and the
3226 // address was not truncated, we can select it as an operand directly.
3227 unsigned Opc = N.getOperand(0)->getOpcode();
3228 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
3229 Op = N.getOperand(0);
3230 // We can only select the operand directly if we didn't have to look past a
3231 // truncate.
3232 return !WasTruncated;
3235 // Check that the global's range fits into VT.
3236 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
3237 std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
3238 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
3239 return false;
3241 // Okay, we can use a narrow reference.
3242 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
3243 GA->getOffset(), GA->getTargetFlags());
3244 return true;
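// Example (sketch): a global annotated with absolute_symbol metadata covering
// [0, 256) has an unsigned max of 255, so a truncate of its address to i8
// passes the range check above and is re-emitted as a narrow target global
// address operand.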
3247 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
3248 SDValue &Base, SDValue &Scale,
3249 SDValue &Index, SDValue &Disp,
3250 SDValue &Segment) {
3251 assert(Root && P && "Unknown root/parent nodes");
3252 if (!ISD::isNON_EXTLoad(N.getNode()) ||
3253 !IsProfitableToFold(N, P, Root) ||
3254 !IsLegalToFold(N, P, Root, OptLevel))
3255 return false;
3257 return selectAddr(N.getNode(),
3258 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3261 bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
3262 SDValue &Base, SDValue &Scale,
3263 SDValue &Index, SDValue &Disp,
3264 SDValue &Segment) {
3265 assert(Root && P && "Unknown root/parent nodes");
3266 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
3267 !IsProfitableToFold(N, P, Root) ||
3268 !IsLegalToFold(N, P, Root, OptLevel))
3269 return false;
3271 return selectAddr(N.getNode(),
3272 N.getOperand(1), Base, Scale, Index, Disp, Segment);
3275 /// Return an SDNode that returns the value of the global base register.
3276 /// Output instructions required to initialize the global base register,
3277 /// if necessary.
3278 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
3279 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
3280 auto &DL = MF->getDataLayout();
3281 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
3284 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
3285 if (N->getOpcode() == ISD::TRUNCATE)
3286 N = N->getOperand(0).getNode();
3287 if (N->getOpcode() != X86ISD::Wrapper)
3288 return false;
3290 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
3291 if (!GA)
3292 return false;
3294 auto *GV = GA->getGlobal();
3295 std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
3296 if (CR)
3297 return CR->getSignedMin().sge(-1ull << Width) &&
3298 CR->getSignedMax().slt(1ull << Width);
3299 // In the kernel code model, globals are in the negative 2GB of the address
3300 // space, so globals can be a sign extended 32-bit immediate.
3301 // In other code models, small globals are in the low 2GB of the address
3302 // space, so sign extending them is equivalent to zero extending them.
3303 return Width == 32 && !TM.isLargeGlobalValue(GV);
3306 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
3307 assert(N->isMachineOpcode() && "Unexpected node");
3308 unsigned Opc = N->getMachineOpcode();
3309 const MCInstrDesc &MCID = getInstrInfo()->get(Opc);
3310 int CondNo = X86::getCondSrcNoFromDesc(MCID);
3311 if (CondNo < 0)
3312 return X86::COND_INVALID;
3314 return static_cast<X86::CondCode>(N->getConstantOperandVal(CondNo));
3317 /// Test whether the given X86ISD::CMP node has any users that use a flag
3318 /// other than ZF; returns true if there are no such users.
3319 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
3320 // Examine each user of the node.
3321 for (SDUse &Use : Flags->uses()) {
3322 // Only check things that use the flags.
3323 if (Use.getResNo() != Flags.getResNo())
3324 continue;
3325 SDNode *User = Use.getUser();
3326 // Only examine CopyToReg uses that copy to EFLAGS.
3327 if (User->getOpcode() != ISD::CopyToReg ||
3328 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3329 return false;
3330 // Examine each user of the CopyToReg use.
3331 for (SDUse &FlagUse : User->uses()) {
3332 // Only examine the Flag result.
3333 if (FlagUse.getResNo() != 1)
3334 continue;
3335 // Anything unusual: assume conservatively.
3336 if (!FlagUse.getUser()->isMachineOpcode())
3337 return false;
3338 // Examine the condition code of the user.
3339 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3341 switch (CC) {
3342 // Comparisons which only use the zero flag.
3343 case X86::COND_E: case X86::COND_NE:
3344 continue;
3345 // Anything else: assume conservatively.
3346 default:
3347 return false;
3351 return true;
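// For example, users reached through a CopyToReg to EFLAGS that test COND_E
// or COND_NE read only ZF and are accepted above, while a user testing
// COND_L (SF != OF) hits the conservative default and makes this return
// false.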
3354 /// Test whether the given X86ISD::CMP node has any uses which require the SF
3355 /// flag to be accurate.
3356 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
3357 // Examine each user of the node.
3358 for (SDUse &Use : Flags->uses()) {
3359 // Only check things that use the flags.
3360 if (Use.getResNo() != Flags.getResNo())
3361 continue;
3362 SDNode *User = Use.getUser();
3363 // Only examine CopyToReg uses that copy to EFLAGS.
3364 if (User->getOpcode() != ISD::CopyToReg ||
3365 cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3366 return false;
3367 // Examine each user of the CopyToReg use.
3368 for (SDUse &FlagUse : User->uses()) {
3369 // Only examine the Flag result.
3370 if (FlagUse.getResNo() != 1)
3371 continue;
3372 // Anything unusual: assume conservatively.
3373 if (!FlagUse.getUser()->isMachineOpcode())
3374 return false;
3375 // Examine the condition code of the user.
3376 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3378 switch (CC) {
3379 // Comparisons which don't examine the SF flag.
3380 case X86::COND_A: case X86::COND_AE:
3381 case X86::COND_B: case X86::COND_BE:
3382 case X86::COND_E: case X86::COND_NE:
3383 case X86::COND_O: case X86::COND_NO:
3384 case X86::COND_P: case X86::COND_NP:
3385 continue;
3386 // Anything else: assume conservatively.
3387 default:
3388 return false;
3392 return true;
3395 static bool mayUseCarryFlag(X86::CondCode CC) {
3396 switch (CC) {
3397 // Comparisons which don't examine the CF flag.
3398 case X86::COND_O: case X86::COND_NO:
3399 case X86::COND_E: case X86::COND_NE:
3400 case X86::COND_S: case X86::COND_NS:
3401 case X86::COND_P: case X86::COND_NP:
3402 case X86::COND_L: case X86::COND_GE:
3403 case X86::COND_G: case X86::COND_LE:
3404 return false;
3405 // Anything else: assume conservatively.
3406 default:
3407 return true;
3411 /// Test whether the given node which sets flags has any uses which require the
3412 /// CF flag to be accurate.
3413 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
3414 // Examine each user of the node.
3415 for (SDUse &Use : Flags->uses()) {
3416 // Only check things that use the flags.
3417 if (Use.getResNo() != Flags.getResNo())
3418 continue;
3420 SDNode *User = Use.getUser();
3421 unsigned UserOpc = User->getOpcode();
3423 if (UserOpc == ISD::CopyToReg) {
3424 // Only examine CopyToReg uses that copy to EFLAGS.
3425 if (cast<RegisterSDNode>(User->getOperand(1))->getReg() != X86::EFLAGS)
3426 return false;
3427 // Examine each user of the CopyToReg use.
3428 for (SDUse &FlagUse : User->uses()) {
3429 // Only examine the Flag result.
3430 if (FlagUse.getResNo() != 1)
3431 continue;
3432 // Anything unusual: assume conservatively.
3433 if (!FlagUse.getUser()->isMachineOpcode())
3434 return false;
3435 // Examine the condition code of the user.
3436 X86::CondCode CC = getCondFromNode(FlagUse.getUser());
3438 if (mayUseCarryFlag(CC))
3439 return false;
3442 // This CopyToReg is ok. Move on to the next user.
3443 continue;
3446 // This might be an unselected node. So look for the pre-isel opcodes that
3447 // use flags.
3448 unsigned CCOpNo;
3449 switch (UserOpc) {
3450 default:
3451 // Something unusual. Be conservative.
3452 return false;
3453 case X86ISD::SETCC: CCOpNo = 0; break;
3454 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
3455 case X86ISD::CMOV: CCOpNo = 2; break;
3456 case X86ISD::BRCOND: CCOpNo = 2; break;
3459 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
3460 if (mayUseCarryFlag(CC))
3461 return false;
3463 return true;
3466 /// Check whether or not the chain ending in StoreNode is suitable for doing
3467 /// the {load; op; store} to memory-operand (read-modify-write) transformation.
3468 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
3469 SDValue StoredVal, SelectionDAG *CurDAG,
3470 unsigned LoadOpNo,
3471 LoadSDNode *&LoadNode,
3472 SDValue &InputChain) {
3473 // Is the stored value result 0 of the operation?
3474 if (StoredVal.getResNo() != 0) return false;
3476 // Are there other uses of the operation other than the store?
3477 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
3479 // Is the store non-extending and non-indexed?
3480 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
3481 return false;
3483 SDValue Load = StoredVal->getOperand(LoadOpNo);
3484 // Is the stored value a non-extending and non-indexed load?
3485 if (!ISD::isNormalLoad(Load.getNode())) return false;
3487 // Return LoadNode by reference.
3488 LoadNode = cast<LoadSDNode>(Load);
3490 // Is the store the only read of the loaded value?
3491 if (!Load.hasOneUse())
3492 return false;
3494 // Is the address of the store the same as the load?
3495 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
3496 LoadNode->getOffset() != StoreNode->getOffset())
3497 return false;
3499 bool FoundLoad = false;
3500 SmallVector<SDValue, 4> ChainOps;
3501 SmallVector<const SDNode *, 4> LoopWorklist;
3502 SmallPtrSet<const SDNode *, 16> Visited;
3503 const unsigned int Max = 1024;
3505 // Visualization of Load-Op-Store fusion:
3506 // -------------------------
3507 // Legend:
3508 // *-lines = Chain operand dependencies.
3509 // |-lines = Normal operand dependencies.
3510 // Dependencies flow down and right. n-suffix references multiple nodes.
3512 // C Xn C
3513 // * * *
3514 // * * *
3515 // Xn A-LD Yn TF Yn
3516 // * * \ | * |
3517 // * * \ | * |
3518 // * * \ | => A--LD_OP_ST
3519 // * * \| \
3520 // TF OP \
3521 // * | \ Zn
3522 // * | \
3523 // A-ST Zn
3526 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
3527 // #2: Yn -> LD
3528 // #3: ST -> Zn
3530 // Ensure the transform is safe by checking for the dual
3531 // dependencies to make sure we do not induce a loop.
3533 // As LD is a predecessor to both OP and ST we can do this by checking:
3534 // a). if LD is a predecessor to a member of Xn or Yn.
3535 // b). if a Zn is a predecessor to ST.
3537 // However, (b) can only occur through being a chain predecessor to
3538 // ST, which is the same as Zn being a member or predecessor of Xn,
3539 // which is a subset of LD being a predecessor of Xn. So it's
3540 // subsumed by check (a).
3542 SDValue Chain = StoreNode->getChain();
3544 // Gather X elements in ChainOps.
3545 if (Chain == Load.getValue(1)) {
3546 FoundLoad = true;
3547 ChainOps.push_back(Load.getOperand(0));
3548 } else if (Chain.getOpcode() == ISD::TokenFactor) {
3549 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
3550 SDValue Op = Chain.getOperand(i);
3551 if (Op == Load.getValue(1)) {
3552 FoundLoad = true;
3553 // Drop Load, but keep its chain. No cycle check necessary.
3554 ChainOps.push_back(Load.getOperand(0));
3555 continue;
3557 LoopWorklist.push_back(Op.getNode());
3558 ChainOps.push_back(Op);
3562 if (!FoundLoad)
3563 return false;
3565 // Worklist is currently Xn. Add Yn to worklist.
3566 for (SDValue Op : StoredVal->ops())
3567 if (Op.getNode() != LoadNode)
3568 LoopWorklist.push_back(Op.getNode());
3570 // Check (a) if Load is a predecessor to Xn + Yn
3571 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
3572 true))
3573 return false;
3575 InputChain =
3576 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
3577 return true;
3580 // Change a chain of {load; op; store} of the same value into a simple op
3581 // through memory of that value, if the uses of the modified value and its
3582 // address are suitable.
3584 // The tablegen memory-operand pattern is currently not able to match the case
3585 // where the EFLAGS produced by the original operation are used.
3587 // To move this to tablegen, we'll need to improve tablegen to allow flags to
3588 // be transferred from a node in the pattern to the result node, probably with
3589 // a new keyword. For example, we have this
3590 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3591 // [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
3592 // but maybe need something like this
3593 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
3594 // [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst),
3595 // (transferrable EFLAGS)]>;
3597 // Until then, we manually fold these and instruction select the operation
3598 // here.
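// As a simplified illustration, a DAG of the form
//   t = load [addr]; u = X86ISD::ADD t, c; store u, [addr]
// is turned into a single memory-operand instruction such as ADD32mi (or
// INC32m/DEC32m for c == 1/-1 with an unused carry flag), provided the chain
// and flag checks below succeed.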
3599 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
3600 auto *StoreNode = cast<StoreSDNode>(Node);
3601 SDValue StoredVal = StoreNode->getOperand(1);
3602 unsigned Opc = StoredVal->getOpcode();
3604 // Before we try to select anything, make sure this is a memory operand size
3605 // and opcode we can handle. Note that this must match the code below that
3606 // actually lowers the opcodes.
3607 EVT MemVT = StoreNode->getMemoryVT();
3608 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
3609 MemVT != MVT::i8)
3610 return false;
3612 bool IsCommutable = false;
3613 bool IsNegate = false;
3614 switch (Opc) {
3615 default:
3616 return false;
3617 case X86ISD::SUB:
3618 IsNegate = isNullConstant(StoredVal.getOperand(0));
3619 break;
3620 case X86ISD::SBB:
3621 break;
3622 case X86ISD::ADD:
3623 case X86ISD::ADC:
3624 case X86ISD::AND:
3625 case X86ISD::OR:
3626 case X86ISD::XOR:
3627 IsCommutable = true;
3628 break;
3631 unsigned LoadOpNo = IsNegate ? 1 : 0;
3632 LoadSDNode *LoadNode = nullptr;
3633 SDValue InputChain;
3634 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3635 LoadNode, InputChain)) {
3636 if (!IsCommutable)
3637 return false;
3639 // This operation is commutable, try the other operand.
3640 LoadOpNo = 1;
3641 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
3642 LoadNode, InputChain))
3643 return false;
3646 SDValue Base, Scale, Index, Disp, Segment;
3647 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
3648 Segment))
3649 return false;
3651 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
3652 unsigned Opc8) {
3653 switch (MemVT.getSimpleVT().SimpleTy) {
3654 case MVT::i64:
3655 return Opc64;
3656 case MVT::i32:
3657 return Opc32;
3658 case MVT::i16:
3659 return Opc16;
3660 case MVT::i8:
3661 return Opc8;
3662 default:
3663 llvm_unreachable("Invalid size!");
3667 MachineSDNode *Result;
3668 switch (Opc) {
3669 case X86ISD::SUB:
3670 // Handle negate.
3671 if (IsNegate) {
3672 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
3673 X86::NEG8m);
3674 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3675 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3676 MVT::Other, Ops);
3677 break;
3679 [[fallthrough]];
3680 case X86ISD::ADD:
3681 // Try to match inc/dec.
3682 if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
3683 bool IsOne = isOneConstant(StoredVal.getOperand(1));
3684 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
3685 // An ADD/SUB with 1/-1 whose carry flag is unused can be selected as INC/DEC.
3686 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3687 unsigned NewOpc =
3688 ((Opc == X86ISD::ADD) == IsOne)
3689 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3690 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3691 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3692 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3693 MVT::Other, Ops);
3694 break;
3697 [[fallthrough]];
3698 case X86ISD::ADC:
3699 case X86ISD::SBB:
3700 case X86ISD::AND:
3701 case X86ISD::OR:
3702 case X86ISD::XOR: {
3703 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3704 switch (Opc) {
3705 case X86ISD::ADD:
3706 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3707 X86::ADD8mr);
3708 case X86ISD::ADC:
3709 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3710 X86::ADC8mr);
3711 case X86ISD::SUB:
3712 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3713 X86::SUB8mr);
3714 case X86ISD::SBB:
3715 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3716 X86::SBB8mr);
3717 case X86ISD::AND:
3718 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3719 X86::AND8mr);
3720 case X86ISD::OR:
3721 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3722 case X86ISD::XOR:
3723 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3724 X86::XOR8mr);
3725 default:
3726 llvm_unreachable("Invalid opcode!");
3729 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3730 switch (Opc) {
3731 case X86ISD::ADD:
3732 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3733 X86::ADD8mi);
3734 case X86ISD::ADC:
3735 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3736 X86::ADC8mi);
3737 case X86ISD::SUB:
3738 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3739 X86::SUB8mi);
3740 case X86ISD::SBB:
3741 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3742 X86::SBB8mi);
3743 case X86ISD::AND:
3744 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3745 X86::AND8mi);
3746 case X86ISD::OR:
3747 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3748 X86::OR8mi);
3749 case X86ISD::XOR:
3750 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3751 X86::XOR8mi);
3752 default:
3753 llvm_unreachable("Invalid opcode!");
3757 unsigned NewOpc = SelectRegOpcode(Opc);
3758 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3760 // See if the operand is a constant that we can fold into an immediate
3761 // operand.
3762 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3763 int64_t OperandV = OperandC->getSExtValue();
3765 // Check if we can shrink the operand enough to fit in an immediate (or
3766 // fit into a smaller immediate) by negating it and switching the
3767 // operation.
3768 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3769 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3770 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3771 isInt<32>(-OperandV))) &&
3772 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3773 OperandV = -OperandV;
3774 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3777 if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3778 Operand = CurDAG->getSignedTargetConstant(OperandV, SDLoc(Node), MemVT);
3779 NewOpc = SelectImmOpcode(Opc);
3783 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3784 SDValue CopyTo =
3785 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3786 StoredVal.getOperand(2), SDValue());
3788 const SDValue Ops[] = {Base, Scale, Index, Disp,
3789 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3790 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3791 Ops);
3792 } else {
3793 const SDValue Ops[] = {Base, Scale, Index, Disp,
3794 Segment, Operand, InputChain};
3795 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3796 Ops);
3798 break;
3800 default:
3801 llvm_unreachable("Invalid opcode!");
3804 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3805 LoadNode->getMemOperand()};
3806 CurDAG->setNodeMemRefs(Result, MemOps);
3808 // Update Load Chain uses as well.
3809 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3810 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3811 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3812 CurDAG->RemoveDeadNode(Node);
3813 return true;
3816 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3817 // Where Mask is one of the following patterns:
3818 // a) x & (1 << nbits) - 1
3819 // b) x & ~(-1 << nbits)
3820 // c) x & (-1 >> (32 - y))
3821 // d) x << (32 - y) >> (32 - y)
3822 // e) (1 << nbits) - 1
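// Rough intuition for the patterns above: each form keeps only the low
// 'nbits' bits of x, which is exactly what BZHI computes (and what BEXTR
// computes with a start position of zero). E.g. with nbits == 5,
// (1 << 5) - 1 == 0x1f, so "x & 0x1f" keeps the low five bits.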
3823 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3824 assert(
3825 (Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::AND ||
3826 Node->getOpcode() == ISD::SRL) &&
3827 "Should be either an and-mask, or right-shift after clearing high bits.");
3829 // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3830 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3831 return false;
3833 MVT NVT = Node->getSimpleValueType(0);
3835 // Only supported for 32 and 64 bits.
3836 if (NVT != MVT::i32 && NVT != MVT::i64)
3837 return false;
3839 SDValue NBits;
3840 bool NegateNBits;
3842 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3843 // Else, if we only have BMI1's BEXTR, we require one-use.
3844 const bool AllowExtraUsesByDefault = Subtarget->hasBMI2();
3845 auto checkUses = [AllowExtraUsesByDefault](
3846 SDValue Op, unsigned NUses,
3847 std::optional<bool> AllowExtraUses) {
3848 return AllowExtraUses.value_or(AllowExtraUsesByDefault) ||
3849 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3851 auto checkOneUse = [checkUses](SDValue Op,
3852 std::optional<bool> AllowExtraUses =
3853 std::nullopt) {
3854 return checkUses(Op, 1, AllowExtraUses);
3856 auto checkTwoUse = [checkUses](SDValue Op,
3857 std::optional<bool> AllowExtraUses =
3858 std::nullopt) {
3859 return checkUses(Op, 2, AllowExtraUses);
3862 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3863 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3864 assert(V.getSimpleValueType() == MVT::i32 &&
3865 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3866 "Expected i64 -> i32 truncation");
3867 V = V.getOperand(0);
3869 return V;
3872 // a) x & ((1 << nbits) + (-1))
3873 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits,
3874 &NegateNBits](SDValue Mask) -> bool {
3875 // Match `add`. Must only have one use!
3876 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3877 return false;
3878 // We should be adding an all-ones constant (i.e. subtracting one).
3879 if (!isAllOnesConstant(Mask->getOperand(1)))
3880 return false;
3881 // Match `1 << nbits`. Might be truncated. Must only have one use!
3882 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3883 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3884 return false;
3885 if (!isOneConstant(M0->getOperand(0)))
3886 return false;
3887 NBits = M0->getOperand(1);
3888 NegateNBits = false;
3889 return true;
3892 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3893 V = peekThroughOneUseTruncation(V);
3894 return CurDAG->MaskedValueIsAllOnes(
3895 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3896 NVT.getSizeInBits()));
3899 // b) x & ~(-1 << nbits)
3900 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3901 &NBits, &NegateNBits](SDValue Mask) -> bool {
3902 // Match `~()`. Must only have one use!
3903 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3904 return false;
3905 // The -1 only has to be all-ones for the final Node's NVT.
3906 if (!isAllOnes(Mask->getOperand(1)))
3907 return false;
3908 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3909 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3910 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3911 return false;
3912 // The -1 only has to be all-ones for the final Node's NVT.
3913 if (!isAllOnes(M0->getOperand(0)))
3914 return false;
3915 NBits = M0->getOperand(1);
3916 NegateNBits = false;
3917 return true;
3920 // Try to match potentially-truncated shift amount as `(bitwidth - y)`,
3921 // or leave the shift amount as-is, but then we'll have to negate it.
3922 auto canonicalizeShiftAmt = [&NBits, &NegateNBits](SDValue ShiftAmt,
3923 unsigned Bitwidth) {
3924 NBits = ShiftAmt;
3925 NegateNBits = true;
3926 // Skip over a truncate of the shift amount, if any.
3927 if (NBits.getOpcode() == ISD::TRUNCATE)
3928 NBits = NBits.getOperand(0);
3929 // Try to match the shift amount as (bitwidth - y). It should go away, too.
3930 // If it doesn't match, that's fine, we'll just negate it ourselves.
3931 if (NBits.getOpcode() != ISD::SUB)
3932 return;
3933 auto *V0 = dyn_cast<ConstantSDNode>(NBits.getOperand(0));
3934 if (!V0 || V0->getZExtValue() != Bitwidth)
3935 return;
3936 NBits = NBits.getOperand(1);
3937 NegateNBits = false;
3940 // c) x & (-1 >> z) but then we'll have to subtract z from bitwidth
3941 // or
3942 // c) x & (-1 >> (32 - y))
3943 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, &NegateNBits,
3944 canonicalizeShiftAmt](SDValue Mask) -> bool {
3945 // The mask itself may be truncated.
3946 Mask = peekThroughOneUseTruncation(Mask);
3947 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3948 // Match `l>>`. Must only have one use!
3949 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3950 return false;
3951 // We should be shifting a truly all-ones constant.
3952 if (!isAllOnesConstant(Mask.getOperand(0)))
3953 return false;
3954 SDValue M1 = Mask.getOperand(1);
3955 // The shift amount should not be used externally.
3956 if (!checkOneUse(M1))
3957 return false;
3958 canonicalizeShiftAmt(M1, Bitwidth);
3959 // Pattern c. is non-canonical, and is expanded into pattern d. iff there
3960 // is no extra use of the mask. Clearly, there was one since we are here.
3961 // But at the same time, if we need to negate the shift amount,
3962 // then we don't want the mask to stick around, else it's unprofitable.
3963 return !NegateNBits;
3966 SDValue X;
3968 // d) x << z >> z but then we'll have to subtract z from bitwidth
3969 // or
3970 // d) x << (32 - y) >> (32 - y)
3971 auto matchPatternD = [checkOneUse, checkTwoUse, canonicalizeShiftAmt,
3972 AllowExtraUsesByDefault, &NegateNBits,
3973 &X](SDNode *Node) -> bool {
3974 if (Node->getOpcode() != ISD::SRL)
3975 return false;
3976 SDValue N0 = Node->getOperand(0);
3977 if (N0->getOpcode() != ISD::SHL)
3978 return false;
3979 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3980 SDValue N1 = Node->getOperand(1);
3981 SDValue N01 = N0->getOperand(1);
3982 // Both of the shifts must be by the exact same value.
3983 if (N1 != N01)
3984 return false;
3985 canonicalizeShiftAmt(N1, Bitwidth);
3986 // There should not be any external uses of the inner shift / shift amount.
3987 // Note that while we are generally okay with external uses given BMI2,
3988 // iff we need to negate the shift amount, we are not okay with extra uses.
3989 const bool AllowExtraUses = AllowExtraUsesByDefault && !NegateNBits;
3990 if (!checkOneUse(N0, AllowExtraUses) || !checkTwoUse(N1, AllowExtraUses))
3991 return false;
3992 X = N0->getOperand(0);
3993 return true;
3996 auto matchLowBitMask = [matchPatternA, matchPatternB,
3997 matchPatternC](SDValue Mask) -> bool {
3998 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
4001 if (Node->getOpcode() == ISD::AND) {
4002 X = Node->getOperand(0);
4003 SDValue Mask = Node->getOperand(1);
4005 if (matchLowBitMask(Mask)) {
4006 // Great.
4007 } else {
4008 std::swap(X, Mask);
4009 if (!matchLowBitMask(Mask))
4010 return false;
4012 } else if (matchLowBitMask(SDValue(Node, 0))) {
4013 X = CurDAG->getAllOnesConstant(SDLoc(Node), NVT);
4014 } else if (!matchPatternD(Node))
4015 return false;
4017 // If we need to negate the shift amount, require BMI2 BZHI support.
4018 // It's just too unprofitable for BMI1 BEXTR.
4019 if (NegateNBits && !Subtarget->hasBMI2())
4020 return false;
4022 SDLoc DL(Node);
4024 // Truncate the shift amount.
4025 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
4026 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4028 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
4029 // All the other bits are undefined, we do not care about them.
4030 SDValue ImplDef = SDValue(
4031 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
4032 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
4034 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
4035 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
4036 NBits = SDValue(CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
4037 MVT::i32, ImplDef, NBits, SRIdxVal),
4039 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4041 // We might have matched the number of high bits to be cleared,
4042 // but we want the number of low bits to be kept, so negate it in that case.
4043 if (NegateNBits) {
4044 SDValue BitWidthC = CurDAG->getConstant(NVT.getSizeInBits(), DL, MVT::i32);
4045 insertDAGNode(*CurDAG, SDValue(Node, 0), BitWidthC);
4047 NBits = CurDAG->getNode(ISD::SUB, DL, MVT::i32, BitWidthC, NBits);
4048 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4051 if (Subtarget->hasBMI2()) {
4052 // Great, just emit the BZHI..
4053 if (NVT != MVT::i32) {
4054 // But have to place the bit count into the wide-enough register first.
4055 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
4056 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
4059 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
4060 ReplaceNode(Node, Extract.getNode());
4061 SelectCode(Extract.getNode());
4062 return true;
4065 // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
4066 // shifted (potentially with a one-use trunc in between), and whether the
4067 // truncation was the only use of the shift; if so, look past the one-use
4068 // truncation.
4070 SDValue RealX = peekThroughOneUseTruncation(X);
4071 // FIXME: only if the shift is one-use?
4072 if (RealX != X && RealX.getOpcode() == ISD::SRL)
4073 X = RealX;
4076 MVT XVT = X.getSimpleValueType();
4078 // Else, emitting BEXTR requires one more step.
4079 // The 'control' of BEXTR has the pattern of:
4080 // [15...8 bit][ 7...0 bit] location
4081 // [ bit count][ shift] name
4082 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4084 // Shift NBits left by 8 bits, thus producing 'control'.
4085 // This leaves the low 8 bits zero.
4086 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
4087 insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
4088 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
4089 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4091 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
4092 // FIXME: only if the shift is one-use?
4093 if (X.getOpcode() == ISD::SRL) {
4094 SDValue ShiftAmt = X.getOperand(1);
4095 X = X.getOperand(0);
4097 assert(ShiftAmt.getValueType() == MVT::i8 &&
4098 "Expected shift amount to be i8");
4100 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
4101 // We could zext to i16 in some form, but we intentionally don't do that.
4102 SDValue OrigShiftAmt = ShiftAmt;
4103 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
4104 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
4106 // And now 'or' these low 8 bits of shift amount into the 'control'.
4107 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
4108 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4111 // But have to place the 'control' into the wide-enough register first.
4112 if (XVT != MVT::i32) {
4113 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
4114 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
4117 // And finally, form the BEXTR itself.
4118 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
4120 // The 'X' was originally truncated. Do that now.
4121 if (XVT != NVT) {
4122 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
4123 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
4126 ReplaceNode(Node, Extract.getNode());
4127 SelectCode(Extract.getNode());
4129 return true;
4132 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
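// For instance, (x >> 4) & 0xfff has Shift == 4 and MaskSize == 12, giving a
// BEXTR control value of (12 << 8) | 4 == 0xc04: start at bit 4 and extract
// 12 bits.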
4133 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
4134 MVT NVT = Node->getSimpleValueType(0);
4135 SDLoc dl(Node);
4137 SDValue N0 = Node->getOperand(0);
4138 SDValue N1 = Node->getOperand(1);
4140 // If we have TBM we can use an immediate for the control. If we have BMI
4141 // we should only do this if the BEXTR instruction is implemented well.
4142 // Otherwise moving the control into a register makes this more costly.
4143 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
4144 // hoisting the move immediate would make it worthwhile with a less optimal
4145 // BEXTR?
4146 bool PreferBEXTR =
4147 Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR());
4148 if (!PreferBEXTR && !Subtarget->hasBMI2())
4149 return nullptr;
4151 // Must have a shift right.
4152 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
4153 return nullptr;
4155 // Shift can't have additional users.
4156 if (!N0->hasOneUse())
4157 return nullptr;
4159 // Only supported for 32 and 64 bits.
4160 if (NVT != MVT::i32 && NVT != MVT::i64)
4161 return nullptr;
4163 // Shift amount and RHS of and must be constant.
4164 auto *MaskCst = dyn_cast<ConstantSDNode>(N1);
4165 auto *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
4166 if (!MaskCst || !ShiftCst)
4167 return nullptr;
4169 // And RHS must be a mask.
4170 uint64_t Mask = MaskCst->getZExtValue();
4171 if (!isMask_64(Mask))
4172 return nullptr;
4174 uint64_t Shift = ShiftCst->getZExtValue();
4175 uint64_t MaskSize = llvm::popcount(Mask);
4177 // Don't interfere with something that can be handled by extracting AH.
4178 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
4179 if (Shift == 8 && MaskSize == 8)
4180 return nullptr;
4182 // Make sure we are only using bits that were in the original value, not
4183 // shifted in.
4184 if (Shift + MaskSize > NVT.getSizeInBits())
4185 return nullptr;
4187 // BZHI, if available, is always fast, unlike BEXTR. But even if we decide
4188 // that we can't use BEXTR, it is only worthwhile using BZHI if the mask
4189 // does not fit into 32 bits. Load folding is not a sufficient reason.
4190 if (!PreferBEXTR && MaskSize <= 32)
4191 return nullptr;
4193 SDValue Control;
4194 unsigned ROpc, MOpc;
4196 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
4197 if (!PreferBEXTR) {
4198 assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
4199 // If we can't make use of BEXTR then we can't fuse shift+mask stages.
4200 // Let's perform the mask first and apply the shift later. Note that we need
4201 // to widen the mask to account for the fact that we'll shift afterwards!
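// E.g. for (x >> 4) & 0xfff with only BZHI available: keep the low
// Shift + MaskSize == 16 bits first, then perform the >> 4 with the plain
// shift emitted further below.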
4202 Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
4203 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
4204 : GET_EGPR_IF_ENABLED(X86::BZHI32rr);
4205 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
4206 : GET_EGPR_IF_ENABLED(X86::BZHI32rm);
4207 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4208 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4209 } else {
4210 // The 'control' of BEXTR has the pattern of:
4211 // [15...8 bit][ 7...0 bit] location
4212 // [ bit count][ shift] name
4213 // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11
4214 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
4215 if (Subtarget->hasTBM()) {
4216 ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
4217 MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
4218 } else {
4219 assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
4220 // BMI requires the immediate to be placed in a register.
4221 ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
4222 : GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
4223 MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
4224 : GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
4225 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
4226 Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
4230 MachineSDNode *NewNode;
4231 SDValue Input = N0->getOperand(0);
4232 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4233 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4234 SDValue Ops[] = {
4235 Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control, Input.getOperand(0)};
4236 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4237 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4238 // Update the chain.
4239 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
4240 // Record the mem-refs
4241 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
4242 } else {
4243 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
4246 if (!PreferBEXTR) {
4247 // We still need to apply the shift.
4248 SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
4249 unsigned NewOpc = NVT == MVT::i64 ? GET_ND_IF_ENABLED(X86::SHR64ri)
4250 : GET_ND_IF_ENABLED(X86::SHR32ri);
4251 NewNode =
4252 CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
4255 return NewNode;
4258 // Emit a PCMPISTR(I/M) instruction.
4259 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
4260 bool MayFoldLoad, const SDLoc &dl,
4261 MVT VT, SDNode *Node) {
4262 SDValue N0 = Node->getOperand(0);
4263 SDValue N1 = Node->getOperand(1);
4264 SDValue Imm = Node->getOperand(2);
4265 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4266 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4268 // Try to fold a load. No need to check alignment.
4269 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4270 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4271 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4272 N1.getOperand(0) };
4273 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
4274 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4275 // Update the chain.
4276 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
4277 // Record the mem-refs
4278 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4279 return CNode;
4282 SDValue Ops[] = { N0, N1, Imm };
4283 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
4284 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4285 return CNode;
4288 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
4289 // to emit a second instruction after this one. This is needed since we have two
4290 // copyToReg nodes glued before this and we need to continue that glue through.
4291 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
4292 bool MayFoldLoad, const SDLoc &dl,
4293 MVT VT, SDNode *Node,
4294 SDValue &InGlue) {
4295 SDValue N0 = Node->getOperand(0);
4296 SDValue N2 = Node->getOperand(2);
4297 SDValue Imm = Node->getOperand(4);
4298 auto *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
4299 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
4301 // Try to fold a load. No need to check alignment.
4302 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4303 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4304 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4305 N2.getOperand(0), InGlue };
4306 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
4307 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4308 InGlue = SDValue(CNode, 3);
4309 // Update the chain.
4310 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
4311 // Record the mem-refs
4312 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
4313 return CNode;
4316 SDValue Ops[] = { N0, N2, Imm, InGlue };
4317 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
4318 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
4319 InGlue = SDValue(CNode, 2);
4320 return CNode;
4323 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
4324 EVT VT = N->getValueType(0);
4326 // Only handle scalar shifts.
4327 if (VT.isVector())
4328 return false;
4330 // Narrower shifts only mask to 5 bits in hardware.
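// Illustrative: hardware shifts use the amount modulo Size, e.g. a 32-bit
// shift reads only the low 5 bits of CL, so shifting by (x + 32) is the same
// as shifting by x. The rewrites below rely on this to drop the ADD/SUB/XOR
// or turn it into a NOT/NEG.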
4331 unsigned Size = VT == MVT::i64 ? 64 : 32;
4333 SDValue OrigShiftAmt = N->getOperand(1);
4334 SDValue ShiftAmt = OrigShiftAmt;
4335 SDLoc DL(N);
4337 // Skip over a truncate of the shift amount.
4338 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
4339 ShiftAmt = ShiftAmt->getOperand(0);
4341 // This function is called after X86DAGToDAGISel::matchBitExtract(),
4342 // so we are not afraid that we might mess up BZHI/BEXTR pattern.
4344 SDValue NewShiftAmt;
4345 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB ||
4346 ShiftAmt->getOpcode() == ISD::XOR) {
4347 SDValue Add0 = ShiftAmt->getOperand(0);
4348 SDValue Add1 = ShiftAmt->getOperand(1);
4349 auto *Add0C = dyn_cast<ConstantSDNode>(Add0);
4350 auto *Add1C = dyn_cast<ConstantSDNode>(Add1);
4351 // If we are shifting by X+/-/^N where N == 0 mod Size, then just shift by X
4352 // to avoid the ADD/SUB/XOR.
4353 if (Add1C && Add1C->getAPIntValue().urem(Size) == 0) {
4354 NewShiftAmt = Add0;
4356 } else if (ShiftAmt->getOpcode() != ISD::ADD && ShiftAmt.hasOneUse() &&
4357 ((Add0C && Add0C->getAPIntValue().urem(Size) == Size - 1) ||
4358 (Add1C && Add1C->getAPIntValue().urem(Size) == Size - 1))) {
4359 // If we are doing a NOT of just the lower bits via (Size*N-1) -/^ X,
4360 // we can replace the SUB/XOR with a NOT. In the XOR case it may save some
4361 // code size; in the SUB case it may also save a move.
4362 assert(Add0C == nullptr || Add1C == nullptr);
4364 // We can only do N-X, not X-N
4365 if (ShiftAmt->getOpcode() == ISD::SUB && Add0C == nullptr)
4366 return false;
4368 EVT OpVT = ShiftAmt.getValueType();
4370 SDValue AllOnes = CurDAG->getAllOnesConstant(DL, OpVT);
4371 NewShiftAmt = CurDAG->getNode(ISD::XOR, DL, OpVT,
4372 Add0C == nullptr ? Add0 : Add1, AllOnes);
4373 insertDAGNode(*CurDAG, OrigShiftAmt, AllOnes);
4374 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4375 // If we are shifting by N-X where N == 0 mod Size, then just shift by
4376 // -X to generate a NEG instead of a SUB of a constant.
4377 } else if (ShiftAmt->getOpcode() == ISD::SUB && Add0C &&
4378 Add0C->getZExtValue() != 0) {
4379 EVT SubVT = ShiftAmt.getValueType();
4380 SDValue X;
4381 if (Add0C->getZExtValue() % Size == 0)
4382 X = Add1;
4383 else if (ShiftAmt.hasOneUse() && Size == 64 &&
4384 Add0C->getZExtValue() % 32 == 0) {
4385 // We have a 64-bit shift by (n*32-x), turn it into -(x+n*32).
4386 // This is mainly beneficial if we already compute (x+n*32).
4387 if (Add1.getOpcode() == ISD::TRUNCATE) {
4388 Add1 = Add1.getOperand(0);
4389 SubVT = Add1.getValueType();
4391 if (Add0.getValueType() != SubVT) {
4392 Add0 = CurDAG->getZExtOrTrunc(Add0, DL, SubVT);
4393 insertDAGNode(*CurDAG, OrigShiftAmt, Add0);
4396 X = CurDAG->getNode(ISD::ADD, DL, SubVT, Add1, Add0);
4397 insertDAGNode(*CurDAG, OrigShiftAmt, X);
4398 } else
4399 return false;
4400 // Insert a negate op.
4401 // TODO: This isn't guaranteed to replace the sub if there is a logic cone
4402 // that uses it that's not a shift.
4403 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
4404 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, X);
4405 NewShiftAmt = Neg;
4407 // Insert these operands into a valid topological order so they can
4408 // get selected independently.
4409 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
4410 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
4411 } else
4412 return false;
4413 } else
4414 return false;
4416 if (NewShiftAmt.getValueType() != MVT::i8) {
4417 // Need to truncate the shift amount.
4418 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
4419 // Add to a correct topological ordering.
4420 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4423 // Insert a new mask to keep the shift amount legal. This should be removed
4424 // by isel patterns.
4425 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
4426 CurDAG->getConstant(Size - 1, DL, MVT::i8));
4427 // Place in a correct topological ordering.
4428 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
4430 SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
4431 NewShiftAmt);
4432 if (UpdatedNode != N) {
4433 // If we found an existing node, we should replace ourselves with that node
4434 // and wait for it to be selected after its other users.
4435 ReplaceNode(N, UpdatedNode);
4436 return true;
4439 // If the original shift amount is now dead, delete it so that we don't run
4440 // it through isel.
4441 if (OrigShiftAmt.getNode()->use_empty())
4442 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
4444 // Now that we've optimized the shift amount, defer to normal isel to get
4445 // load folding and legacy vs BMI2 selection without repeating it here.
4446 SelectCode(N);
4447 return true;
4450 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
4451 MVT NVT = N->getSimpleValueType(0);
4452 unsigned Opcode = N->getOpcode();
4453 SDLoc dl(N);
4455 // For operations of the form (x << C1) op C2, check if we can use a smaller
4456 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
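// E.g. (x << 8) & 0x7f00 can be rewritten as (x & 0x7f) << 8: 0x7f fits in a
// sign-extended 8-bit immediate while 0x7f00 does not, so the reordered form
// encodes smaller.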
4457 SDValue Shift = N->getOperand(0);
4458 SDValue N1 = N->getOperand(1);
4460 auto *Cst = dyn_cast<ConstantSDNode>(N1);
4461 if (!Cst)
4462 return false;
4464 int64_t Val = Cst->getSExtValue();
4466 // If we have an any_extend feeding the AND, look through it to see if there
4467 // is a shift behind it. But only if the AND doesn't use the extended bits.
4468 // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
4469 bool FoundAnyExtend = false;
4470 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
4471 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
4472 isUInt<32>(Val)) {
4473 FoundAnyExtend = true;
4474 Shift = Shift.getOperand(0);
4477 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
4478 return false;
4480 // i8 is unshrinkable, i16 should be promoted to i32.
4481 if (NVT != MVT::i32 && NVT != MVT::i64)
4482 return false;
4484 auto *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
4485 if (!ShlCst)
4486 return false;
4488 uint64_t ShAmt = ShlCst->getZExtValue();
4490 // Make sure that we don't change the operation by removing bits.
4491 // This only matters for OR and XOR, AND is unaffected.
4492 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
4493 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
4494 return false;
4496 // Check the minimum bitwidth for the new constant.
4497 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
4498 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
4499 if (Opcode == ISD::AND) {
4500 // AND32ri is the same as AND64ri32 with zext imm.
4501 // Try this before sign extended immediates below.
4502 ShiftedVal = (uint64_t)Val >> ShAmt;
4503 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4504 return true;
4505 // Also swap order when the AND can become MOVZX.
4506 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
4507 return true;
4509 ShiftedVal = Val >> ShAmt;
4510 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
4511 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
4512 return true;
4513 if (Opcode != ISD::AND) {
4514 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
4515 ShiftedVal = (uint64_t)Val >> ShAmt;
4516 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
4517 return true;
4519 return false;
4522 int64_t ShiftedVal;
4523 if (!CanShrinkImmediate(ShiftedVal))
4524 return false;
4526 // Ok, we can reorder to get a smaller immediate.
4528 // But it's possible the original immediate allowed an AND to become MOVZX.
4529 // Do this check late so that the MaskedValueIsZero call is delayed as long
4530 // as possible.
4531 if (Opcode == ISD::AND) {
4532 // Find the smallest zext this could possibly be.
4533 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
4534 ZExtWidth = llvm::bit_ceil(std::max(ZExtWidth, 8U));
4536 // Figure out which bits need to be zero to achieve that mask.
4537 APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
4538 ZExtWidth);
4539 NeededMask &= ~Cst->getAPIntValue();
4541 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
4542 return false;
4545 SDValue X = Shift.getOperand(0);
4546 if (FoundAnyExtend) {
4547 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
4548 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
4549 X = NewX;
4552 SDValue NewCst = CurDAG->getSignedConstant(ShiftedVal, dl, NVT);
4553 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
4554 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
4555 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
4556 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
4557 Shift.getOperand(1));
4558 ReplaceNode(N, NewSHL.getNode());
4559 SelectCode(NewSHL.getNode());
4560 return true;
4563 bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA,
4564 SDNode *ParentB, SDNode *ParentC,
4565 SDValue A, SDValue B, SDValue C,
4566 uint8_t Imm) {
4567 assert(A.isOperandOf(ParentA) && B.isOperandOf(ParentB) &&
4568 C.isOperandOf(ParentC) && "Incorrect parent node");
4570 auto tryFoldLoadOrBCast =
4571 [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale,
4572 SDValue &Index, SDValue &Disp, SDValue &Segment) {
4573 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4574 return true;
4576 // Not a load, check for broadcast which may be behind a bitcast.
4577 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4578 P = L.getNode();
4579 L = L.getOperand(0);
4582 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4583 return false;
4585 // Only 32 and 64 bit broadcasts are supported.
4586 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4587 unsigned Size = MemIntr->getMemoryVT().getSizeInBits();
4588 if (Size != 32 && Size != 64)
4589 return false;
4591 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4594 bool FoldedLoad = false;
4595 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4596 if (tryFoldLoadOrBCast(Root, ParentC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4597 FoldedLoad = true;
4598 } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3,
4599 Tmp4)) {
4600 FoldedLoad = true;
4601 std::swap(A, C);
4602 // Swap bits 1/4 and 3/6.
4603 uint8_t OldImm = Imm;
4604 Imm = OldImm & 0xa5;
4605 if (OldImm & 0x02) Imm |= 0x10;
4606 if (OldImm & 0x10) Imm |= 0x02;
4607 if (OldImm & 0x08) Imm |= 0x40;
4608 if (OldImm & 0x40) Imm |= 0x08;
4609 } else if (tryFoldLoadOrBCast(Root, ParentB, B, Tmp0, Tmp1, Tmp2, Tmp3,
4610 Tmp4)) {
4611 FoldedLoad = true;
4612 std::swap(B, C);
4613 // Swap bits 1/2 and 5/6.
4614 uint8_t OldImm = Imm;
4615 Imm = OldImm & 0x99;
4616 if (OldImm & 0x02) Imm |= 0x04;
4617 if (OldImm & 0x04) Imm |= 0x02;
4618 if (OldImm & 0x20) Imm |= 0x40;
4619 if (OldImm & 0x40) Imm |= 0x20;
4622 SDLoc DL(Root);
4624 SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8);
4626 MVT NVT = Root->getSimpleValueType(0);
4628 MachineSDNode *MNode;
4629 if (FoldedLoad) {
4630 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
4632 unsigned Opc;
4633 if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) {
4634 auto *MemIntr = cast<MemIntrinsicSDNode>(C);
4635 unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits();
4636 assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!");
4638 bool UseD = EltSize == 32;
4639 if (NVT.is128BitVector())
4640 Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi;
4641 else if (NVT.is256BitVector())
4642 Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi;
4643 else if (NVT.is512BitVector())
4644 Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi;
4645 else
4646 llvm_unreachable("Unexpected vector size!");
4647 } else {
4648 bool UseD = NVT.getVectorElementType() == MVT::i32;
4649 if (NVT.is128BitVector())
4650 Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi;
4651 else if (NVT.is256BitVector())
4652 Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi;
4653 else if (NVT.is512BitVector())
4654 Opc = UseD ? X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi;
4655 else
4656 llvm_unreachable("Unexpected vector size!");
4659 SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)};
4660 MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
4662 // Update the chain.
4663 ReplaceUses(C.getValue(1), SDValue(MNode, 1));
4664 // Record the mem-refs
4665 CurDAG->setNodeMemRefs(MNode, {cast<MemSDNode>(C)->getMemOperand()});
4666 } else {
4667 bool UseD = NVT.getVectorElementType() == MVT::i32;
4668 unsigned Opc;
4669 if (NVT.is128BitVector())
4670 Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri;
4671 else if (NVT.is256BitVector())
4672 Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri;
4673 else if (NVT.is512BitVector())
4674 Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri;
4675 else
4676 llvm_unreachable("Unexpected vector size!");
4678 MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm});
4681 ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0));
4682 CurDAG->RemoveDeadNode(Root);
4683 return true;
4686 // Try to match two logic ops to a VPTERNLOG.
4687 // FIXME: Handle more complex patterns that use an operand more than once?
4688 bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
4689 MVT NVT = N->getSimpleValueType(0);
4691 // Make sure we support VPTERNLOG.
4692 if (!NVT.isVector() || !Subtarget->hasAVX512() ||
4693 NVT.getVectorElementType() == MVT::i1)
4694 return false;
4696 // We need VLX for 128/256-bit.
4697 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
4698 return false;
4700 SDValue N0 = N->getOperand(0);
4701 SDValue N1 = N->getOperand(1);
4703 auto getFoldableLogicOp = [](SDValue Op) {
4704 // Peek through single use bitcast.
4705 if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse())
4706 Op = Op.getOperand(0);
4708 if (!Op.hasOneUse())
4709 return SDValue();
4711 unsigned Opc = Op.getOpcode();
4712 if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
4713 Opc == X86ISD::ANDNP)
4714 return Op;
4716 return SDValue();
4719 SDValue A, FoldableOp;
4720 if ((FoldableOp = getFoldableLogicOp(N1))) {
4721 A = N0;
4722 } else if ((FoldableOp = getFoldableLogicOp(N0))) {
4723 A = N1;
4724 } else
4725 return false;
4727 SDValue B = FoldableOp.getOperand(0);
4728 SDValue C = FoldableOp.getOperand(1);
4729 SDNode *ParentA = N;
4730 SDNode *ParentB = FoldableOp.getNode();
4731 SDNode *ParentC = FoldableOp.getNode();
4733 // We can build the appropriate control immediate by performing the logic
4734 // operation we're matching using these constants for A, B, and C.
4735 uint8_t TernlogMagicA = 0xf0;
4736 uint8_t TernlogMagicB = 0xcc;
4737 uint8_t TernlogMagicC = 0xaa;
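  // For illustration (worked example, not part of the matching logic): matching
  // (or A, (and B, C)) evaluates that same expression on the magic constants,
  // giving Imm = (0xcc & 0xaa) | 0xf0 = 0x88 | 0xf0 = 0xf8, which is the
  // VPTERNLOG truth table for A | (B & C).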
4739 // Some of the inputs may be inverted, peek through them and invert the
4740 // magic values accordingly.
4741 // TODO: There may be a bitcast before the xor that we should peek through.
4742 auto PeekThroughNot = [](SDValue &Op, SDNode *&Parent, uint8_t &Magic) {
4743 if (Op.getOpcode() == ISD::XOR && Op.hasOneUse() &&
4744 ISD::isBuildVectorAllOnes(Op.getOperand(1).getNode())) {
4745 Magic = ~Magic;
4746 Parent = Op.getNode();
4747 Op = Op.getOperand(0);
4751 PeekThroughNot(A, ParentA, TernlogMagicA);
4752 PeekThroughNot(B, ParentB, TernlogMagicB);
4753 PeekThroughNot(C, ParentC, TernlogMagicC);
4755 uint8_t Imm;
4756 switch (FoldableOp.getOpcode()) {
4757 default: llvm_unreachable("Unexpected opcode!");
4758 case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break;
4759 case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break;
4760 case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break;
4761 case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
4764 switch (N->getOpcode()) {
4765 default: llvm_unreachable("Unexpected opcode!");
4766 case X86ISD::ANDNP:
4767 if (A == N0)
4768 Imm &= ~TernlogMagicA;
4769 else
4770 Imm = ~(Imm) & TernlogMagicA;
4771 break;
4772 case ISD::AND: Imm &= TernlogMagicA; break;
4773 case ISD::OR: Imm |= TernlogMagicA; break;
4774 case ISD::XOR: Imm ^= TernlogMagicA; break;
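  // For illustration: for (andnp A, (or B, C)) the inner OR gives
  // Imm = 0xcc | 0xaa = 0xee, and since A is operand 0 of the ANDNP the final
  // immediate becomes 0xee & ~0xf0 = 0x0e, the table for ~A & (B | C).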
4777 return matchVPTERNLOG(N, ParentA, ParentB, ParentC, A, B, C, Imm);
4780 /// If the high bits of an 'and' operand are known zero, try setting the
4781 /// high bits of an 'and' constant operand to produce a smaller encoding by
4782 /// creating a small, sign-extended negative immediate rather than a large
4783 /// positive one. This reverses a transform in SimplifyDemandedBits that
4784 /// shrinks mask constants by clearing bits. There is also a possibility that
4785 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
4786 /// case, just replace the 'and'. Return 'true' if the node is replaced.
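/// For example (illustrative): if the mask is 0x1fffff8 widened from a shrunken
/// 0x1ffffff8 and the top three bits of the other operand are known zero, the
/// mask can be widened to 0xfffffff8 (-8), which fits in a sign-extended 8-bit
/// immediate instead of a 32-bit one.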
4787 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
4788 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
4789 // have immediate operands.
4790 MVT VT = And->getSimpleValueType(0);
4791 if (VT != MVT::i32 && VT != MVT::i64)
4792 return false;
4794 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
4795 if (!And1C)
4796 return false;
4799 // Bail out if the mask constant is already negative. It can't shrink any more.
4799 // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
4800 // patterns to use a 32-bit and instead of a 64-bit and by relying on the
4801 // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
4802 // are negative too.
4803 APInt MaskVal = And1C->getAPIntValue();
4804 unsigned MaskLZ = MaskVal.countl_zero();
4805 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
4806 return false;
4808 // Don't extend into the upper 32 bits of a 64 bit mask.
4809 if (VT == MVT::i64 && MaskLZ >= 32) {
4810 MaskLZ -= 32;
4811 MaskVal = MaskVal.trunc(32);
4814 SDValue And0 = And->getOperand(0);
4815 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
4816 APInt NegMaskVal = MaskVal | HighZeros;
4818 // If a negative constant would not allow a smaller encoding, there's no need
4819 // to continue. Only change the constant when we know it's a win.
4820 unsigned MinWidth = NegMaskVal.getSignificantBits();
4821 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getSignificantBits() <= 32))
4822 return false;
4824 // Extend masks if we truncated above.
4825 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
4826 NegMaskVal = NegMaskVal.zext(64);
4827 HighZeros = HighZeros.zext(64);
4830 // The variable operand must be all zeros in the top bits to allow using the
4831 // new, negative constant as the mask.
4832 // TODO: Handle constant folding?
4833 KnownBits Known0 = CurDAG->computeKnownBits(And0);
4834 if (Known0.isConstant() || !HighZeros.isSubsetOf(Known0.Zero))
4835 return false;
4837 // Check if the mask is -1. In that case, this is an unnecessary instruction
4838 // that escaped earlier analysis.
4839 if (NegMaskVal.isAllOnes()) {
4840 ReplaceNode(And, And0.getNode());
4841 return true;
4844 // A negative mask allows a smaller encoding. Create a new 'and' node.
4845 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
4846 insertDAGNode(*CurDAG, SDValue(And, 0), NewMask);
4847 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
4848 ReplaceNode(And, NewAnd.getNode());
4849 SelectCode(NewAnd.getNode());
4850 return true;
4853 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
4854 bool FoldedBCast, bool Masked) {
4855 #define VPTESTM_CASE(VT, SUFFIX) \
4856 case MVT::VT: \
4857 if (Masked) \
4858 return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
4859 return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
4862 #define VPTESTM_BROADCAST_CASES(SUFFIX) \
4863 default: llvm_unreachable("Unexpected VT!"); \
4864 VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
4865 VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
4866 VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
4867 VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
4868 VPTESTM_CASE(v16i32, DZ##SUFFIX) \
4869 VPTESTM_CASE(v8i64, QZ##SUFFIX)
4871 #define VPTESTM_FULL_CASES(SUFFIX) \
4872 VPTESTM_BROADCAST_CASES(SUFFIX) \
4873 VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
4874 VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
4875 VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
4876 VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
4877 VPTESTM_CASE(v64i8, BZ##SUFFIX) \
4878 VPTESTM_CASE(v32i16, WZ##SUFFIX)
4880 if (FoldedBCast) {
4881 switch (TestVT.SimpleTy) {
4882 VPTESTM_BROADCAST_CASES(rmb)
4886 if (FoldedLoad) {
4887 switch (TestVT.SimpleTy) {
4888 VPTESTM_FULL_CASES(rm)
4892 switch (TestVT.SimpleTy) {
4893 VPTESTM_FULL_CASES(rr)
4896 #undef VPTESTM_FULL_CASES
4897 #undef VPTESTM_BROADCAST_CASES
4898 #undef VPTESTM_CASE
4901 // Try to create VPTESTM instruction. If InMask is not null, it will be used
4902 // to form a masked operation.
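// For illustration: (setcc (and X, Y), 0, ne) on v16i32 inputs should select to
// VPTESTMDZrr, producing a k-mask with a bit set for each element where
// (X & Y) is nonzero; a SETEQ condition picks the VPTESTNM form instead.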
4903 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4904 SDValue InMask) {
4905 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4906 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4907 "Unexpected VT!");
4909 // Look for equal and not equal compares.
4910 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4911 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4912 return false;
4914 SDValue SetccOp0 = Setcc.getOperand(0);
4915 SDValue SetccOp1 = Setcc.getOperand(1);
4917 // Canonicalize the all zero vector to the RHS.
4918 if (ISD::isBuildVectorAllZeros(SetccOp0.getNode()))
4919 std::swap(SetccOp0, SetccOp1);
4921 // See if we're comparing against zero.
4922 if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode()))
4923 return false;
4925 SDValue N0 = SetccOp0;
4927 MVT CmpVT = N0.getSimpleValueType();
4928 MVT CmpSVT = CmpVT.getVectorElementType();
4930 // Start with both operands the same. We'll try to refine this.
4931 SDValue Src0 = N0;
4932 SDValue Src1 = N0;
4935 // Look through single use bitcasts.
4936 SDValue N0Temp = N0;
4937 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4938 N0Temp = N0.getOperand(0);
4940 // Look for single use AND.
4941 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4942 Src0 = N0Temp.getOperand(0);
4943 Src1 = N0Temp.getOperand(1);
4947 // Without VLX we need to widen the operation.
4948 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
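  // For illustration: without VLX a v4i32 compare is widened here to v16i32
  // (Scale = 4) by inserting the operands into an IMPLICIT_DEF via sub_xmm, the
  // test is issued as a 512-bit instruction, and the resulting v16i1 mask is
  // shrunk back to v4i1 with COPY_TO_REGCLASS further down.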
4950 auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L,
4951 SDValue &Base, SDValue &Scale, SDValue &Index,
4952 SDValue &Disp, SDValue &Segment) {
4953 // If we need to widen, we can't fold the load.
4954 if (!Widen)
4955 if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment))
4956 return true;
4958 // If we didn't fold a load, try to match broadcast. No widening limitation
4959 // for this. But only 32 and 64 bit types are supported.
4960 if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64)
4961 return false;
4963 // Look through single use bitcasts.
4964 if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) {
4965 P = L.getNode();
4966 L = L.getOperand(0);
4969 if (L.getOpcode() != X86ISD::VBROADCAST_LOAD)
4970 return false;
4972 auto *MemIntr = cast<MemIntrinsicSDNode>(L);
4973 if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits())
4974 return false;
4976 return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment);
4979 // We can only fold loads if the sources are unique.
4980 bool CanFoldLoads = Src0 != Src1;
4982 bool FoldedLoad = false;
4983 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4984 if (CanFoldLoads) {
4985 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2,
4986 Tmp3, Tmp4);
4987 if (!FoldedLoad) {
4988 // And is commutative.
4989 FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1,
4990 Tmp2, Tmp3, Tmp4);
4991 if (FoldedLoad)
4992 std::swap(Src0, Src1);
4996 bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD;
4998 bool IsMasked = InMask.getNode() != nullptr;
5000 SDLoc dl(Root);
5002 MVT ResVT = Setcc.getSimpleValueType();
5003 MVT MaskVT = ResVT;
5004 if (Widen) {
5005 // Widen the inputs using insert_subreg or copy_to_regclass.
5006 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
5007 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
5008 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
5009 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
5010 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
5011 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
5012 CmpVT), 0);
5013 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
5015 if (!FoldedBCast)
5016 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
5018 if (IsMasked) {
5019 // Widen the mask.
5020 unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID();
5021 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5022 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5023 dl, MaskVT, InMask, RC), 0);
5027 bool IsTestN = CC == ISD::SETEQ;
5028 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
5029 IsMasked);
5031 MachineSDNode *CNode;
5032 if (FoldedLoad) {
5033 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
5035 if (IsMasked) {
5036 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5037 Src1.getOperand(0) };
5038 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5039 } else {
5040 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
5041 Src1.getOperand(0) };
5042 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5045 // Update the chain.
5046 ReplaceUses(Src1.getValue(1), SDValue(CNode, 1));
5047 // Record the mem-refs
5048 CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Src1)->getMemOperand()});
5049 } else {
5050 if (IsMasked)
5051 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
5052 else
5053 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
5056 // If we widened, we need to shrink the mask VT.
5057 if (Widen) {
5058 unsigned RegClass = TLI->getRegClassFor(ResVT)->getID();
5059 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
5060 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
5061 dl, ResVT, SDValue(CNode, 0), RC);
5064 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
5065 CurDAG->RemoveDeadNode(Root);
5066 return true;
5069 // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it
5070 // into vpternlog.
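// For illustration: evaluating the bitselect on the ternlog magic constants
// gives (0xf0 & 0xcc) | (~0xf0 & 0xaa) = 0xc0 | 0x0a = 0xca, which is where the
// 0xCA immediate used below comes from.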
5071 bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) {
5072 assert(N->getOpcode() == ISD::OR && "Unexpected opcode!");
5074 MVT NVT = N->getSimpleValueType(0);
5076 // Make sure we support VPTERNLOG.
5077 if (!NVT.isVector() || !Subtarget->hasAVX512())
5078 return false;
5080 // We need VLX for 128/256-bit.
5081 if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
5082 return false;
5084 SDValue N0 = N->getOperand(0);
5085 SDValue N1 = N->getOperand(1);
5087 // Canonicalize AND to LHS.
5088 if (N1.getOpcode() == ISD::AND)
5089 std::swap(N0, N1);
5091 if (N0.getOpcode() != ISD::AND ||
5092 N1.getOpcode() != X86ISD::ANDNP ||
5093 !N0.hasOneUse() || !N1.hasOneUse())
5094 return false;
5096 // ANDN is not commutable, so use it to pin down A and C.
5097 SDValue A = N1.getOperand(0);
5098 SDValue C = N1.getOperand(1);
5100 // AND is commutable, if one operand matches A, the other operand is B.
5101 // Otherwise this isn't a match.
5102 SDValue B;
5103 if (N0.getOperand(0) == A)
5104 B = N0.getOperand(1);
5105 else if (N0.getOperand(1) == A)
5106 B = N0.getOperand(0);
5107 else
5108 return false;
5110 SDLoc dl(N);
5111 SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8);
5112 SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm);
5113 ReplaceNode(N, Ternlog.getNode());
5115 return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(),
5116 Ternlog.getNode(), A, B, C, 0xCA);
5119 void X86DAGToDAGISel::Select(SDNode *Node) {
5120 MVT NVT = Node->getSimpleValueType(0);
5121 unsigned Opcode = Node->getOpcode();
5122 SDLoc dl(Node);
5124 if (Node->isMachineOpcode()) {
5125 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
5126 Node->setNodeId(-1);
5127 return; // Already selected.
5130 switch (Opcode) {
5131 default: break;
5132 case ISD::INTRINSIC_W_CHAIN: {
5133 unsigned IntNo = Node->getConstantOperandVal(1);
5134 switch (IntNo) {
5135 default: break;
5136 case Intrinsic::x86_encodekey128:
5137 case Intrinsic::x86_encodekey256: {
5138 if (!Subtarget->hasKL())
5139 break;
5141 unsigned Opcode;
5142 switch (IntNo) {
5143 default: llvm_unreachable("Impossible intrinsic");
5144 case Intrinsic::x86_encodekey128:
5145 Opcode = X86::ENCODEKEY128;
5146 break;
5147 case Intrinsic::x86_encodekey256:
5148 Opcode = X86::ENCODEKEY256;
5149 break;
5152 SDValue Chain = Node->getOperand(0);
5153 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
5154 SDValue());
5155 if (Opcode == X86::ENCODEKEY256)
5156 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
5157 Chain.getValue(1));
5159 MachineSDNode *Res = CurDAG->getMachineNode(
5160 Opcode, dl, Node->getVTList(),
5161 {Node->getOperand(2), Chain, Chain.getValue(1)});
5162 ReplaceNode(Node, Res);
5163 return;
5165 case Intrinsic::x86_tileloaddrs64_internal:
5166 case Intrinsic::x86_tileloaddrst164_internal:
5167 if (!Subtarget->hasAMXMOVRS())
5168 break;
5169 [[fallthrough]];
5170 case Intrinsic::x86_tileloadd64_internal:
5171 case Intrinsic::x86_tileloaddt164_internal: {
5172 if (!Subtarget->hasAMXTILE())
5173 break;
5174 auto *MFI =
5175 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5176 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5177 unsigned Opc;
5178 switch (IntNo) {
5179 default:
5180 llvm_unreachable("Unexpected intrinsic!");
5181 case Intrinsic::x86_tileloaddrs64_internal:
5182 Opc = X86::PTILELOADDRSV;
5183 break;
5184 case Intrinsic::x86_tileloaddrst164_internal:
5185 Opc = X86::PTILELOADDRST1V;
5186 break;
5187 case Intrinsic::x86_tileloadd64_internal:
5188 Opc = X86::PTILELOADDV;
5189 break;
5190 case Intrinsic::x86_tileloaddt164_internal:
5191 Opc = X86::PTILELOADDT1V;
5192 break;
5194 // _tile_loadd_internal(row, col, buf, STRIDE)
5195 SDValue Base = Node->getOperand(4);
5196 SDValue Scale = getI8Imm(1, dl);
5197 SDValue Index = Node->getOperand(5);
5198 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5199 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5200 SDValue Chain = Node->getOperand(0);
5201 MachineSDNode *CNode;
5202 SDValue Ops[] = {Node->getOperand(2),
5203 Node->getOperand(3),
5204 Base,
5205 Scale,
5206 Index,
5207 Disp,
5208 Segment,
5209 Chain};
5210 CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
5211 ReplaceNode(Node, CNode);
5212 return;
5215 break;
5217 case ISD::INTRINSIC_VOID: {
5218 unsigned IntNo = Node->getConstantOperandVal(1);
5219 switch (IntNo) {
5220 default: break;
5221 case Intrinsic::x86_sse3_monitor:
5222 case Intrinsic::x86_monitorx:
5223 case Intrinsic::x86_clzero: {
5224 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
5226 unsigned Opc = 0;
5227 switch (IntNo) {
5228 default: llvm_unreachable("Unexpected intrinsic!");
5229 case Intrinsic::x86_sse3_monitor:
5230 if (!Subtarget->hasSSE3())
5231 break;
5232 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
5233 break;
5234 case Intrinsic::x86_monitorx:
5235 if (!Subtarget->hasMWAITX())
5236 break;
5237 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
5238 break;
5239 case Intrinsic::x86_clzero:
5240 if (!Subtarget->hasCLZERO())
5241 break;
5242 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
5243 break;
5246 if (Opc) {
5247 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
5248 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
5249 Node->getOperand(2), SDValue());
5250 SDValue InGlue = Chain.getValue(1);
5252 if (IntNo == Intrinsic::x86_sse3_monitor ||
5253 IntNo == Intrinsic::x86_monitorx) {
5254 // Copy the other two operands to ECX and EDX.
5255 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
5256 InGlue);
5257 InGlue = Chain.getValue(1);
5258 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
5259 InGlue);
5260 InGlue = Chain.getValue(1);
5263 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
5264 { Chain, InGlue});
5265 ReplaceNode(Node, CNode);
5266 return;
5269 break;
5271 case Intrinsic::x86_tilestored64_internal: {
5272 auto *MFI =
5273 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5274 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
5275 unsigned Opc = X86::PTILESTOREDV;
5276 // _tile_stored_internal(row, col, buf, STRIDE, c)
5277 SDValue Base = Node->getOperand(4);
5278 SDValue Scale = getI8Imm(1, dl);
5279 SDValue Index = Node->getOperand(5);
5280 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5281 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5282 SDValue Chain = Node->getOperand(0);
5283 MachineSDNode *CNode;
5284 SDValue Ops[] = {Node->getOperand(2),
5285 Node->getOperand(3),
5286 Base,
5287 Scale,
5288 Index,
5289 Disp,
5290 Segment,
5291 Node->getOperand(6),
5292 Chain};
5293 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5294 ReplaceNode(Node, CNode);
5295 return;
5297 case Intrinsic::x86_tileloaddrs64:
5298 case Intrinsic::x86_tileloaddrst164:
5299 if (!Subtarget->hasAMXMOVRS())
5300 break;
5301 [[fallthrough]];
5302 case Intrinsic::x86_tileloadd64:
5303 case Intrinsic::x86_tileloaddt164:
5304 case Intrinsic::x86_tilestored64: {
5305 if (!Subtarget->hasAMXTILE())
5306 break;
5307 auto *MFI =
5308 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5309 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5310 unsigned Opc;
5311 switch (IntNo) {
5312 default: llvm_unreachable("Unexpected intrinsic!");
5313 case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
5314 case Intrinsic::x86_tileloaddrs64:
5315 Opc = X86::PTILELOADDRS;
5316 break;
5317 case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
5318 case Intrinsic::x86_tileloaddrst164:
5319 Opc = X86::PTILELOADDRST1;
5320 break;
5321 case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
5323 // FIXME: Match displacement and scale.
5324 unsigned TIndex = Node->getConstantOperandVal(2);
5325 SDValue TReg = getI8Imm(TIndex, dl);
5326 SDValue Base = Node->getOperand(3);
5327 SDValue Scale = getI8Imm(1, dl);
5328 SDValue Index = Node->getOperand(4);
5329 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5330 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5331 SDValue Chain = Node->getOperand(0);
5332 MachineSDNode *CNode;
5333 if (Opc == X86::PTILESTORED) {
5334 SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
5335 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5336 } else {
5337 SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
5338 CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5340 ReplaceNode(Node, CNode);
5341 return;
5343 case Intrinsic::x86_t2rpntlvwz0rs:
5344 case Intrinsic::x86_t2rpntlvwz0rst1:
5345 case Intrinsic::x86_t2rpntlvwz1rs:
5346 case Intrinsic::x86_t2rpntlvwz1rst1:
5347 if (!Subtarget->hasAMXMOVRS())
5348 break;
5349 [[fallthrough]];
5350 case Intrinsic::x86_t2rpntlvwz0:
5351 case Intrinsic::x86_t2rpntlvwz0t1:
5352 case Intrinsic::x86_t2rpntlvwz1:
5353 case Intrinsic::x86_t2rpntlvwz1t1: {
5354 if (!Subtarget->hasAMXTRANSPOSE())
5355 break;
5356 auto *MFI =
5357 CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
5358 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
5359 unsigned Opc;
5360 switch (IntNo) {
5361 default:
5362 llvm_unreachable("Unexpected intrinsic!");
5363 case Intrinsic::x86_t2rpntlvwz0:
5364 Opc = X86::PT2RPNTLVWZ0;
5365 break;
5366 case Intrinsic::x86_t2rpntlvwz0t1:
5367 Opc = X86::PT2RPNTLVWZ0T1;
5368 break;
5369 case Intrinsic::x86_t2rpntlvwz1:
5370 Opc = X86::PT2RPNTLVWZ1;
5371 break;
5372 case Intrinsic::x86_t2rpntlvwz1t1:
5373 Opc = X86::PT2RPNTLVWZ1T1;
5374 break;
5375 case Intrinsic::x86_t2rpntlvwz0rs:
5376 Opc = X86::PT2RPNTLVWZ0RS;
5377 break;
5378 case Intrinsic::x86_t2rpntlvwz0rst1:
5379 Opc = X86::PT2RPNTLVWZ0RST1;
5380 break;
5381 case Intrinsic::x86_t2rpntlvwz1rs:
5382 Opc = X86::PT2RPNTLVWZ1RS;
5383 break;
5384 case Intrinsic::x86_t2rpntlvwz1rst1:
5385 Opc = X86::PT2RPNTLVWZ1RST1;
5386 break;
5388 // FIXME: Match displacement and scale.
5389 unsigned TIndex = Node->getConstantOperandVal(2);
5390 SDValue TReg = getI8Imm(TIndex, dl);
5391 SDValue Base = Node->getOperand(3);
5392 SDValue Scale = getI8Imm(1, dl);
5393 SDValue Index = Node->getOperand(4);
5394 SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
5395 SDValue Segment = CurDAG->getRegister(0, MVT::i16);
5396 SDValue Chain = Node->getOperand(0);
5397 SDValue Ops[] = {TReg, Base, Scale, Index, Disp, Segment, Chain};
5398 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
5399 ReplaceNode(Node, CNode);
5400 return;
5403 break;
5405 case ISD::BRIND:
5406 case X86ISD::NT_BRIND: {
5407 if (Subtarget->isTargetNaCl())
5408 // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
5409 // leave the instruction alone.
5410 break;
5411 if (Subtarget->isTarget64BitILP32()) {
5412 // Converts a 32-bit register to a 64-bit, zero-extended version of
5413 // it. This is needed because x86-64 can do many things, but jmp %r32
5414 // ain't one of them.
5415 SDValue Target = Node->getOperand(1);
5416 assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
5417 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
5418 SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
5419 Node->getOperand(0), ZextTarget);
5420 ReplaceNode(Node, Brind.getNode());
5421 SelectCode(ZextTarget.getNode());
5422 SelectCode(Brind.getNode());
5423 return;
5425 break;
5427 case X86ISD::GlobalBaseReg:
5428 ReplaceNode(Node, getGlobalBaseReg());
5429 return;
5431 case ISD::BITCAST:
5432 // Just drop all 128/256/512-bit bitcasts.
5433 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
5434 NVT == MVT::f128) {
5435 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
5436 CurDAG->RemoveDeadNode(Node);
5437 return;
5439 break;
5441 case ISD::SRL:
5442 if (matchBitExtract(Node))
5443 return;
5444 [[fallthrough]];
5445 case ISD::SRA:
5446 case ISD::SHL:
5447 if (tryShiftAmountMod(Node))
5448 return;
5449 break;
5451 case X86ISD::VPTERNLOG: {
5452 uint8_t Imm = Node->getConstantOperandVal(3);
5453 if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
5454 Node->getOperand(1), Node->getOperand(2), Imm))
5455 return;
5456 break;
5459 case X86ISD::ANDNP:
5460 if (tryVPTERNLOG(Node))
5461 return;
5462 break;
5464 case ISD::AND:
5465 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
5466 // Try to form a masked VPTESTM. Operands can be in either order.
5467 SDValue N0 = Node->getOperand(0);
5468 SDValue N1 = Node->getOperand(1);
5469 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
5470 tryVPTESTM(Node, N0, N1))
5471 return;
5472 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
5473 tryVPTESTM(Node, N1, N0))
5474 return;
5477 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
5478 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
5479 CurDAG->RemoveDeadNode(Node);
5480 return;
5482 if (matchBitExtract(Node))
5483 return;
5484 if (AndImmShrink && shrinkAndImmediate(Node))
5485 return;
5487 [[fallthrough]];
5488 case ISD::OR:
5489 case ISD::XOR:
5490 if (tryShrinkShlLogicImm(Node))
5491 return;
5492 if (Opcode == ISD::OR && tryMatchBitSelect(Node))
5493 return;
5494 if (tryVPTERNLOG(Node))
5495 return;
5497 [[fallthrough]];
5498 case ISD::ADD:
5499 if (Opcode == ISD::ADD && matchBitExtract(Node))
5500 return;
5501 [[fallthrough]];
5502 case ISD::SUB: {
5503 // Try to avoid folding immediates with multiple uses for optsize.
5504 // This code tries to select to register form directly to avoid going
5505 // through the isel table which might fold the immediate. We can't change
5506 // the add/sub/and/or/xor with immediate patterns in the
5507 // tablegen files to check immediate use count without making the patterns
5508 // unavailable to the fast-isel table.
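  // For example (illustrative): if the same imm32 feeds several adds, it is
  // smaller to materialize it once in a register and use ADD32rr than to
  // re-encode the 32-bit immediate in every ADD32ri.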
5509 if (!CurDAG->shouldOptForSize())
5510 break;
5512 // Only handle i8/i16/i32/i64.
5513 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
5514 break;
5516 SDValue N0 = Node->getOperand(0);
5517 SDValue N1 = Node->getOperand(1);
5519 auto *Cst = dyn_cast<ConstantSDNode>(N1);
5520 if (!Cst)
5521 break;
5523 int64_t Val = Cst->getSExtValue();
5525 // Make sure it's an immediate that is considered foldable.
5526 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
5527 if (!isInt<8>(Val) && !isInt<32>(Val))
5528 break;
5530 // If this can match to INC/DEC, let it go.
5531 if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
5532 break;
5534 // Check if we should avoid folding this immediate.
5535 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
5536 break;
5538 // We should not fold the immediate. So we need a register form instead.
5539 unsigned ROpc, MOpc;
5540 switch (NVT.SimpleTy) {
5541 default: llvm_unreachable("Unexpected VT!");
5542 case MVT::i8:
5543 switch (Opcode) {
5544 default: llvm_unreachable("Unexpected opcode!");
5545 case ISD::ADD:
5546 ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
5547 MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
5548 break;
5549 case ISD::SUB:
5550 ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
5551 MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
5552 break;
5553 case ISD::AND:
5554 ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
5555 MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
5556 break;
5557 case ISD::OR:
5558 ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
5559 MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
5560 break;
5561 case ISD::XOR:
5562 ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
5563 MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
5564 break;
5566 break;
5567 case MVT::i16:
5568 switch (Opcode) {
5569 default: llvm_unreachable("Unexpected opcode!");
5570 case ISD::ADD:
5571 ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
5572 MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
5573 break;
5574 case ISD::SUB:
5575 ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
5576 MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
5577 break;
5578 case ISD::AND:
5579 ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
5580 MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
5581 break;
5582 case ISD::OR:
5583 ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
5584 MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
5585 break;
5586 case ISD::XOR:
5587 ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
5588 MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
5589 break;
5591 break;
5592 case MVT::i32:
5593 switch (Opcode) {
5594 default: llvm_unreachable("Unexpected opcode!");
5595 case ISD::ADD:
5596 ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
5597 MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
5598 break;
5599 case ISD::SUB:
5600 ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
5601 MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
5602 break;
5603 case ISD::AND:
5604 ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
5605 MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
5606 break;
5607 case ISD::OR:
5608 ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
5609 MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
5610 break;
5611 case ISD::XOR:
5612 ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
5613 MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
5614 break;
5616 break;
5617 case MVT::i64:
5618 switch (Opcode) {
5619 default: llvm_unreachable("Unexpected opcode!");
5620 case ISD::ADD:
5621 ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
5622 MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
5623 break;
5624 case ISD::SUB:
5625 ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
5626 MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
5627 break;
5628 case ISD::AND:
5629 ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
5630 MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
5631 break;
5632 case ISD::OR:
5633 ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
5634 MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
5635 break;
5636 case ISD::XOR:
5637 ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
5638 MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
5639 break;
5641 break;
5644 // OK, this is an AND/OR/XOR/ADD/SUB with a constant.
5646 // If this is not a subtract, we can still try to fold a load.
5647 if (Opcode != ISD::SUB) {
5648 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5649 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5650 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5651 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5652 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5653 // Update the chain.
5654 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
5655 // Record the mem-refs
5656 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
5657 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5658 CurDAG->RemoveDeadNode(Node);
5659 return;
5663 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
5664 return;
5667 case X86ISD::SMUL:
5668 // i16/i32/i64 are handled with isel patterns.
5669 if (NVT != MVT::i8)
5670 break;
5671 [[fallthrough]];
5672 case X86ISD::UMUL: {
5673 SDValue N0 = Node->getOperand(0);
5674 SDValue N1 = Node->getOperand(1);
5676 unsigned LoReg, ROpc, MOpc;
5677 switch (NVT.SimpleTy) {
5678 default: llvm_unreachable("Unsupported VT!");
5679 case MVT::i8:
5680 LoReg = X86::AL;
5681 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
5682 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
5683 break;
5684 case MVT::i16:
5685 LoReg = X86::AX;
5686 ROpc = X86::MUL16r;
5687 MOpc = X86::MUL16m;
5688 break;
5689 case MVT::i32:
5690 LoReg = X86::EAX;
5691 ROpc = X86::MUL32r;
5692 MOpc = X86::MUL32m;
5693 break;
5694 case MVT::i64:
5695 LoReg = X86::RAX;
5696 ROpc = X86::MUL64r;
5697 MOpc = X86::MUL64m;
5698 break;
5701 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5702 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5703 // Multiply is commutative.
5704 if (!FoldedLoad) {
5705 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5706 if (FoldedLoad)
5707 std::swap(N0, N1);
5710 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5711 N0, SDValue()).getValue(1);
5713 MachineSDNode *CNode;
5714 if (FoldedLoad) {
5715 // i16/i32/i64 use an instruction that produces a low and high result even
5716 // though only the low result is used.
5717 SDVTList VTs;
5718 if (NVT == MVT::i8)
5719 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
5720 else
5721 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
5723 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5724 InGlue };
5725 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5727 // Update the chain.
5728 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
5729 // Record the mem-refs
5730 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5731 } else {
5732 // i16/i32/i64 use an instruction that produces a low and high result even
5733 // though only the low result is used.
5734 SDVTList VTs;
5735 if (NVT == MVT::i8)
5736 VTs = CurDAG->getVTList(NVT, MVT::i32);
5737 else
5738 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
5740 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InGlue});
5743 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
5744 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
5745 CurDAG->RemoveDeadNode(Node);
5746 return;
5749 case ISD::SMUL_LOHI:
5750 case ISD::UMUL_LOHI: {
5751 SDValue N0 = Node->getOperand(0);
5752 SDValue N1 = Node->getOperand(1);
5754 unsigned Opc, MOpc;
5755 unsigned LoReg, HiReg;
5756 bool IsSigned = Opcode == ISD::SMUL_LOHI;
5757 bool UseMULX = !IsSigned && Subtarget->hasBMI2();
5758 bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
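    // Illustrative note: MULX reads one source implicitly from EDX/RDX and
    // writes both halves to explicit destinations, so when UseMULX is set the
    // first operand is copied into EDX/RDX below (LoReg) rather than EAX/RAX,
    // and the MULX*H forms are chosen when only the high half has uses.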
5759 switch (NVT.SimpleTy) {
5760 default: llvm_unreachable("Unsupported VT!");
5761 case MVT::i32:
5762 Opc = UseMULXHi ? X86::MULX32Hrr
5763 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
5764 : IsSigned ? X86::IMUL32r
5765 : X86::MUL32r;
5766 MOpc = UseMULXHi ? X86::MULX32Hrm
5767 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
5768 : IsSigned ? X86::IMUL32m
5769 : X86::MUL32m;
5770 LoReg = UseMULX ? X86::EDX : X86::EAX;
5771 HiReg = X86::EDX;
5772 break;
5773 case MVT::i64:
5774 Opc = UseMULXHi ? X86::MULX64Hrr
5775 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
5776 : IsSigned ? X86::IMUL64r
5777 : X86::MUL64r;
5778 MOpc = UseMULXHi ? X86::MULX64Hrm
5779 : UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
5780 : IsSigned ? X86::IMUL64m
5781 : X86::MUL64m;
5782 LoReg = UseMULX ? X86::RDX : X86::RAX;
5783 HiReg = X86::RDX;
5784 break;
5787 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5788 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5789 // Multiply is commutative.
5790 if (!foldedLoad) {
5791 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5792 if (foldedLoad)
5793 std::swap(N0, N1);
5796 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
5797 N0, SDValue()).getValue(1);
5798 SDValue ResHi, ResLo;
5799 if (foldedLoad) {
5800 SDValue Chain;
5801 MachineSDNode *CNode = nullptr;
5802 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5803 InGlue };
5804 if (UseMULXHi) {
5805 SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
5806 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5807 ResHi = SDValue(CNode, 0);
5808 Chain = SDValue(CNode, 1);
5809 } else if (UseMULX) {
5810 SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
5811 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5812 ResHi = SDValue(CNode, 0);
5813 ResLo = SDValue(CNode, 1);
5814 Chain = SDValue(CNode, 2);
5815 } else {
5816 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
5817 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
5818 Chain = SDValue(CNode, 0);
5819 InGlue = SDValue(CNode, 1);
5822 // Update the chain.
5823 ReplaceUses(N1.getValue(1), Chain);
5824 // Record the mem-refs
5825 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
5826 } else {
5827 SDValue Ops[] = { N1, InGlue };
5828 if (UseMULXHi) {
5829 SDVTList VTs = CurDAG->getVTList(NVT);
5830 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5831 ResHi = SDValue(CNode, 0);
5832 } else if (UseMULX) {
5833 SDVTList VTs = CurDAG->getVTList(NVT, NVT);
5834 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5835 ResHi = SDValue(CNode, 0);
5836 ResLo = SDValue(CNode, 1);
5837 } else {
5838 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
5839 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
5840 InGlue = SDValue(CNode, 0);
5844 // Copy the low half of the result, if it is needed.
5845 if (!SDValue(Node, 0).use_empty()) {
5846 if (!ResLo) {
5847 assert(LoReg && "Register for low half is not defined!");
5848 ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
5849 NVT, InGlue);
5850 InGlue = ResLo.getValue(2);
5852 ReplaceUses(SDValue(Node, 0), ResLo);
5853 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
5854 dbgs() << '\n');
5856 // Copy the high half of the result, if it is needed.
5857 if (!SDValue(Node, 1).use_empty()) {
5858 if (!ResHi) {
5859 assert(HiReg && "Register for high half is not defined!");
5860 ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
5861 NVT, InGlue);
5862 InGlue = ResHi.getValue(2);
5864 ReplaceUses(SDValue(Node, 1), ResHi);
5865 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
5866 dbgs() << '\n');
5869 CurDAG->RemoveDeadNode(Node);
5870 return;
5873 case ISD::SDIVREM:
5874 case ISD::UDIVREM: {
5875 SDValue N0 = Node->getOperand(0);
5876 SDValue N1 = Node->getOperand(1);
5878 unsigned ROpc, MOpc;
5879 bool isSigned = Opcode == ISD::SDIVREM;
5880 if (!isSigned) {
5881 switch (NVT.SimpleTy) {
5882 default: llvm_unreachable("Unsupported VT!");
5883 case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
5884 case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
5885 case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
5886 case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
5888 } else {
5889 switch (NVT.SimpleTy) {
5890 default: llvm_unreachable("Unsupported VT!");
5891 case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
5892 case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
5893 case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
5894 case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
5898 unsigned LoReg, HiReg, ClrReg;
5899 unsigned SExtOpcode;
5900 switch (NVT.SimpleTy) {
5901 default: llvm_unreachable("Unsupported VT!");
5902 case MVT::i8:
5903 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
5904 SExtOpcode = 0; // Not used.
5905 break;
5906 case MVT::i16:
5907 LoReg = X86::AX; HiReg = X86::DX;
5908 ClrReg = X86::DX;
5909 SExtOpcode = X86::CWD;
5910 break;
5911 case MVT::i32:
5912 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
5913 SExtOpcode = X86::CDQ;
5914 break;
5915 case MVT::i64:
5916 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
5917 SExtOpcode = X86::CQO;
5918 break;
5921 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
5922 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
5923 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
5925 SDValue InGlue;
5926 if (NVT == MVT::i8) {
5927 // Special case for div8, just use a move with zero extension to AX to
5928 // clear the upper 8 bits (AH).
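      // For illustration: for an i8 udiv/urem, AX is loaded with the extended
      // dividend, DIV8r divides AX by the 8-bit divisor, and AL/AH then hold
      // the quotient and remainder respectively.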
5929 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
5930 MachineSDNode *Move;
5931 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
5932 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
5933 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
5934 : X86::MOVZX16rm8;
5935 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
5936 Chain = SDValue(Move, 1);
5937 ReplaceUses(N0.getValue(1), Chain);
5938 // Record the mem-refs
5939 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
5940 } else {
5941 unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
5942 : X86::MOVZX16rr8;
5943 Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
5944 Chain = CurDAG->getEntryNode();
5946 Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
5947 SDValue());
5948 InGlue = Chain.getValue(1);
5949 } else {
5950 InGlue =
5951 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
5952 LoReg, N0, SDValue()).getValue(1);
5953 if (isSigned && !signBitIsZero) {
5954 // Sign extend the low part into the high part.
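        // (Illustrative: for i32 this is CDQ, which sign-extends EAX into
        // EDX:EAX ahead of IDIV; CWD/CQO play the same role for i16/i64.)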
5955 InGlue =
5956 SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InGlue),0);
5957 } else {
5958 // Zero out the high part, effectively zero extending the input.
5959 SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
5960 SDValue ClrNode =
5961 SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, {}), 0);
5962 switch (NVT.SimpleTy) {
5963 case MVT::i16:
5964 ClrNode =
5965 SDValue(CurDAG->getMachineNode(
5966 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
5967 CurDAG->getTargetConstant(X86::sub_16bit, dl,
5968 MVT::i32)),
5970 break;
5971 case MVT::i32:
5972 break;
5973 case MVT::i64:
5974 ClrNode =
5975 SDValue(CurDAG->getMachineNode(
5976 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
5977 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
5978 CurDAG->getTargetConstant(X86::sub_32bit, dl,
5979 MVT::i32)),
5981 break;
5982 default:
5983 llvm_unreachable("Unexpected division source");
5986 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
5987 ClrNode, InGlue).getValue(1);
5991 if (foldedLoad) {
5992 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
5993 InGlue };
5994 MachineSDNode *CNode =
5995 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
5996 InGlue = SDValue(CNode, 1);
5997 // Update the chain.
5998 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
5999 // Record the mem-refs
6000 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
6001 } else {
6002 InGlue =
6003 SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InGlue), 0);
6006 // Prevent use of AH in a REX instruction by explicitly copying it to
6007 // an ABCD_L register.
6009 // The current assumption of the register allocator is that isel
6010 // won't generate explicit references to the GR8_ABCD_H registers. If
6011 // the allocator and/or the backend get enhanced to be more robust in
6012 // that regard, this can be, and should be, removed.
6013 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
6014 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
6015 unsigned AHExtOpcode =
6016 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
6018 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
6019 MVT::Glue, AHCopy, InGlue);
6020 SDValue Result(RNode, 0);
6021 InGlue = SDValue(RNode, 1);
6023 Result =
6024 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
6026 ReplaceUses(SDValue(Node, 1), Result);
6027 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6028 dbgs() << '\n');
6030 // Copy the division (low) result, if it is needed.
6031 if (!SDValue(Node, 0).use_empty()) {
6032 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6033 LoReg, NVT, InGlue);
6034 InGlue = Result.getValue(2);
6035 ReplaceUses(SDValue(Node, 0), Result);
6036 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6037 dbgs() << '\n');
6039 // Copy the remainder (high) result, if it is needed.
6040 if (!SDValue(Node, 1).use_empty()) {
6041 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
6042 HiReg, NVT, InGlue);
6043 InGlue = Result.getValue(2);
6044 ReplaceUses(SDValue(Node, 1), Result);
6045 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
6046 dbgs() << '\n');
6048 CurDAG->RemoveDeadNode(Node);
6049 return;
6052 case X86ISD::FCMP:
6053 case X86ISD::STRICT_FCMP:
6054 case X86ISD::STRICT_FCMPS: {
6055 bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
6056 Node->getOpcode() == X86ISD::STRICT_FCMPS;
6057 SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
6058 SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
6060 // Save the original VT of the compare.
6061 MVT CmpVT = N0.getSimpleValueType();
6063 // Floating point needs special handling if we don't have FCOMI.
6064 if (Subtarget->canUseCMOV())
6065 break;
6067 bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
6069 unsigned Opc;
6070 switch (CmpVT.SimpleTy) {
6071 default: llvm_unreachable("Unexpected type!");
6072 case MVT::f32:
6073 Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
6074 break;
6075 case MVT::f64:
6076 Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
6077 break;
6078 case MVT::f80:
6079 Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
6080 break;
6083 SDValue Chain =
6084 IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
6085 SDValue Glue;
6086 if (IsStrictCmp) {
6087 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
6088 Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
6089 Glue = Chain.getValue(1);
6090 } else {
6091 Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
6094 // Move FPSW to AX.
6095 SDValue FNSTSW =
6096 SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);
6098 // Extract upper 8-bits of AX.
6099 SDValue Extract =
6100 CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
6102 // Move AH into flags.
6103 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
6104 assert(Subtarget->canUseLAHFSAHF() &&
6105 "Target doesn't support SAHF or FCOMI?");
6106 SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
6107 Chain = AH;
6108 SDValue SAHF = SDValue(
6109 CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
6111 if (IsStrictCmp)
6112 ReplaceUses(SDValue(Node, 1), Chain);
6114 ReplaceUses(SDValue(Node, 0), SAHF);
6115 CurDAG->RemoveDeadNode(Node);
6116 return;
6119 case X86ISD::CMP: {
6120 SDValue N0 = Node->getOperand(0);
6121 SDValue N1 = Node->getOperand(1);
6123 // Optimizations for TEST compares.
6124 if (!isNullConstant(N1))
6125 break;
6127 // Save the original VT of the compare.
6128 MVT CmpVT = N0.getSimpleValueType();
6130 // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
6131 // by a test instruction. The test should be removed later by
6132 // analyzeCompare if we are using only the zero flag.
6133 // TODO: Should we check the users and use the BEXTR flags directly?
6134 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
6135 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
6136 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
6137 : X86::TEST32rr;
6138 SDValue BEXTR = SDValue(NewNode, 0);
6139 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
6140 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6141 CurDAG->RemoveDeadNode(Node);
6142 return;
6146 // We can peek through truncates, but we need to be careful below.
6147 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
6148 N0 = N0.getOperand(0);
6150 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
6151 // use a smaller encoding.
6152 // Look past the truncate if CMP is the only use of it.
6153 if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
6154 N0.getValueType() != MVT::i8) {
6155 auto *MaskC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6156 if (!MaskC)
6157 break;
6159 // We may have looked through a truncate so mask off any bits that
6160 // shouldn't be part of the compare.
6161 uint64_t Mask = MaskC->getZExtValue();
6162 Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());
6164 // Check if we can replace AND+IMM{32,64} with a shift. This is possible
6165 // for masks like 0xFF000000 or 0x00FFFFFF and if we care only about the
6166 // zero flag.
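      // For illustration: with a 64-bit mask of 0xFF000000 (an 8-bit-wide
      // shifted mask), the code below should emit SHR64ri by 24 followed by
      // TEST8rr on the low sub-register, avoiding a movabsq of the immediate.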
6167 if (CmpVT == MVT::i64 && !isInt<8>(Mask) && isShiftedMask_64(Mask) &&
6168 onlyUsesZeroFlag(SDValue(Node, 0))) {
6169 unsigned ShiftOpcode = ISD::DELETED_NODE;
6170 unsigned ShiftAmt;
6171 unsigned SubRegIdx;
6172 MVT SubRegVT;
6173 unsigned TestOpcode;
6174 unsigned LeadingZeros = llvm::countl_zero(Mask);
6175 unsigned TrailingZeros = llvm::countr_zero(Mask);
6177 // With leading/trailing zeros, the transform is profitable if we can
6178 // eliminate a movabsq or shrink a 32-bit immediate to 8-bit without
6179 // incurring any extra register moves.
6180 bool SavesBytes = !isInt<32>(Mask) || N0.getOperand(0).hasOneUse();
6181 if (LeadingZeros == 0 && SavesBytes) {
6182 // If the mask covers the most significant bit, then we can replace
6183 // TEST+AND with a SHR and check eflags.
6184 // This emits a redundant TEST which is subsequently eliminated.
6185 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6186 ShiftAmt = TrailingZeros;
6187 SubRegIdx = 0;
6188 TestOpcode = X86::TEST64rr;
6189 } else if (TrailingZeros == 0 && SavesBytes) {
6190 // If the mask covers the least significant bit, then we can replace
6191 // TEST+AND with a SHL and check eflags.
6192 // This emits a redundant TEST which is subsequently eliminated.
6193 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHL64ri);
6194 ShiftAmt = LeadingZeros;
6195 SubRegIdx = 0;
6196 TestOpcode = X86::TEST64rr;
6197 } else if (MaskC->hasOneUse() && !isInt<32>(Mask)) {
6198 // If the shifted mask extends into the high half and is 8/16/32 bits
6199 // wide, then replace it with a SHR and a TEST8rr/TEST16rr/TEST32rr.
6200 unsigned PopCount = 64 - LeadingZeros - TrailingZeros;
6201 if (PopCount == 8) {
6202 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6203 ShiftAmt = TrailingZeros;
6204 SubRegIdx = X86::sub_8bit;
6205 SubRegVT = MVT::i8;
6206 TestOpcode = X86::TEST8rr;
6207 } else if (PopCount == 16) {
6208 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6209 ShiftAmt = TrailingZeros;
6210 SubRegIdx = X86::sub_16bit;
6211 SubRegVT = MVT::i16;
6212 TestOpcode = X86::TEST16rr;
6213 } else if (PopCount == 32) {
6214 ShiftOpcode = GET_ND_IF_ENABLED(X86::SHR64ri);
6215 ShiftAmt = TrailingZeros;
6216 SubRegIdx = X86::sub_32bit;
6217 SubRegVT = MVT::i32;
6218 TestOpcode = X86::TEST32rr;
6221 if (ShiftOpcode != ISD::DELETED_NODE) {
6222 SDValue ShiftC = CurDAG->getTargetConstant(ShiftAmt, dl, MVT::i64);
6223 SDValue Shift = SDValue(
6224 CurDAG->getMachineNode(ShiftOpcode, dl, MVT::i64, MVT::i32,
6225 N0.getOperand(0), ShiftC),
6227 if (SubRegIdx != 0) {
6228 Shift =
6229 CurDAG->getTargetExtractSubreg(SubRegIdx, dl, SubRegVT, Shift);
6231 MachineSDNode *Test =
6232 CurDAG->getMachineNode(TestOpcode, dl, MVT::i32, Shift, Shift);
6233 ReplaceNode(Node, Test);
6234 return;
6238 MVT VT;
6239 int SubRegOp;
6240 unsigned ROpc, MOpc;
6242 // For each of these checks we need to be careful if the sign flag is
6243 // being used. It is only safe to use the sign flag under two conditions:
6244 // either the sign bit in the shrunken mask is zero or the final test
6245 // size is equal to the original compare size.
6247 if (isUInt<8>(Mask) &&
6248 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
6249 hasNoSignFlagUses(SDValue(Node, 0)))) {
6250 // For example, convert "testl %eax, $8" to "testb %al, $8"
6251 VT = MVT::i8;
6252 SubRegOp = X86::sub_8bit;
6253 ROpc = X86::TEST8ri;
6254 MOpc = X86::TEST8mi;
6255 } else if (OptForMinSize && isUInt<16>(Mask) &&
6256 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
6257 hasNoSignFlagUses(SDValue(Node, 0)))) {
6258 // For example, "testl %eax, $32776" to "testw %ax, $32776".
6259 // NOTE: We only want to form TESTW instructions if optimizing for
6260 // min size. Otherwise we only save one byte and possibly get a length
6261 // changing prefix penalty in the decoders.
6262 VT = MVT::i16;
6263 SubRegOp = X86::sub_16bit;
6264 ROpc = X86::TEST16ri;
6265 MOpc = X86::TEST16mi;
6266 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
6267 ((!(Mask & 0x80000000) &&
6268 // Without minsize 16-bit Cmps can get here so we need to
6269 // be sure we calculate the correct sign flag if needed.
6270 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
6271 CmpVT == MVT::i32 ||
6272 hasNoSignFlagUses(SDValue(Node, 0)))) {
6273 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
6274 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
6275 // Otherwise, we find ourselves in a position where we have to do
6276 // promotion. If previous passes did not promote the and, we assume
6277 // they had a good reason not to and do not promote here.
6278 VT = MVT::i32;
6279 SubRegOp = X86::sub_32bit;
6280 ROpc = X86::TEST32ri;
6281 MOpc = X86::TEST32mi;
6282 } else {
6283 // No eligible transformation was found.
6284 break;
6287 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
6288 SDValue Reg = N0.getOperand(0);
6290 // Emit a testl or testw.
6291 MachineSDNode *NewNode;
6292 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
6293 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
6294 if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
6295 if (!LoadN->isSimple()) {
6296 unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
6297 if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
6298 (MOpc == X86::TEST16mi && NumVolBits != 16) ||
6299 (MOpc == X86::TEST32mi && NumVolBits != 32))
6300 break;
6303 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
6304 Reg.getOperand(0) };
6305 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
6306 // Update the chain.
6307 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
6308 // Record the mem-refs
6309 CurDAG->setNodeMemRefs(NewNode,
6310 {cast<LoadSDNode>(Reg)->getMemOperand()});
6311 } else {
6312 // Extract the subregister if necessary.
6313 if (N0.getValueType() != VT)
6314 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
6316 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
6318 // Replace CMP with TEST.
6319 ReplaceNode(Node, NewNode);
6320 return;
6322 break;
6324 case X86ISD::PCMPISTR: {
6325 if (!Subtarget->hasSSE42())
6326 break;
6328 bool NeedIndex = !SDValue(Node, 0).use_empty();
6329 bool NeedMask = !SDValue(Node, 1).use_empty();
6330 // We can't fold a load if we are going to make two instructions.
6331 bool MayFoldLoad = !NeedIndex || !NeedMask;
6333 MachineSDNode *CNode;
6334 if (NeedMask) {
6335 unsigned ROpc =
6336 Subtarget->hasAVX() ? X86::VPCMPISTRMrri : X86::PCMPISTRMrri;
6337 unsigned MOpc =
6338 Subtarget->hasAVX() ? X86::VPCMPISTRMrmi : X86::PCMPISTRMrmi;
6339 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
6340 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6342 if (NeedIndex || !NeedMask) {
6343 unsigned ROpc =
6344 Subtarget->hasAVX() ? X86::VPCMPISTRIrri : X86::PCMPISTRIrri;
6345 unsigned MOpc =
6346 Subtarget->hasAVX() ? X86::VPCMPISTRIrmi : X86::PCMPISTRIrmi;
6347 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
6348 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6351 // Connect the flag usage to the last instruction created.
6352 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6353 CurDAG->RemoveDeadNode(Node);
6354 return;
6356 case X86ISD::PCMPESTR: {
6357 if (!Subtarget->hasSSE42())
6358 break;
6360 // Copy the two implicit register inputs.
6361 SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
6362 Node->getOperand(1),
6363 SDValue()).getValue(1);
6364 InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
6365 Node->getOperand(3), InGlue).getValue(1);
6367 bool NeedIndex = !SDValue(Node, 0).use_empty();
6368 bool NeedMask = !SDValue(Node, 1).use_empty();
6369 // We can't fold a load if we are going to make two instructions.
6370 bool MayFoldLoad = !NeedIndex || !NeedMask;
6372 MachineSDNode *CNode;
6373 if (NeedMask) {
6374 unsigned ROpc =
6375 Subtarget->hasAVX() ? X86::VPCMPESTRMrri : X86::PCMPESTRMrri;
6376 unsigned MOpc =
6377 Subtarget->hasAVX() ? X86::VPCMPESTRMrmi : X86::PCMPESTRMrmi;
6378 CNode =
6379 emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InGlue);
6380 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
6382 if (NeedIndex || !NeedMask) {
6383 unsigned ROpc =
6384 Subtarget->hasAVX() ? X86::VPCMPESTRIrri : X86::PCMPESTRIrri;
6385 unsigned MOpc =
6386 Subtarget->hasAVX() ? X86::VPCMPESTRIrmi : X86::PCMPESTRIrmi;
6387 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InGlue);
6388 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
6390 // Connect the flag usage to the last instruction created.
6391 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
6392 CurDAG->RemoveDeadNode(Node);
6393 return;
6396 case ISD::SETCC: {
6397 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
6398 return;
6400 break;
6403 case ISD::STORE:
6404 if (foldLoadStoreIntoMemOperand(Node))
6405 return;
6406 break;
6408 case X86ISD::SETCC_CARRY: {
6409 MVT VT = Node->getSimpleValueType(0);
6410 SDValue Result;
6411 if (Subtarget->hasSBBDepBreaking()) {
6412 // We have to do this manually because tblgen will put the eflags copy in
6413 // the wrong place if we use an extract_subreg in the pattern.
6414 // Copy flags to the EFLAGS register and glue it to next node.
6415 SDValue EFLAGS =
6416 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
6417 Node->getOperand(1), SDValue());
6419 // Create a 64-bit instruction if the result is 64-bits otherwise use the
6420 // 32-bit version.
6421 unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
6422 MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
6423 Result = SDValue(
6424 CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
6425 0);
6426 } else {
6427 // The target does not recognize sbb with the same reg operand as a
6428 // no-source idiom, so we explicitly zero the input values.
6429 Result = getSBBZero(Node);
6430 }
6433 // For results narrower than 32 bits we need to extract from the 32-bit node.
6433 if (VT == MVT::i8 || VT == MVT::i16) {
6434 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6435 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6436 }
6438 ReplaceUses(SDValue(Node, 0), Result);
6439 CurDAG->RemoveDeadNode(Node);
6440 return;
6441 }
6442 case X86ISD::SBB: {
6443 if (isNullConstant(Node->getOperand(0)) &&
6444 isNullConstant(Node->getOperand(1))) {
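// An SBB whose explicit inputs are both zero computes 0 - 0 - CF, i.e. 0 or
// -1, so it can be emitted with the zero-idiom SBB helper; the flag and
// result uses are then rewired to the new node.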
6445 SDValue Result = getSBBZero(Node);
6447 // Replace the flag use.
6448 ReplaceUses(SDValue(Node, 1), Result.getValue(1));
6450 // Replace the result use.
6451 if (!SDValue(Node, 0).use_empty()) {
6452 // For results narrower than 32 bits we need to extract from the 32-bit node.
6453 MVT VT = Node->getSimpleValueType(0);
6454 if (VT == MVT::i8 || VT == MVT::i16) {
6455 int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
6456 Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
6457 }
6458 ReplaceUses(SDValue(Node, 0), Result);
6459 }
6461 CurDAG->RemoveDeadNode(Node);
6462 return;
6463 }
6464 break;
6465 }
6466 case X86ISD::MGATHER: {
6467 auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
6468 SDValue IndexOp = Mgt->getIndex();
6469 SDValue Mask = Mgt->getMask();
6470 MVT IndexVT = IndexOp.getSimpleValueType();
6471 MVT ValueVT = Node->getSimpleValueType(0);
6472 MVT MaskVT = Mask.getSimpleValueType();
6474 // This is just to prevent crashes if the nodes are malformed somehow. We
6475 // otherwise only do loose type checking here, based on what a type
6476 // constraint would say, just like table-based isel.
6477 if (!ValueVT.isVector() || !MaskVT.isVector())
6478 break;
6480 unsigned NumElts = ValueVT.getVectorNumElements();
6481 MVT ValueSVT = ValueVT.getVectorElementType();
6483 bool IsFP = ValueSVT.isFloatingPoint();
6484 unsigned EltSize = ValueSVT.getSizeInBits();
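// Pick the gather opcode from the index element width (D = 32-bit indices,
// Q = 64-bit indices), the data element type (PS/PD for floating point,
// DD/DQ/QD/QQ for integer), and the vector width. The Z/Z128/Z256 forms are
// the AVX-512 gathers, selected when the mask is a vXi1 type; otherwise the
// AVX2 forms, whose mask has the same layout as the data, are used.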
6486 unsigned Opc = 0;
6487 bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
6488 if (AVX512Gather) {
6489 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6490 Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
6491 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6492 Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
6493 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6494 Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
6495 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6496 Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
6497 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6498 Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
6499 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6500 Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
6501 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6502 Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
6503 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6504 Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
6505 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6506 Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
6507 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6508 Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
6509 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6510 Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
6511 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6512 Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
6513 } else {
6514 assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
6515 "Unexpected mask VT!");
6516 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6517 Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
6518 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6519 Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
6520 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6521 Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
6522 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6523 Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
6524 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6525 Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
6526 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6527 Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
6528 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6529 Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
6530 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6531 Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
6532 }
6534 if (!Opc)
6535 break;
6537 SDValue Base, Scale, Index, Disp, Segment;
6538 if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
6539 Base, Scale, Index, Disp, Segment))
6540 break;
6542 SDValue PassThru = Mgt->getPassThru();
6543 SDValue Chain = Mgt->getChain();
6544 // Gather instructions have a mask output that is not present in the ISD node.
6545 SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
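// The AVX-512 gathers take the mask right after the pass-through operand,
// while the AVX2 gathers take it after the memory operands, so the operand
// lists are built separately.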
6547 MachineSDNode *NewNode;
6548 if (AVX512Gather) {
6549 SDValue Ops[] = {PassThru, Mask, Base, Scale,
6550 Index, Disp, Segment, Chain};
6551 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6552 } else {
6553 SDValue Ops[] = {PassThru, Base, Scale, Index,
6554 Disp, Segment, Mask, Chain};
6555 NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6556 }
6557 CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
6558 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
6559 ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
6560 CurDAG->RemoveDeadNode(Node);
6561 return;
6562 }
6563 case X86ISD::MSCATTER: {
6564 auto *Sc = cast<X86MaskedScatterSDNode>(Node);
6565 SDValue Value = Sc->getValue();
6566 SDValue IndexOp = Sc->getIndex();
6567 MVT IndexVT = IndexOp.getSimpleValueType();
6568 MVT ValueVT = Value.getSimpleValueType();
6570 // This is just to prevent crashes if the nodes are malformed somehow. We
6571 // otherwise only do loose type checking here, based on what a type
6572 // constraint would say, just like table-based isel.
6573 if (!ValueVT.isVector())
6574 break;
6576 unsigned NumElts = ValueVT.getVectorNumElements();
6577 MVT ValueSVT = ValueVT.getVectorElementType();
6579 bool IsFP = ValueSVT.isFloatingPoint();
6580 unsigned EltSize = ValueSVT.getSizeInBits();
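// Scatters exist only in AVX-512 form, so unlike the gather case above there
// is no AVX2 fallback; the opcode is chosen purely from the index type,
// element size, and element count.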
6582 unsigned Opc;
6583 if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
6584 Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
6585 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
6586 Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
6587 else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
6588 Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
6589 else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
6590 Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
6591 else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
6592 Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
6593 else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
6594 Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
6595 else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
6596 Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
6597 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
6598 Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
6599 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
6600 Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
6601 else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
6602 Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
6603 else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
6604 Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
6605 else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
6606 Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
6607 else
6608 break;
6610 SDValue Base, Scale, Index, Disp, Segment;
6611 if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
6612 Base, Scale, Index, Disp, Segment))
6613 break;
6615 SDValue Mask = Sc->getMask();
6616 SDValue Chain = Sc->getChain();
6617 // Scatter instructions have a mask output that is not present in the ISD node.
6618 SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
6619 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
6621 MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
6622 CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
6623 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
6624 CurDAG->RemoveDeadNode(Node);
6625 return;
6626 }
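// The PREALLOCATED_SETUP and PREALLOCATED_ARG nodes identify their call site
// by an IR value; look up the numeric id X86MachineFunctionInfo assigned to
// that call site and emit the corresponding target opcode with the id as an
// immediate operand.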
6627 case ISD::PREALLOCATED_SETUP: {
6628 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6629 auto CallId = MFI->getPreallocatedIdForCallSite(
6630 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6631 SDValue Chain = Node->getOperand(0);
6632 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6633 MachineSDNode *New = CurDAG->getMachineNode(
6634 TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
6635 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
6636 CurDAG->RemoveDeadNode(Node);
6637 return;
6638 }
6639 case ISD::PREALLOCATED_ARG: {
6640 auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
6641 auto CallId = MFI->getPreallocatedIdForCallSite(
6642 cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
6643 SDValue Chain = Node->getOperand(0);
6644 SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
6645 SDValue ArgIndex = Node->getOperand(2);
6646 SDValue Ops[3];
6647 Ops[0] = CallIdValue;
6648 Ops[1] = ArgIndex;
6649 Ops[2] = Chain;
6650 MachineSDNode *New = CurDAG->getMachineNode(
6651 TargetOpcode::PREALLOCATED_ARG, dl,
6652 CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
6653 MVT::Other),
6654 Ops);
6655 ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
6656 ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
6657 CurDAG->RemoveDeadNode(Node);
6658 return;
6659 }
6660 case X86ISD::AESENCWIDE128KL:
6661 case X86ISD::AESDECWIDE128KL:
6662 case X86ISD::AESENCWIDE256KL:
6663 case X86ISD::AESDECWIDE256KL: {
6664 if (!Subtarget->hasWIDEKL())
6665 break;
6667 unsigned Opcode;
6668 switch (Node->getOpcode()) {
6669 default:
6670 llvm_unreachable("Unexpected opcode!");
6671 case X86ISD::AESENCWIDE128KL:
6672 Opcode = X86::AESENCWIDE128KL;
6673 break;
6674 case X86ISD::AESDECWIDE128KL:
6675 Opcode = X86::AESDECWIDE128KL;
6676 break;
6677 case X86ISD::AESENCWIDE256KL:
6678 Opcode = X86::AESENCWIDE256KL;
6679 break;
6680 case X86ISD::AESDECWIDE256KL:
6681 Opcode = X86::AESDECWIDE256KL;
6682 break;
6683 }
6685 SDValue Chain = Node->getOperand(0);
6686 SDValue Addr = Node->getOperand(1);
6688 SDValue Base, Scale, Index, Disp, Segment;
6689 if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
6690 break;
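// The wide Key Locker instructions implicitly operate on XMM0-XMM7, so the
// eight data operands are copied into those physical registers and glued
// together before the memory-form instruction is emitted.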
6692 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
6693 SDValue());
6694 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
6695 Chain.getValue(1));
6696 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
6697 Chain.getValue(1));
6698 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
6699 Chain.getValue(1));
6700 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
6701 Chain.getValue(1));
6702 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
6703 Chain.getValue(1));
6704 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
6705 Chain.getValue(1));
6706 Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
6707 Chain.getValue(1));
6709 MachineSDNode *Res = CurDAG->getMachineNode(
6710 Opcode, dl, Node->getVTList(),
6711 {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
6712 CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
6713 ReplaceNode(Node, Res);
6714 return;
6715 }
6716 }
6718 SelectCode(Node);
6719 }
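// Lower an inline-asm memory operand: each supported memory constraint is
// run through the normal address matcher and expanded into the usual five
// address operands. Returning true reports that the operand could not be
// selected.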
6721 bool X86DAGToDAGISel::SelectInlineAsmMemoryOperand(
6722 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
6723 std::vector<SDValue> &OutOps) {
6724 SDValue Op0, Op1, Op2, Op3, Op4;
6725 switch (ConstraintID) {
6726 default:
6727 llvm_unreachable("Unexpected asm memory constraint");
6728 case InlineAsm::ConstraintCode::o: // offsetable ??
6729 case InlineAsm::ConstraintCode::v: // not offsetable ??
6730 case InlineAsm::ConstraintCode::m: // memory
6731 case InlineAsm::ConstraintCode::X:
6732 case InlineAsm::ConstraintCode::p: // address
6733 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
6734 return true;
6735 break;
6736 }
6738 OutOps.push_back(Op0);
6739 OutOps.push_back(Op1);
6740 OutOps.push_back(Op2);
6741 OutOps.push_back(Op3);
6742 OutOps.push_back(Op4);
6743 return false;
6744 }
6746 X86ISelDAGToDAGPass::X86ISelDAGToDAGPass(X86TargetMachine &TM)
6747 : SelectionDAGISelPass(
6748 std::make_unique<X86DAGToDAGISel>(TM, TM.getOptLevel())) {}
6750 /// This pass converts a legalized DAG into an X86-specific DAG,
6751 /// ready for instruction scheduling.
6752 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
6753 CodeGenOptLevel OptLevel) {
6754 return new X86DAGToDAGISelLegacy(TM, OptLevel);
6755 }