1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/BinaryFormat/Dwarf.h"
24 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/Utils.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstr.h"
35 #include "llvm/CodeGen/MachineInstrBuilder.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/TargetOpcodes.h"
40 #include "llvm/CodeGen/TargetRegisterInfo.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
48 #include "llvm/Support/raw_ostream.h"
49 #include <optional>
51 #define DEBUG_TYPE "aarch64-isel"
53 using namespace llvm;
54 using namespace MIPatternMatch;
55 using namespace AArch64GISelUtils;
57 namespace llvm {
58 class BlockFrequencyInfo;
59 class ProfileSummaryInfo;
62 namespace {
64 #define GET_GLOBALISEL_PREDICATE_BITSET
65 #include "AArch64GenGlobalISel.inc"
66 #undef GET_GLOBALISEL_PREDICATE_BITSET
69 class AArch64InstructionSelector : public InstructionSelector {
70 public:
71 AArch64InstructionSelector(const AArch64TargetMachine &TM,
72 const AArch64Subtarget &STI,
73 const AArch64RegisterBankInfo &RBI);
75 bool select(MachineInstr &I) override;
76 static const char *getName() { return DEBUG_TYPE; }
78 void setupMF(MachineFunction &MF, GISelKnownBits *KB,
79 CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
80 BlockFrequencyInfo *BFI) override {
81 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
82 MIB.setMF(MF);
84 // hasFnAttribute() is expensive to call on every BRCOND selection, so
85 // cache it here for each run of the selector.
86 ProduceNonFlagSettingCondBr =
87 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
88 MFReturnAddr = Register();
90 processPHIs(MF);
93 private:
94 /// tblgen-erated 'select' implementation, used as the initial selector for
95 /// the patterns that don't require complex C++.
96 bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
98 // A lowering phase that runs before any selection attempts.
99 // Returns true if the instruction was modified.
100 bool preISelLower(MachineInstr &I);
102 // An early selection function that runs before the selectImpl() call.
103 bool earlySelect(MachineInstr &I);
105 /// Save state that is shared between select calls, call select on \p I and
106 /// then restore the saved state. This can be used to recursively call select
107 /// within a select call.
108 bool selectAndRestoreState(MachineInstr &I);
110 // Do some preprocessing of G_PHIs before we begin selection.
111 void processPHIs(MachineFunction &MF);
113 bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
115 /// Eliminate same-sized cross-bank copies into stores before selectImpl().
116 bool contractCrossBankCopyIntoStore(MachineInstr &I,
117 MachineRegisterInfo &MRI);
119 bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
121 bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
122 MachineRegisterInfo &MRI) const;
123 bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
124 MachineRegisterInfo &MRI) const;
126 ///@{
127 /// Helper functions for selectCompareBranch.
128 bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
129 MachineIRBuilder &MIB) const;
130 bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
131 MachineIRBuilder &MIB) const;
132 bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
133 MachineIRBuilder &MIB) const;
134 bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
135 MachineBasicBlock *DstMBB,
136 MachineIRBuilder &MIB) const;
137 ///@}
139 bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
140 MachineRegisterInfo &MRI);
142 bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
143 bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
145 // Helper to generate an equivalent of scalar_to_vector into a new register,
146 // returned via 'Dst'.
147 MachineInstr *emitScalarToVector(unsigned EltSize,
148 const TargetRegisterClass *DstRC,
149 Register Scalar,
150 MachineIRBuilder &MIRBuilder) const;
151 /// Helper to narrow vector that was widened by emitScalarToVector.
152 /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
153 /// vector, respectively.
154 MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
155 MachineIRBuilder &MIRBuilder,
156 MachineRegisterInfo &MRI) const;
158 /// Emit a lane insert into \p DstReg, or a new vector register if
159 /// std::nullopt is provided.
161 /// The lane inserted into is defined by \p LaneIdx. The vector source
162 /// register is given by \p SrcReg. The register containing the element is
163 /// given by \p EltReg.
164 MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
165 Register EltReg, unsigned LaneIdx,
166 const RegisterBank &RB,
167 MachineIRBuilder &MIRBuilder) const;
169 /// Emit a sequence of instructions representing a constant \p CV for a
170 /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
172 /// \returns the last instruction in the sequence on success, and nullptr
173 /// otherwise.
174 MachineInstr *emitConstantVector(Register Dst, Constant *CV,
175 MachineIRBuilder &MIRBuilder,
176 MachineRegisterInfo &MRI);
178 MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
179 MachineIRBuilder &MIRBuilder);
181 MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
182 MachineIRBuilder &MIRBuilder, bool Inv);
184 MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
185 MachineIRBuilder &MIRBuilder, bool Inv);
186 MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
187 MachineIRBuilder &MIRBuilder);
188 MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
189 MachineIRBuilder &MIRBuilder, bool Inv);
190 MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
191 MachineIRBuilder &MIRBuilder);
193 bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
194 MachineRegisterInfo &MRI);
195 /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
196 /// SUBREG_TO_REG.
197 bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
198 bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
199 bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
200 bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
202 bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
203 bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
204 bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
205 bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
207 /// Helper function to select vector load intrinsics like
208 /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
209 /// \p Opc is the opcode that the selected instruction should use.
210 /// \p NumVecs is the number of vector destinations for the instruction.
211 /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
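///
/// For example (illustrative only), an @llvm.aarch64.neon.ld2 of <4 x i32>
/// can be selected with:
/// \code
///   selectVectorLoadIntrinsic(AArch64::LD2Twov4s, /*NumVecs=*/2, I);
/// \endcode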
212 bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
213 MachineInstr &I);
214 bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
215 MachineInstr &I);
216 void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
217 unsigned Opc);
218 bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
219 unsigned Opc);
220 bool selectIntrinsicWithSideEffects(MachineInstr &I,
221 MachineRegisterInfo &MRI);
222 bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
223 bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
224 bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
225 bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
226 bool selectPtrAuthGlobalValue(MachineInstr &I,
227 MachineRegisterInfo &MRI) const;
228 bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
229 bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
230 bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
231 void SelectTable(MachineInstr &I, MachineRegisterInfo &MRI, unsigned NumVecs,
232 unsigned Opc1, unsigned Opc2, bool isExt);
234 bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
235 bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
236 bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
238 unsigned emitConstantPoolEntry(const Constant *CPVal,
239 MachineFunction &MF) const;
240 MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
241 MachineIRBuilder &MIRBuilder) const;
243 // Emit a vector concat operation.
244 MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
245 Register Op2,
246 MachineIRBuilder &MIRBuilder) const;
248 // Emit an integer compare between LHS and RHS, which checks for Predicate.
249 MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
250 MachineOperand &Predicate,
251 MachineIRBuilder &MIRBuilder) const;
253 /// Emit a floating point comparison between \p LHS and \p RHS.
254 /// \p Pred if given is the intended predicate to use.
255 MachineInstr *
256 emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
257 std::optional<CmpInst::Predicate> = std::nullopt) const;
259 MachineInstr *
260 emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
261 std::initializer_list<llvm::SrcOp> SrcOps,
262 MachineIRBuilder &MIRBuilder,
263 const ComplexRendererFns &RenderFns = std::nullopt) const;
264 /// Helper function to emit an add or sub instruction.
267 /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants below
267 /// in a specific order.
269 /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
271 /// \code
272 /// const std::array<std::array<unsigned, 2>, 5> Table {
273 /// {{AArch64::ADDXri, AArch64::ADDWri},
274 /// {AArch64::ADDXrs, AArch64::ADDWrs},
275 /// {AArch64::ADDXrr, AArch64::ADDWrr},
276 /// {AArch64::SUBXri, AArch64::SUBWri},
277 /// {AArch64::ADDXrx, AArch64::ADDWrx}}};
278 /// \endcode
280 /// Each row in the table corresponds to a different addressing mode. Each
281 /// column corresponds to a different register size.
283 /// \attention Rows must be structured as follows:
284 /// - Row 0: The ri opcode variants
285 /// - Row 1: The rs opcode variants
286 /// - Row 2: The rr opcode variants
287 /// - Row 3: The ri opcode variants for negative immediates
288 /// - Row 4: The rx opcode variants
290 /// \attention Columns must be structured as follows:
291 /// - Column 0: The 64-bit opcode variants
292 /// - Column 1: The 32-bit opcode variants
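///
/// For example, with the table above, AddrModeAndSizeToOpcode[0][1] is the
/// 32-bit ri variant (AArch64::ADDWri).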
294 /// \p Dst is the destination register of the binop to emit.
295 /// \p LHS is the left-hand operand of the binop to emit.
296 /// \p RHS is the right-hand operand of the binop to emit.
297 MachineInstr *emitAddSub(
298 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
299 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
300 MachineIRBuilder &MIRBuilder) const;
301 MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
302 MachineOperand &RHS,
303 MachineIRBuilder &MIRBuilder) const;
304 MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
305 MachineIRBuilder &MIRBuilder) const;
306 MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
307 MachineIRBuilder &MIRBuilder) const;
308 MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
309 MachineIRBuilder &MIRBuilder) const;
310 MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
311 MachineIRBuilder &MIRBuilder) const;
312 MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
313 MachineIRBuilder &MIRBuilder) const;
314 MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
315 MachineIRBuilder &MIRBuilder) const;
316 MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
317 AArch64CC::CondCode CC,
318 MachineIRBuilder &MIRBuilder) const;
319 MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
320 const RegisterBank &DstRB, LLT ScalarTy,
321 Register VecReg, unsigned LaneIdx,
322 MachineIRBuilder &MIRBuilder) const;
323 MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
324 AArch64CC::CondCode Pred,
325 MachineIRBuilder &MIRBuilder) const;
326 /// Emit a CSet for a FP compare.
328 /// \p Dst is expected to be a 32-bit scalar register.
329 MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
330 MachineIRBuilder &MIRBuilder) const;
332 /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
333 /// Might elide the instruction if the previous instruction already sets NZCV
334 /// correctly.
335 MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);
337 /// Emit the overflow op for \p Opcode.
339 /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
340 /// G_USUBO, etc.
341 std::pair<MachineInstr *, AArch64CC::CondCode>
342 emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
343 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
345 bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);
347 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
348 /// In some cases this is even possible with OR operations in the expression.
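///
/// Illustrative sketch only: a conjunction such as
/// \code
///   %c = G_AND (G_ICMP slt, %a, %b), (G_ICMP eq, %x, %y)
/// \endcode
/// can be emitted as a compare feeding a CCMP, with \p OutCC set to the
/// condition to test on the final flags.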
349 MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
350 MachineIRBuilder &MIB) const;
351 MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
352 CmpInst::Predicate CC,
353 AArch64CC::CondCode Predicate,
354 AArch64CC::CondCode OutCC,
355 MachineIRBuilder &MIB) const;
356 MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
357 bool Negate, Register CCOp,
358 AArch64CC::CondCode Predicate,
359 MachineIRBuilder &MIB) const;
361 /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
362 /// \p IsNegative is true if the test should be "not zero".
363 /// This will also optimize the test bit instruction when possible.
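///
/// For example, testing bit 5 of (G_SHL %x, 3) can instead test bit 2 of %x
/// directly (see getTestBitReg).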
364 MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
365 MachineBasicBlock *DstMBB,
366 MachineIRBuilder &MIB) const;
368 /// Emit a CB(N)Z instruction which branches to \p DestMBB.
369 MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
370 MachineBasicBlock *DestMBB,
371 MachineIRBuilder &MIB) const;
373 // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
374 // We use these manually instead of using the importer since it doesn't
375 // support SDNodeXForm.
376 ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
377 ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
378 ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
379 ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
381 ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
382 ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
383 ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
385 ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
386 unsigned Size) const;
388 ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
389 return selectAddrModeUnscaled(Root, 1);
391 ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
392 return selectAddrModeUnscaled(Root, 2);
394 ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
395 return selectAddrModeUnscaled(Root, 4);
397 ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
398 return selectAddrModeUnscaled(Root, 8);
400 ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
401 return selectAddrModeUnscaled(Root, 16);
404 /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
405 /// from complex pattern matchers like selectAddrModeIndexed().
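///
/// A minimal sketch (exact operand flags elided): a load whose address is
/// `G_ADD_LOW (ADRP @g), @g` can keep the ADRP result as its base register
/// and render @g as the scaled :lo12: immediate operand instead of emitting
/// the add.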
406 ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
407 MachineRegisterInfo &MRI) const;
409 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
410 unsigned Size) const;
411 template <int Width>
412 ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
413 return selectAddrModeIndexed(Root, Width / 8);
416 std::optional<bool>
417 isWorthFoldingIntoAddrMode(MachineInstr &MI,
418 const MachineRegisterInfo &MRI) const;
420 bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
421 const MachineRegisterInfo &MRI,
422 bool IsAddrOperand) const;
423 ComplexRendererFns
424 selectAddrModeShiftedExtendXReg(MachineOperand &Root,
425 unsigned SizeInBytes) const;
427 /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
428 /// or not a shift + extend should be folded into an addressing mode. Returns
429 /// std::nullopt when this is not profitable or possible.
430 ComplexRendererFns
431 selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
432 MachineOperand &Offset, unsigned SizeInBytes,
433 bool WantsExt) const;
434 ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
435 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
436 unsigned SizeInBytes) const;
437 template <int Width>
438 ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
439 return selectAddrModeXRO(Root, Width / 8);
442 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
443 unsigned SizeInBytes) const;
444 template <int Width>
445 ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
446 return selectAddrModeWRO(Root, Width / 8);
449 ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
450 bool AllowROR = false) const;
452 ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
453 return selectShiftedRegister(Root);
456 ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
457 return selectShiftedRegister(Root, true);
460 /// Given an extend instruction, determine the correct shift-extend type for
461 /// that instruction.
463 /// If the instruction is going to be used in a load or store, pass
464 /// \p IsLoadStore = true.
465 AArch64_AM::ShiftExtendType
466 getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
467 bool IsLoadStore = false) const;
469 /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
471 /// \returns Either \p Reg if no change was necessary, or the new register
472 /// created by moving \p Reg.
474 /// Note: This uses emitCopy right now.
475 Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
476 MachineIRBuilder &MIB) const;
478 ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
480 ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;
482 void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
483 int OpIdx = -1) const;
484 void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
485 int OpIdx = -1) const;
486 void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
487 int OpIdx = -1) const;
488 void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
489 int OpIdx) const;
490 void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
491 int OpIdx = -1) const;
492 void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
493 int OpIdx = -1) const;
494 void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
495 int OpIdx = -1) const;
496 void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
497 const MachineInstr &MI,
498 int OpIdx = -1) const;
500 // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
501 void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
503 // Optimization methods.
504 bool tryOptSelect(GSelect &Sel);
505 bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
506 MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
507 MachineOperand &Predicate,
508 MachineIRBuilder &MIRBuilder) const;
510 /// Return true if \p MI is a load or store of \p NumBytes bytes.
511 bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
513 /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
514 /// register zeroed out. In other words, the result of MI has been explicitly
515 /// zero extended.
516 bool isDef32(const MachineInstr &MI) const;
518 const AArch64TargetMachine &TM;
519 const AArch64Subtarget &STI;
520 const AArch64InstrInfo &TII;
521 const AArch64RegisterInfo &TRI;
522 const AArch64RegisterBankInfo &RBI;
524 bool ProduceNonFlagSettingCondBr = false;
526 // Some cached values used during selection.
527 // We use LR as a live-in register, and we keep track of it here as it can be
528 // clobbered by calls.
529 Register MFReturnAddr;
531 MachineIRBuilder MIB;
533 #define GET_GLOBALISEL_PREDICATES_DECL
534 #include "AArch64GenGlobalISel.inc"
535 #undef GET_GLOBALISEL_PREDICATES_DECL
537 // We declare the temporaries used by selectImpl() in the class to minimize the
538 // cost of constructing placeholder values.
539 #define GET_GLOBALISEL_TEMPORARIES_DECL
540 #include "AArch64GenGlobalISel.inc"
541 #undef GET_GLOBALISEL_TEMPORARIES_DECL
544 } // end anonymous namespace
546 #define GET_GLOBALISEL_IMPL
547 #include "AArch64GenGlobalISel.inc"
548 #undef GET_GLOBALISEL_IMPL
550 AArch64InstructionSelector::AArch64InstructionSelector(
551 const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
552 const AArch64RegisterBankInfo &RBI)
553 : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
554 RBI(RBI),
555 #define GET_GLOBALISEL_PREDICATES_INIT
556 #include "AArch64GenGlobalISel.inc"
557 #undef GET_GLOBALISEL_PREDICATES_INIT
558 #define GET_GLOBALISEL_TEMPORARIES_INIT
559 #include "AArch64GenGlobalISel.inc"
560 #undef GET_GLOBALISEL_TEMPORARIES_INIT
564 // FIXME: This should be target-independent, inferred from the types declared
565 // for each class in the bank.
567 /// Given a register bank, and a type, return the smallest register class that
568 /// can represent that combination.
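/// For example, a 64-bit scalar on the GPR bank maps to GPR64 (GPR64all when
/// \p GetAllRegSet is true), and a 128-bit type on the FPR bank maps to
/// FPR128.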
569 static const TargetRegisterClass *
570 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
571 bool GetAllRegSet = false) {
572 if (RB.getID() == AArch64::GPRRegBankID) {
573 if (Ty.getSizeInBits() <= 32)
574 return GetAllRegSet ? &AArch64::GPR32allRegClass
575 : &AArch64::GPR32RegClass;
576 if (Ty.getSizeInBits() == 64)
577 return GetAllRegSet ? &AArch64::GPR64allRegClass
578 : &AArch64::GPR64RegClass;
579 if (Ty.getSizeInBits() == 128)
580 return &AArch64::XSeqPairsClassRegClass;
581 return nullptr;
584 if (RB.getID() == AArch64::FPRRegBankID) {
585 switch (Ty.getSizeInBits()) {
586 case 8:
587 return &AArch64::FPR8RegClass;
588 case 16:
589 return &AArch64::FPR16RegClass;
590 case 32:
591 return &AArch64::FPR32RegClass;
592 case 64:
593 return &AArch64::FPR64RegClass;
594 case 128:
595 return &AArch64::FPR128RegClass;
597 return nullptr;
600 return nullptr;
603 /// Given a register bank, and size in bits, return the smallest register class
604 /// that can represent that combination.
605 static const TargetRegisterClass *
606 getMinClassForRegBank(const RegisterBank &RB, TypeSize SizeInBits,
607 bool GetAllRegSet = false) {
608 if (SizeInBits.isScalable()) {
609 assert(RB.getID() == AArch64::FPRRegBankID &&
610 "Expected FPR regbank for scalable type size");
611 return &AArch64::ZPRRegClass;
614 unsigned RegBankID = RB.getID();
616 if (RegBankID == AArch64::GPRRegBankID) {
617 assert(!SizeInBits.isScalable() && "Unexpected scalable register size");
618 if (SizeInBits <= 32)
619 return GetAllRegSet ? &AArch64::GPR32allRegClass
620 : &AArch64::GPR32RegClass;
621 if (SizeInBits == 64)
622 return GetAllRegSet ? &AArch64::GPR64allRegClass
623 : &AArch64::GPR64RegClass;
624 if (SizeInBits == 128)
625 return &AArch64::XSeqPairsClassRegClass;
628 if (RegBankID == AArch64::FPRRegBankID) {
629 if (SizeInBits.isScalable()) {
630 assert(SizeInBits == TypeSize::getScalable(128) &&
631 "Unexpected scalable register size");
632 return &AArch64::ZPRRegClass;
635 switch (SizeInBits) {
636 default:
637 return nullptr;
638 case 8:
639 return &AArch64::FPR8RegClass;
640 case 16:
641 return &AArch64::FPR16RegClass;
642 case 32:
643 return &AArch64::FPR32RegClass;
644 case 64:
645 return &AArch64::FPR64RegClass;
646 case 128:
647 return &AArch64::FPR128RegClass;
651 return nullptr;
654 /// Returns the correct subregister to use for a given register class.
655 static bool getSubRegForClass(const TargetRegisterClass *RC,
656 const TargetRegisterInfo &TRI, unsigned &SubReg) {
657 switch (TRI.getRegSizeInBits(*RC)) {
658 case 8:
659 SubReg = AArch64::bsub;
660 break;
661 case 16:
662 SubReg = AArch64::hsub;
663 break;
664 case 32:
665 if (RC != &AArch64::FPR32RegClass)
666 SubReg = AArch64::sub_32;
667 else
668 SubReg = AArch64::ssub;
669 break;
670 case 64:
671 SubReg = AArch64::dsub;
672 break;
673 default:
674 LLVM_DEBUG(
675 dbgs() << "Couldn't find appropriate subregister for register class.");
676 return false;
679 return true;
682 /// Returns the minimum size the given register bank can hold.
683 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
684 switch (RB.getID()) {
685 case AArch64::GPRRegBankID:
686 return 32;
687 case AArch64::FPRRegBankID:
688 return 8;
689 default:
690 llvm_unreachable("Tried to get minimum size for unknown register bank.");
694 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
695 /// Helper function for functions like createDTuple and createQTuple.
697 /// \p RegClassIDs - The list of register class IDs available for some tuple of
698 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
699 /// expected to contain between 2 and 4 tuple classes.
701 /// \p SubRegs - The list of subregister classes associated with each register
702 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
703 /// subregister class. The index of each subregister class is expected to
704 /// correspond with the index of each register class.
706 /// \returns Either the destination register of the REG_SEQUENCE instruction that
707 /// was created, or the 0th element of \p Regs if \p Regs contains a single
708 /// element.
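///
/// For example, passing two D registers builds a REG_SEQUENCE in the DD class
/// that pairs each input register with the dsub0 and dsub1 subregister
/// indices respectively.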
709 static Register createTuple(ArrayRef<Register> Regs,
710 const unsigned RegClassIDs[],
711 const unsigned SubRegs[], MachineIRBuilder &MIB) {
712 unsigned NumRegs = Regs.size();
713 if (NumRegs == 1)
714 return Regs[0];
715 assert(NumRegs >= 2 && NumRegs <= 4 &&
716 "Only support between two and 4 registers in a tuple!");
717 const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
718 auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
719 auto RegSequence =
720 MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
721 for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
722 RegSequence.addUse(Regs[I]);
723 RegSequence.addImm(SubRegs[I]);
725 return RegSequence.getReg(0);
728 /// Create a tuple of D-registers using the registers in \p Regs.
729 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
730 static const unsigned RegClassIDs[] = {
731 AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
732 static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
733 AArch64::dsub2, AArch64::dsub3};
734 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
737 /// Create a tuple of Q-registers using the registers in \p Regs.
738 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
739 static const unsigned RegClassIDs[] = {
740 AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
741 static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
742 AArch64::qsub2, AArch64::qsub3};
743 return createTuple(Regs, RegClassIDs, SubRegs, MIB);
746 static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
747 auto &MI = *Root.getParent();
748 auto &MBB = *MI.getParent();
749 auto &MF = *MBB.getParent();
750 auto &MRI = MF.getRegInfo();
751 uint64_t Immed;
752 if (Root.isImm())
753 Immed = Root.getImm();
754 else if (Root.isCImm())
755 Immed = Root.getCImm()->getZExtValue();
756 else if (Root.isReg()) {
757 auto ValAndVReg =
758 getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
759 if (!ValAndVReg)
760 return std::nullopt;
761 Immed = ValAndVReg->Value.getSExtValue();
762 } else
763 return std::nullopt;
764 return Immed;
767 /// Check whether \p I is a currently unsupported binary operation:
768 /// - it has an unsized type
769 /// - an operand is not a vreg
770 /// - the operands are not all in the same bank
771 /// These are checks that should someday live in the verifier, but right now,
772 /// these are mostly limitations of the aarch64 selector.
773 static bool unsupportedBinOp(const MachineInstr &I,
774 const AArch64RegisterBankInfo &RBI,
775 const MachineRegisterInfo &MRI,
776 const AArch64RegisterInfo &TRI) {
777 LLT Ty = MRI.getType(I.getOperand(0).getReg());
778 if (!Ty.isValid()) {
779 LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
780 return true;
783 const RegisterBank *PrevOpBank = nullptr;
784 for (auto &MO : I.operands()) {
785 // FIXME: Support non-register operands.
786 if (!MO.isReg()) {
787 LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
788 return true;
791 // FIXME: Can generic operations have physical registers operands? If
792 // so, this will need to be taught about that, and we'll need to get the
793 // bank out of the minimal class for the register.
794 // Either way, this needs to be documented (and possibly verified).
795 if (!MO.getReg().isVirtual()) {
796 LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
797 return true;
800 const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
801 if (!OpBank) {
802 LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
803 return true;
806 if (PrevOpBank && OpBank != PrevOpBank) {
807 LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
808 return true;
810 PrevOpBank = OpBank;
812 return false;
815 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
816 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
817 /// and of size \p OpSize.
818 /// \returns \p GenericOpc if the combination is unsupported.
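/// For example, a 64-bit G_SHL on the GPR bank selects AArch64::LSLVXr.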
819 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
820 unsigned OpSize) {
821 switch (RegBankID) {
822 case AArch64::GPRRegBankID:
823 if (OpSize == 32) {
824 switch (GenericOpc) {
825 case TargetOpcode::G_SHL:
826 return AArch64::LSLVWr;
827 case TargetOpcode::G_LSHR:
828 return AArch64::LSRVWr;
829 case TargetOpcode::G_ASHR:
830 return AArch64::ASRVWr;
831 default:
832 return GenericOpc;
834 } else if (OpSize == 64) {
835 switch (GenericOpc) {
836 case TargetOpcode::G_PTR_ADD:
837 return AArch64::ADDXrr;
838 case TargetOpcode::G_SHL:
839 return AArch64::LSLVXr;
840 case TargetOpcode::G_LSHR:
841 return AArch64::LSRVXr;
842 case TargetOpcode::G_ASHR:
843 return AArch64::ASRVXr;
844 default:
845 return GenericOpc;
848 break;
849 case AArch64::FPRRegBankID:
850 switch (OpSize) {
851 case 32:
852 switch (GenericOpc) {
853 case TargetOpcode::G_FADD:
854 return AArch64::FADDSrr;
855 case TargetOpcode::G_FSUB:
856 return AArch64::FSUBSrr;
857 case TargetOpcode::G_FMUL:
858 return AArch64::FMULSrr;
859 case TargetOpcode::G_FDIV:
860 return AArch64::FDIVSrr;
861 default:
862 return GenericOpc;
864 case 64:
865 switch (GenericOpc) {
866 case TargetOpcode::G_FADD:
867 return AArch64::FADDDrr;
868 case TargetOpcode::G_FSUB:
869 return AArch64::FSUBDrr;
870 case TargetOpcode::G_FMUL:
871 return AArch64::FMULDrr;
872 case TargetOpcode::G_FDIV:
873 return AArch64::FDIVDrr;
874 case TargetOpcode::G_OR:
875 return AArch64::ORRv8i8;
876 default:
877 return GenericOpc;
880 break;
882 return GenericOpc;
885 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
886 /// appropriate for the (value) register bank \p RegBankID and of memory access
887 /// size \p OpSize. This returns the variant with the base+unsigned-immediate
888 /// addressing mode (e.g., LDRXui).
889 /// \returns \p GenericOpc if the combination is unsupported.
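/// For example, a 32-bit G_LOAD on the GPR bank selects AArch64::LDRWui, and
/// the corresponding G_STORE selects AArch64::STRWui.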
890 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
891 unsigned OpSize) {
892 const bool isStore = GenericOpc == TargetOpcode::G_STORE;
893 switch (RegBankID) {
894 case AArch64::GPRRegBankID:
895 switch (OpSize) {
896 case 8:
897 return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
898 case 16:
899 return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
900 case 32:
901 return isStore ? AArch64::STRWui : AArch64::LDRWui;
902 case 64:
903 return isStore ? AArch64::STRXui : AArch64::LDRXui;
905 break;
906 case AArch64::FPRRegBankID:
907 switch (OpSize) {
908 case 8:
909 return isStore ? AArch64::STRBui : AArch64::LDRBui;
910 case 16:
911 return isStore ? AArch64::STRHui : AArch64::LDRHui;
912 case 32:
913 return isStore ? AArch64::STRSui : AArch64::LDRSui;
914 case 64:
915 return isStore ? AArch64::STRDui : AArch64::LDRDui;
916 case 128:
917 return isStore ? AArch64::STRQui : AArch64::LDRQui;
919 break;
921 return GenericOpc;
924 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
925 /// to \p *To.
927 /// E.g "To = COPY SrcReg:SubReg"
928 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
929 const RegisterBankInfo &RBI, Register SrcReg,
930 const TargetRegisterClass *To, unsigned SubReg) {
931 assert(SrcReg.isValid() && "Expected a valid source register?");
932 assert(To && "Destination register class cannot be null");
933 assert(SubReg && "Expected a valid subregister");
935 MachineIRBuilder MIB(I);
936 auto SubRegCopy =
937 MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
938 MachineOperand &RegOp = I.getOperand(1);
939 RegOp.setReg(SubRegCopy.getReg(0));
941 // It's possible that the destination register won't be constrained. Make
942 // sure that happens.
943 if (!I.getOperand(0).getReg().isPhysical())
944 RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
946 return true;
949 /// Helper function to get the source and destination register classes for a
950 /// copy. Returns a std::pair containing the source register class for the
951 /// copy, and the destination register class for the copy. If a register class
952 /// cannot be determined, then it will be nullptr.
953 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
954 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
955 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
956 const RegisterBankInfo &RBI) {
957 Register DstReg = I.getOperand(0).getReg();
958 Register SrcReg = I.getOperand(1).getReg();
959 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
960 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
962 TypeSize DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
963 TypeSize SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
965 // Special casing for cross-bank copies of s1s. We can technically represent
966 // a 1-bit value with any size of register. The minimum size for a GPR is 32
967 // bits. So, we need to put the FPR on 32 bits as well.
969 // FIXME: I'm not sure if this case holds true outside of copies. If it does,
970 // then we can pull it into the helpers that get the appropriate class for a
971 // register bank. Or make a new helper that carries along some constraint
972 // information.
973 if (SrcRegBank != DstRegBank &&
974 (DstSize == TypeSize::getFixed(1) && SrcSize == TypeSize::getFixed(1)))
975 SrcSize = DstSize = TypeSize::getFixed(32);
977 return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
978 getMinClassForRegBank(DstRegBank, DstSize, true)};
981 // FIXME: We need some sort of API in RBI/TRI to allow generic code to
982 // constrain operands of simple instructions given a TargetRegisterClass
983 // and LLT
984 static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
985 const RegisterBankInfo &RBI) {
986 for (MachineOperand &MO : I.operands()) {
987 if (!MO.isReg())
988 continue;
989 Register Reg = MO.getReg();
990 if (!Reg)
991 continue;
992 if (Reg.isPhysical())
993 continue;
994 LLT Ty = MRI.getType(Reg);
995 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
996 const TargetRegisterClass *RC =
997 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
998 if (!RC) {
999 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
1000 RC = getRegClassForTypeOnBank(Ty, RB);
1001 if (!RC) {
1002 LLVM_DEBUG(
1003 dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
1004 break;
1007 RBI.constrainGenericRegister(Reg, *RC, MRI);
1010 return true;
1013 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
1014 MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
1015 const RegisterBankInfo &RBI) {
1016 Register DstReg = I.getOperand(0).getReg();
1017 Register SrcReg = I.getOperand(1).getReg();
1018 const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
1019 const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
1021 // Find the correct register classes for the source and destination registers.
1022 const TargetRegisterClass *SrcRC;
1023 const TargetRegisterClass *DstRC;
1024 std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
1026 if (!DstRC) {
1027 LLVM_DEBUG(dbgs() << "Unexpected dest size "
1028 << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
1029 return false;
1032 // Is this a copy? If so, then we may need to insert a subregister copy.
1033 if (I.isCopy()) {
1034 // Yes. Check if there's anything to fix up.
1035 if (!SrcRC) {
1036 LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
1037 return false;
1040 const TypeSize SrcSize = TRI.getRegSizeInBits(*SrcRC);
1041 const TypeSize DstSize = TRI.getRegSizeInBits(*DstRC);
1042 unsigned SubReg;
1044 // If the source bank doesn't support a subregister copy small enough,
1045 // then we first need to copy to the destination bank.
1046 if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
1047 const TargetRegisterClass *DstTempRC =
1048 getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
1049 getSubRegForClass(DstRC, TRI, SubReg);
1051 MachineIRBuilder MIB(I);
1052 auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
1053 copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
1054 } else if (SrcSize > DstSize) {
1055 // If the source register is bigger than the destination we need to
1056 // perform a subregister copy.
1057 const TargetRegisterClass *SubRegRC =
1058 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1059 getSubRegForClass(SubRegRC, TRI, SubReg);
1060 copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
1061 } else if (DstSize > SrcSize) {
1062 // If the destination register is bigger than the source we need to do
1063 // a promotion using SUBREG_TO_REG.
1064 const TargetRegisterClass *PromotionRC =
1065 getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1066 getSubRegForClass(SrcRC, TRI, SubReg);
1068 Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1069 BuildMI(*I.getParent(), I, I.getDebugLoc(),
1070 TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1071 .addImm(0)
1072 .addUse(SrcReg)
1073 .addImm(SubReg);
1074 MachineOperand &RegOp = I.getOperand(1);
1075 RegOp.setReg(PromoteReg);
1078 // If the destination is a physical register, then there's nothing to
1079 // change, so we're done.
1080 if (DstReg.isPhysical())
1081 return true;
1084 // No need to constrain SrcReg. It will get constrained when we hit another
1085 // of its uses or defs. Copies do not have constraints.
1086 if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1087 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1088 << " operand\n");
1089 return false;
1092 // If this is a GPR ZEXT, we want to just reduce it down into a copy.
1093 // The sizes will be mismatched with the source < 32b but that's ok.
1094 if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1095 I.setDesc(TII.get(AArch64::COPY));
1096 assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1097 return selectCopy(I, TII, MRI, TRI, RBI);
1100 I.setDesc(TII.get(AArch64::COPY));
1101 return true;
1104 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1105 if (!DstTy.isScalar() || !SrcTy.isScalar())
1106 return GenericOpc;
1108 const unsigned DstSize = DstTy.getSizeInBits();
1109 const unsigned SrcSize = SrcTy.getSizeInBits();
1111 switch (DstSize) {
1112 case 32:
1113 switch (SrcSize) {
1114 case 32:
1115 switch (GenericOpc) {
1116 case TargetOpcode::G_SITOFP:
1117 return AArch64::SCVTFUWSri;
1118 case TargetOpcode::G_UITOFP:
1119 return AArch64::UCVTFUWSri;
1120 case TargetOpcode::G_FPTOSI:
1121 return AArch64::FCVTZSUWSr;
1122 case TargetOpcode::G_FPTOUI:
1123 return AArch64::FCVTZUUWSr;
1124 default:
1125 return GenericOpc;
1127 case 64:
1128 switch (GenericOpc) {
1129 case TargetOpcode::G_SITOFP:
1130 return AArch64::SCVTFUXSri;
1131 case TargetOpcode::G_UITOFP:
1132 return AArch64::UCVTFUXSri;
1133 case TargetOpcode::G_FPTOSI:
1134 return AArch64::FCVTZSUWDr;
1135 case TargetOpcode::G_FPTOUI:
1136 return AArch64::FCVTZUUWDr;
1137 default:
1138 return GenericOpc;
1140 default:
1141 return GenericOpc;
1143 case 64:
1144 switch (SrcSize) {
1145 case 32:
1146 switch (GenericOpc) {
1147 case TargetOpcode::G_SITOFP:
1148 return AArch64::SCVTFUWDri;
1149 case TargetOpcode::G_UITOFP:
1150 return AArch64::UCVTFUWDri;
1151 case TargetOpcode::G_FPTOSI:
1152 return AArch64::FCVTZSUXSr;
1153 case TargetOpcode::G_FPTOUI:
1154 return AArch64::FCVTZUUXSr;
1155 default:
1156 return GenericOpc;
1158 case 64:
1159 switch (GenericOpc) {
1160 case TargetOpcode::G_SITOFP:
1161 return AArch64::SCVTFUXDri;
1162 case TargetOpcode::G_UITOFP:
1163 return AArch64::UCVTFUXDri;
1164 case TargetOpcode::G_FPTOSI:
1165 return AArch64::FCVTZSUXDr;
1166 case TargetOpcode::G_FPTOUI:
1167 return AArch64::FCVTZUUXDr;
1168 default:
1169 return GenericOpc;
1171 default:
1172 return GenericOpc;
1174 default:
1175 return GenericOpc;
1177 return GenericOpc;
1180 MachineInstr *
1181 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1182 Register False, AArch64CC::CondCode CC,
1183 MachineIRBuilder &MIB) const {
1184 MachineRegisterInfo &MRI = *MIB.getMRI();
1185 assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1186 RBI.getRegBank(True, MRI, TRI)->getID() &&
1187 "Expected both select operands to have the same regbank?");
1188 LLT Ty = MRI.getType(True);
1189 if (Ty.isVector())
1190 return nullptr;
1191 const unsigned Size = Ty.getSizeInBits();
1192 assert((Size == 32 || Size == 64) &&
1193 "Expected 32 bit or 64 bit select only?");
1194 const bool Is32Bit = Size == 32;
1195 if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1196 unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1197 auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1198 constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1199 return &*FCSel;
1202 // By default, we'll try and emit a CSEL.
1203 unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1204 bool Optimized = false;
1205 auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1206 &Optimized](Register &Reg, Register &OtherReg,
1207 bool Invert) {
1208 if (Optimized)
1209 return false;
1211 // Attempt to fold:
1213 // %sub = G_SUB 0, %x
1214 // %select = G_SELECT cc, %reg, %sub
1216 // Into:
1217 // %select = CSNEG %reg, %x, cc
1218 Register MatchReg;
1219 if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1220 Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1221 Reg = MatchReg;
1222 if (Invert) {
1223 CC = AArch64CC::getInvertedCondCode(CC);
1224 std::swap(Reg, OtherReg);
1226 return true;
1229 // Attempt to fold:
1231 // %xor = G_XOR %x, -1
1232 // %select = G_SELECT cc, %reg, %xor
1234 // Into:
1235 // %select = CSINV %reg, %x, cc
1236 if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1237 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1238 Reg = MatchReg;
1239 if (Invert) {
1240 CC = AArch64CC::getInvertedCondCode(CC);
1241 std::swap(Reg, OtherReg);
1243 return true;
1246 // Attempt to fold:
1248 // %add = G_ADD %x, 1
1249 // %select = G_SELECT cc, %reg, %add
1251 // Into:
1252 // %select = CSINC %reg, %x, cc
1253 if (mi_match(Reg, MRI,
1254 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1255 m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1256 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1257 Reg = MatchReg;
1258 if (Invert) {
1259 CC = AArch64CC::getInvertedCondCode(CC);
1260 std::swap(Reg, OtherReg);
1262 return true;
1265 return false;
1268 // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1269 // true/false values are constants.
1270 // FIXME: All of these patterns already exist in tablegen. We should be
1271 // able to import these.
1272 auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1273 &Optimized]() {
1274 if (Optimized)
1275 return false;
1276 auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1277 auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1278 if (!TrueCst && !FalseCst)
1279 return false;
1281 Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1282 if (TrueCst && FalseCst) {
1283 int64_t T = TrueCst->Value.getSExtValue();
1284 int64_t F = FalseCst->Value.getSExtValue();
1286 if (T == 0 && F == 1) {
1287 // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1288 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1289 True = ZReg;
1290 False = ZReg;
1291 return true;
1294 if (T == 0 && F == -1) {
1295 // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
1296 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1297 True = ZReg;
1298 False = ZReg;
1299 return true;
1303 if (TrueCst) {
1304 int64_t T = TrueCst->Value.getSExtValue();
1305 if (T == 1) {
1306 // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1307 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1308 True = False;
1309 False = ZReg;
1310 CC = AArch64CC::getInvertedCondCode(CC);
1311 return true;
1314 if (T == -1) {
1315 // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1316 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1317 True = False;
1318 False = ZReg;
1319 CC = AArch64CC::getInvertedCondCode(CC);
1320 return true;
1324 if (FalseCst) {
1325 int64_t F = FalseCst->Value.getSExtValue();
1326 if (F == 1) {
1327 // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1328 Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1329 False = ZReg;
1330 return true;
1333 if (F == -1) {
1334 // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
1335 Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1336 False = ZReg;
1337 return true;
1340 return false;
1343 Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1344 Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1345 Optimized |= TryOptSelectCst();
1346 auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1347 constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1348 return &*SelectInst;
1351 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1352 switch (P) {
1353 default:
1354 llvm_unreachable("Unknown condition code!");
1355 case CmpInst::ICMP_NE:
1356 return AArch64CC::NE;
1357 case CmpInst::ICMP_EQ:
1358 return AArch64CC::EQ;
1359 case CmpInst::ICMP_SGT:
1360 return AArch64CC::GT;
1361 case CmpInst::ICMP_SGE:
1362 return AArch64CC::GE;
1363 case CmpInst::ICMP_SLT:
1364 return AArch64CC::LT;
1365 case CmpInst::ICMP_SLE:
1366 return AArch64CC::LE;
1367 case CmpInst::ICMP_UGT:
1368 return AArch64CC::HI;
1369 case CmpInst::ICMP_UGE:
1370 return AArch64CC::HS;
1371 case CmpInst::ICMP_ULT:
1372 return AArch64CC::LO;
1373 case CmpInst::ICMP_ULE:
1374 return AArch64CC::LS;
1378 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1379 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1380 AArch64CC::CondCode &CondCode,
1381 AArch64CC::CondCode &CondCode2) {
1382 CondCode2 = AArch64CC::AL;
1383 switch (CC) {
1384 default:
1385 llvm_unreachable("Unknown FP condition!");
1386 case CmpInst::FCMP_OEQ:
1387 CondCode = AArch64CC::EQ;
1388 break;
1389 case CmpInst::FCMP_OGT:
1390 CondCode = AArch64CC::GT;
1391 break;
1392 case CmpInst::FCMP_OGE:
1393 CondCode = AArch64CC::GE;
1394 break;
1395 case CmpInst::FCMP_OLT:
1396 CondCode = AArch64CC::MI;
1397 break;
1398 case CmpInst::FCMP_OLE:
1399 CondCode = AArch64CC::LS;
1400 break;
1401 case CmpInst::FCMP_ONE:
1402 CondCode = AArch64CC::MI;
1403 CondCode2 = AArch64CC::GT;
1404 break;
1405 case CmpInst::FCMP_ORD:
1406 CondCode = AArch64CC::VC;
1407 break;
1408 case CmpInst::FCMP_UNO:
1409 CondCode = AArch64CC::VS;
1410 break;
1411 case CmpInst::FCMP_UEQ:
1412 CondCode = AArch64CC::EQ;
1413 CondCode2 = AArch64CC::VS;
1414 break;
1415 case CmpInst::FCMP_UGT:
1416 CondCode = AArch64CC::HI;
1417 break;
1418 case CmpInst::FCMP_UGE:
1419 CondCode = AArch64CC::PL;
1420 break;
1421 case CmpInst::FCMP_ULT:
1422 CondCode = AArch64CC::LT;
1423 break;
1424 case CmpInst::FCMP_ULE:
1425 CondCode = AArch64CC::LE;
1426 break;
1427 case CmpInst::FCMP_UNE:
1428 CondCode = AArch64CC::NE;
1429 break;
1433 /// Convert an IR fp condition code to an AArch64 CC.
1434 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1435 /// should be AND'ed instead of OR'ed.
1436 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1437 AArch64CC::CondCode &CondCode,
1438 AArch64CC::CondCode &CondCode2) {
1439 CondCode2 = AArch64CC::AL;
1440 switch (CC) {
1441 default:
1442 changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1443 assert(CondCode2 == AArch64CC::AL);
1444 break;
1445 case CmpInst::FCMP_ONE:
1446 // (a one b)
1447 // == ((a olt b) || (a ogt b))
1448 // == ((a ord b) && (a une b))
1449 CondCode = AArch64CC::VC;
1450 CondCode2 = AArch64CC::NE;
1451 break;
1452 case CmpInst::FCMP_UEQ:
1453 // (a ueq b)
1454 // == ((a uno b) || (a oeq b))
1455 // == ((a ule b) && (a uge b))
1456 CondCode = AArch64CC::PL;
1457 CondCode2 = AArch64CC::LE;
1458 break;
1462 /// Return a register which can be used as a bit to test in a TB(N)Z.
1463 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1464 MachineRegisterInfo &MRI) {
1465 assert(Reg.isValid() && "Expected valid register!");
1466 bool HasZext = false;
1467 while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1468 unsigned Opc = MI->getOpcode();
1470 if (!MI->getOperand(0).isReg() ||
1471 !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1472 break;
1474 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1476 // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1477 // on the truncated x is the same as the bit number on x.
1478 if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1479 Opc == TargetOpcode::G_TRUNC) {
1480 if (Opc == TargetOpcode::G_ZEXT)
1481 HasZext = true;
1483 Register NextReg = MI->getOperand(1).getReg();
1484 // Did we find something worth folding?
1485 if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1486 break;
1488 // NextReg is worth folding. Keep looking.
1489 Reg = NextReg;
1490 continue;
1493 // Attempt to find a suitable operation with a constant on one side.
1494 std::optional<uint64_t> C;
1495 Register TestReg;
1496 switch (Opc) {
1497 default:
1498 break;
1499 case TargetOpcode::G_AND:
1500 case TargetOpcode::G_XOR: {
1501 TestReg = MI->getOperand(1).getReg();
1502 Register ConstantReg = MI->getOperand(2).getReg();
1503 auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1504 if (!VRegAndVal) {
1505 // AND commutes, check the other side for a constant.
1506 // FIXME: Can we canonicalize the constant so that it's always on the
1507 // same side at some point earlier?
1508 std::swap(ConstantReg, TestReg);
1509 VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1511 if (VRegAndVal) {
1512 if (HasZext)
1513 C = VRegAndVal->Value.getZExtValue();
1514 else
1515 C = VRegAndVal->Value.getSExtValue();
1517 break;
1519 case TargetOpcode::G_ASHR:
1520 case TargetOpcode::G_LSHR:
1521 case TargetOpcode::G_SHL: {
1522 TestReg = MI->getOperand(1).getReg();
1523 auto VRegAndVal =
1524 getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1525 if (VRegAndVal)
1526 C = VRegAndVal->Value.getSExtValue();
1527 break;
1531 // Didn't find a constant or viable register. Bail out of the loop.
1532 if (!C || !TestReg.isValid())
1533 break;
1535 // We found a suitable instruction with a constant. Check to see if we can
1536 // walk through the instruction.
1537 Register NextReg;
1538 unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1539 switch (Opc) {
1540 default:
1541 break;
1542 case TargetOpcode::G_AND:
1543 // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1544 if ((*C >> Bit) & 1)
1545 NextReg = TestReg;
1546 break;
1547 case TargetOpcode::G_SHL:
1548 // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1549 // the type of the register.
1550 if (*C <= Bit && (Bit - *C) < TestRegSize) {
1551 NextReg = TestReg;
1552 Bit = Bit - *C;
1554 break;
1555 case TargetOpcode::G_ASHR:
1556 // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1557 // in x
1558 NextReg = TestReg;
1559 Bit = Bit + *C;
1560 if (Bit >= TestRegSize)
1561 Bit = TestRegSize - 1;
1562 break;
1563 case TargetOpcode::G_LSHR:
1564 // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1565 if ((Bit + *C) < TestRegSize) {
1566 NextReg = TestReg;
1567 Bit = Bit + *C;
1569 break;
1570 case TargetOpcode::G_XOR:
1571 // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1572 // appropriate.
1574 // e.g. If x' = xor x, c, and the b-th bit is set in c then
1576 // tbz x', b -> tbnz x, b
1578 // Because x' only has the b-th bit set if x does not.
1579 if ((*C >> Bit) & 1)
1580 Invert = !Invert;
1581 NextReg = TestReg;
1582 break;
1585 // Check if we found anything worth folding.
1586 if (!NextReg.isValid())
1587 return Reg;
1588 Reg = NextReg;
1591 return Reg;
1594 MachineInstr *AArch64InstructionSelector::emitTestBit(
1595 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1596 MachineIRBuilder &MIB) const {
1597 assert(TestReg.isValid());
1598 assert(ProduceNonFlagSettingCondBr &&
1599 "Cannot emit TB(N)Z with speculation tracking!");
1600 MachineRegisterInfo &MRI = *MIB.getMRI();
1602 // Attempt to optimize the test bit by walking over instructions.
1603 TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1604 LLT Ty = MRI.getType(TestReg);
1605 unsigned Size = Ty.getSizeInBits();
1606 assert(!Ty.isVector() && "Expected a scalar!");
1607 assert(Bit < 64 && "Bit is too large!");
1609 // TB(N)ZW can only encode bit indices 0-31, so narrow or widen the test
1610 // register to match the W or X form we need.
1611 bool UseWReg = Bit < 32;
1612 unsigned NecessarySize = UseWReg ? 32 : 64;
1613 if (Size != NecessarySize)
1614 TestReg = moveScalarRegClass(
1615 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1616 MIB);
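// OpcTable[UseWReg][IsNegative]: the outer index selects the X (0) or W (1)
// register form, the inner index selects TBZ (0) or TBNZ (1).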
1618 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1619 {AArch64::TBZW, AArch64::TBNZW}};
1620 unsigned Opc = OpcTable[UseWReg][IsNegative];
1621 auto TestBitMI =
1622 MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1623 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1624 return &*TestBitMI;
1627 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1628 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1629 MachineIRBuilder &MIB) const {
1630 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1631 // Given something like this:
1633 // %x = ...Something...
1634 // %one = G_CONSTANT i64 1
1635 // %zero = G_CONSTANT i64 0
1636 // %and = G_AND %x, %one
1637 // %cmp = G_ICMP intpred(ne), %and, %zero
1638 // %cmp_trunc = G_TRUNC %cmp
1639 // G_BRCOND %cmp_trunc, %bb.3
1641 // We want to try and fold the AND into the G_BRCOND and produce either a
1642 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1644 // In this case, we'd get
1646 // TBNZ %x %bb.3
1649 // Check if the AND has a constant on its RHS which we can use as a mask.
1650 // If it's a power of 2, then it's the same as checking a specific bit.
1651 // (e.g., ANDing with 8 == ANDing with 000...1000 == testing if bit 3 is set)
1652 auto MaybeBit = getIConstantVRegValWithLookThrough(
1653 AndInst.getOperand(2).getReg(), *MIB.getMRI());
1654 if (!MaybeBit)
1655 return false;
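// exactLogBase2 returns the index of the single set bit when the mask is a
// power of two, and a negative value otherwise; only single-bit masks can be
// folded into a TB(N)Z.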
1657 int32_t Bit = MaybeBit->Value.exactLogBase2();
1658 if (Bit < 0)
1659 return false;
1661 Register TestReg = AndInst.getOperand(1).getReg();
1663 // Emit a TB(N)Z.
1664 emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1665 return true;
1668 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1669 bool IsNegative,
1670 MachineBasicBlock *DestMBB,
1671 MachineIRBuilder &MIB) const {
1672 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1673 MachineRegisterInfo &MRI = *MIB.getMRI();
1674 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1675 AArch64::GPRRegBankID &&
1676 "Expected GPRs only?");
1677 auto Ty = MRI.getType(CompareReg);
1678 unsigned Width = Ty.getSizeInBits();
1679 assert(!Ty.isVector() && "Expected scalar only?");
1680 assert(Width <= 64 && "Expected width to be at most 64?");
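// OpcTable[IsNegative][Is64Bit]: the outer index selects CBZ (0) or CBNZ (1),
// the inner index selects the W (0) or X (1) register form.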
1681 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1682 {AArch64::CBNZW, AArch64::CBNZX}};
1683 unsigned Opc = OpcTable[IsNegative][Width == 64];
1684 auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1685 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1686 return &*BranchMI;
1689 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1690 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1691 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1692 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1693 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1694 // totally clean. Some of them require two branches to implement.
1695 auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1696 emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1697 Pred);
1698 AArch64CC::CondCode CC1, CC2;
1699 changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1700 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1701 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1702 if (CC2 != AArch64CC::AL)
1703 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1704 I.eraseFromParent();
1705 return true;
1708 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1709 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1710 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1711 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1712 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1714 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1715 // instructions will not be produced, as they are conditional branch
1716 // instructions that do not set flags.
1717 if (!ProduceNonFlagSettingCondBr)
1718 return false;
1720 MachineRegisterInfo &MRI = *MIB.getMRI();
1721 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1722 auto Pred =
1723 static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1724 Register LHS = ICmp.getOperand(2).getReg();
1725 Register RHS = ICmp.getOperand(3).getReg();
1727 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1728 auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1729 MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1731 // When we can emit a TB(N)Z, prefer that.
1733 // Handle non-commutative condition codes first.
1734 // Note that we don't want to do this when we have a G_AND because it can
1735 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1736 if (VRegAndVal && !AndInst) {
1737 int64_t C = VRegAndVal->Value.getSExtValue();
1739 // When we have a greater-than comparison, we can just test if the msb is
1740 // zero.
1741 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1742 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1743 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1744 I.eraseFromParent();
1745 return true;
1748 // When we have a less than comparison, we can just test if the msb is not
1749 // zero.
1750 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1751 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1752 emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1753 I.eraseFromParent();
1754 return true;
1757 // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1758 // we can test if the msb is zero.
1759 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1760 uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1761 emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1762 I.eraseFromParent();
1763 return true;
1767 // Attempt to handle commutative condition codes. Right now, that's only
1768 // eq/ne.
1769 if (ICmpInst::isEquality(Pred)) {
1770 if (!VRegAndVal) {
1771 std::swap(RHS, LHS);
1772 VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1773 AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1776 if (VRegAndVal && VRegAndVal->Value == 0) {
1777 // If there's a G_AND feeding into this branch, try to fold it away by
1778 // emitting a TB(N)Z instead.
1780 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1781 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1782 // would be redundant.
1783 if (AndInst &&
1784 tryOptAndIntoCompareBranch(
1785 *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1786 I.eraseFromParent();
1787 return true;
1790 // Otherwise, try to emit a CB(N)Z instead.
1791 auto LHSTy = MRI.getType(LHS);
1792 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1793 emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1794 I.eraseFromParent();
1795 return true;
1800 return false;
1803 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1804 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1805 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1806 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1807 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1808 return true;
1810 // Couldn't optimize. Emit a compare + a Bcc.
1811 MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1812 auto PredOp = ICmp.getOperand(1);
1813 emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1814 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1815 static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1816 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1817 I.eraseFromParent();
1818 return true;
1821 bool AArch64InstructionSelector::selectCompareBranch(
1822 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1823 Register CondReg = I.getOperand(0).getReg();
1824 MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1825 // Try to select the G_BRCOND using whatever is feeding the condition if
1826 // possible.
1827 unsigned CCMIOpc = CCMI->getOpcode();
1828 if (CCMIOpc == TargetOpcode::G_FCMP)
1829 return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1830 if (CCMIOpc == TargetOpcode::G_ICMP)
1831 return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1833 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1834 // instructions will not be produced, as they are conditional branch
1835 // instructions that do not set flags.
1836 if (ProduceNonFlagSettingCondBr) {
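// The condition is a plain boolean-like value; emit a TBNZ on bit 0 so we
// branch to the destination when the low bit is set.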
1837 emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1838 I.getOperand(1).getMBB(), MIB);
1839 I.eraseFromParent();
1840 return true;
1843 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1844 auto TstMI =
1845 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1846 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1847 auto Bcc = MIB.buildInstr(AArch64::Bcc)
1848 .addImm(AArch64CC::NE)
1849 .addMBB(I.getOperand(1).getMBB());
1850 I.eraseFromParent();
1851 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1854 /// Returns the element immediate value of a vector shift operand if found.
1855 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1856 static std::optional<int64_t> getVectorShiftImm(Register Reg,
1857 MachineRegisterInfo &MRI) {
1858 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1859 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1860 return getAArch64VectorSplatScalar(*OpMI, MRI);
1863 /// Matches and returns the shift immediate value for a SHL instruction given
1864 /// a shift operand.
1865 static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1866 MachineRegisterInfo &MRI) {
1867 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1868 if (!ShiftImm)
1869 return std::nullopt;
1870 // Check the immediate is in range for a SHL.
1871 int64_t Imm = *ShiftImm;
1872 if (Imm < 0)
1873 return std::nullopt;
1874 switch (SrcTy.getElementType().getSizeInBits()) {
1875 default:
1876 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift\n");
1877 return std::nullopt;
1878 case 8:
1879 if (Imm > 7)
1880 return std::nullopt;
1881 break;
1882 case 16:
1883 if (Imm > 15)
1884 return std::nullopt;
1885 break;
1886 case 32:
1887 if (Imm > 31)
1888 return std::nullopt;
1889 break;
1890 case 64:
1891 if (Imm > 63)
1892 return std::nullopt;
1893 break;
1895 return Imm;
1898 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1899 MachineRegisterInfo &MRI) {
1900 assert(I.getOpcode() == TargetOpcode::G_SHL);
1901 Register DstReg = I.getOperand(0).getReg();
1902 const LLT Ty = MRI.getType(DstReg);
1903 Register Src1Reg = I.getOperand(1).getReg();
1904 Register Src2Reg = I.getOperand(2).getReg();
1906 if (!Ty.isVector())
1907 return false;
1909 // Check if we have a vector of constants on RHS that we can select as the
1910 // immediate form.
1911 std::optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1913 unsigned Opc = 0;
1914 if (Ty == LLT::fixed_vector(2, 64)) {
1915 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1916 } else if (Ty == LLT::fixed_vector(4, 32)) {
1917 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1918 } else if (Ty == LLT::fixed_vector(2, 32)) {
1919 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1920 } else if (Ty == LLT::fixed_vector(4, 16)) {
1921 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1922 } else if (Ty == LLT::fixed_vector(8, 16)) {
1923 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1924 } else if (Ty == LLT::fixed_vector(16, 8)) {
1925 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1926 } else if (Ty == LLT::fixed_vector(8, 8)) {
1927 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1928 } else {
1929 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type\n");
1930 return false;
1933 auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1934 if (ImmVal)
1935 Shl.addImm(*ImmVal);
1936 else
1937 Shl.addUse(Src2Reg);
1938 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1939 I.eraseFromParent();
1940 return true;
1943 bool AArch64InstructionSelector::selectVectorAshrLshr(
1944 MachineInstr &I, MachineRegisterInfo &MRI) {
1945 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1946 I.getOpcode() == TargetOpcode::G_LSHR);
1947 Register DstReg = I.getOperand(0).getReg();
1948 const LLT Ty = MRI.getType(DstReg);
1949 Register Src1Reg = I.getOperand(1).getReg();
1950 Register Src2Reg = I.getOperand(2).getReg();
1952 if (!Ty.isVector())
1953 return false;
1955 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1957 // We expect the immediate case to be lowered to AArch64ISD::VASHR or
1958 // AArch64ISD::VLSHR equivalents in the post-legalizer combines.
1960 // There is no vector shift-right-by-register instruction; instead, the
1961 // shift-left-by-register instruction takes a signed shift amount, where
1962 // negative amounts specify a right shift.
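// e.g. a v4s32 G_ASHR by %amt is emitted, schematically, as:
//   %neg = NEGv4i32 %amt
//   %res = SSHLv4i32 %src, %neg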
1964 unsigned Opc = 0;
1965 unsigned NegOpc = 0;
1966 const TargetRegisterClass *RC =
1967 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1968 if (Ty == LLT::fixed_vector(2, 64)) {
1969 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1970 NegOpc = AArch64::NEGv2i64;
1971 } else if (Ty == LLT::fixed_vector(4, 32)) {
1972 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1973 NegOpc = AArch64::NEGv4i32;
1974 } else if (Ty == LLT::fixed_vector(2, 32)) {
1975 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1976 NegOpc = AArch64::NEGv2i32;
1977 } else if (Ty == LLT::fixed_vector(4, 16)) {
1978 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1979 NegOpc = AArch64::NEGv4i16;
1980 } else if (Ty == LLT::fixed_vector(8, 16)) {
1981 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1982 NegOpc = AArch64::NEGv8i16;
1983 } else if (Ty == LLT::fixed_vector(16, 8)) {
1984 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1985 NegOpc = AArch64::NEGv16i8;
1986 } else if (Ty == LLT::fixed_vector(8, 8)) {
1987 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1988 NegOpc = AArch64::NEGv8i8;
1989 } else {
1990 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type\n");
1991 return false;
1994 auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1995 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1996 auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1997 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1998 I.eraseFromParent();
1999 return true;
2002 bool AArch64InstructionSelector::selectVaStartAAPCS(
2003 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2005 if (STI.isCallingConvWin64(MF.getFunction().getCallingConv(),
2006 MF.getFunction().isVarArg()))
2007 return false;
2009 // The layout of the va_list struct is specified in the AArch64 Procedure Call
2010 // Standard, section 10.1.5.
2012 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2013 const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
2014 const auto *PtrRegClass =
2015 STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
2017 const MCInstrDesc &MCIDAddAddr =
2018 TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
2019 const MCInstrDesc &MCIDStoreAddr =
2020 TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
2023 * typedef struct va_list {
2024 * void * stack; // next stack param
2025 * void * gr_top; // end of GP arg reg save area
2026 * void * vr_top; // end of FP/SIMD arg reg save area
2027 * int gr_offs; // offset from gr_top to next GP register arg
2028 * int vr_offs; // offset from vr_top to next FP/SIMD register arg
2029 * } va_list;
2031 const auto VAList = I.getOperand(0).getReg();
2033 // Our current offset in bytes from the va_list struct (VAList).
2034 unsigned OffsetBytes = 0;
2036 // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
2037 // and increment OffsetBytes by PtrSize.
2038 const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
2039 const Register Top = MRI.createVirtualRegister(PtrRegClass);
2040 auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr)
2041 .addDef(Top)
2042 .addFrameIndex(FrameIndex)
2043 .addImm(Imm)
2044 .addImm(0);
2045 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2047 const auto *MMO = *I.memoperands_begin();
2048 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr)
2049 .addUse(Top)
2050 .addUse(VAList)
2051 .addImm(OffsetBytes / PtrSize)
2052 .addMemOperand(MF.getMachineMemOperand(
2053 MMO->getPointerInfo().getWithOffset(OffsetBytes),
2054 MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign()));
2055 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2057 OffsetBytes += PtrSize;
2060 // void* stack at offset 0
2061 PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
2063 // void* gr_top at offset 8 (4 on ILP32)
2064 const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
2065 PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
2067 // void* vr_top at offset 16 (8 on ILP32)
2068 const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
2069 PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
2071 // Helper function to store a 4-byte integer constant to VAList at offset
2072 // OffsetBytes, and increment OffsetBytes by 4.
2073 const auto PushIntConstant = [&](const int32_t Value) {
2074 constexpr int IntSize = 4;
2075 const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2076 auto MIB =
2077 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm))
2078 .addDef(Temp)
2079 .addImm(Value);
2080 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2082 const auto *MMO = *I.memoperands_begin();
2083 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui))
2084 .addUse(Temp)
2085 .addUse(VAList)
2086 .addImm(OffsetBytes / IntSize)
2087 .addMemOperand(MF.getMachineMemOperand(
2088 MMO->getPointerInfo().getWithOffset(OffsetBytes),
2089 MachineMemOperand::MOStore, IntSize, MMO->getBaseAlign()));
2090 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2091 OffsetBytes += IntSize;
2094 // int gr_offs at offset 24 (12 on ILP32)
2095 PushIntConstant(-static_cast<int32_t>(GPRSize));
2097 // int vr_offs at offset 28 (16 on ILP32)
2098 PushIntConstant(-static_cast<int32_t>(FPRSize));
2100 assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
2102 I.eraseFromParent();
2103 return true;
2106 bool AArch64InstructionSelector::selectVaStartDarwin(
2107 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
2108 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
2109 Register ListReg = I.getOperand(0).getReg();
2111 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2113 int FrameIdx = FuncInfo->getVarArgsStackIndex();
2114 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
2115 MF.getFunction().getCallingConv(), MF.getFunction().isVarArg())) {
2116 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
2117 ? FuncInfo->getVarArgsGPRIndex()
2118 : FuncInfo->getVarArgsStackIndex();
2121 auto MIB =
2122 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2123 .addDef(ArgsAddrReg)
2124 .addFrameIndex(FrameIdx)
2125 .addImm(0)
2126 .addImm(0);
2128 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2130 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2131 .addUse(ArgsAddrReg)
2132 .addUse(ListReg)
2133 .addImm(0)
2134 .addMemOperand(*I.memoperands_begin());
2136 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2137 I.eraseFromParent();
2138 return true;
2141 void AArch64InstructionSelector::materializeLargeCMVal(
2142 MachineInstr &I, const Value *V, unsigned OpFlags) {
2143 MachineBasicBlock &MBB = *I.getParent();
2144 MachineFunction &MF = *MBB.getParent();
2145 MachineRegisterInfo &MRI = MF.getRegInfo();
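// Materialize the address 16 bits at a time; the emitted sequence is roughly:
//   movz x, #:abs_g0_nc:sym
//   movk x, #:abs_g1_nc:sym, lsl #16
//   movk x, #:abs_g2_nc:sym, lsl #32
//   movk x, #:abs_g3:sym, lsl #48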
2147 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2148 MovZ->addOperand(MF, I.getOperand(1));
2149 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2150 AArch64II::MO_NC);
2151 MovZ->addOperand(MF, MachineOperand::CreateImm(0));
2152 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2154 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2155 Register ForceDstReg) {
2156 Register DstReg = ForceDstReg
2157 ? ForceDstReg
2158 : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2159 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2160 if (auto *GV = dyn_cast<GlobalValue>(V)) {
2161 MovI->addOperand(MF, MachineOperand::CreateGA(
2162 GV, MovZ->getOperand(1).getOffset(), Flags));
2163 } else {
2164 MovI->addOperand(
2165 MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
2166 MovZ->getOperand(1).getOffset(), Flags));
2168 MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
2169 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2170 return DstReg;
2172 Register DstReg = BuildMovK(MovZ.getReg(0),
2173 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2174 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2175 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
2178 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2179 MachineBasicBlock &MBB = *I.getParent();
2180 MachineFunction &MF = *MBB.getParent();
2181 MachineRegisterInfo &MRI = MF.getRegInfo();
2183 switch (I.getOpcode()) {
2184 case TargetOpcode::G_STORE: {
2185 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2186 MachineOperand &SrcOp = I.getOperand(0);
2187 if (MRI.getType(SrcOp.getReg()).isPointer()) {
2188 // Allow matching with imported patterns for stores of pointers. Unlike
2189 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2190 // and constrain.
2191 auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
2192 Register NewSrc = Copy.getReg(0);
2193 SrcOp.setReg(NewSrc);
2194 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2195 Changed = true;
2197 return Changed;
2199 case TargetOpcode::G_PTR_ADD:
2200 return convertPtrAddToAdd(I, MRI);
2201 case TargetOpcode::G_LOAD: {
2202 // For scalar loads of pointers, we try to convert the dest type from p0
2203 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2204 // conversion, this should be ok because all users should have been
2205 // selected already, so the type doesn't matter for them.
2206 Register DstReg = I.getOperand(0).getReg();
2207 const LLT DstTy = MRI.getType(DstReg);
2208 if (!DstTy.isPointer())
2209 return false;
2210 MRI.setType(DstReg, LLT::scalar(64));
2211 return true;
2213 case AArch64::G_DUP: {
2214 // Convert the type from p0 to s64 to help selection.
2215 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2216 if (!DstTy.isPointerVector())
2217 return false;
2218 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
2219 MRI.setType(I.getOperand(0).getReg(),
2220 DstTy.changeElementType(LLT::scalar(64)));
2221 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2222 I.getOperand(1).setReg(NewSrc.getReg(0));
2223 return true;
2225 case AArch64::G_INSERT_VECTOR_ELT: {
2226 // Convert the type from p0 to s64 to help selection.
2227 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2228 LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg());
2229 if (!SrcVecTy.isPointerVector())
2230 return false;
2231 auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg());
2232 MRI.setType(I.getOperand(1).getReg(),
2233 DstTy.changeElementType(LLT::scalar(64)));
2234 MRI.setType(I.getOperand(0).getReg(),
2235 DstTy.changeElementType(LLT::scalar(64)));
2236 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2237 I.getOperand(2).setReg(NewSrc.getReg(0));
2238 return true;
2240 case TargetOpcode::G_UITOFP:
2241 case TargetOpcode::G_SITOFP: {
2242 // If both source and destination regbanks are FPR, then convert the opcode
2243 // to G_SITOF so that the importer can select it to an fpr variant.
2244 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2245 // copy.
2246 Register SrcReg = I.getOperand(1).getReg();
2247 LLT SrcTy = MRI.getType(SrcReg);
2248 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2249 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2250 return false;
2252 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2253 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2254 I.setDesc(TII.get(AArch64::G_SITOF));
2255 else
2256 I.setDesc(TII.get(AArch64::G_UITOF));
2257 return true;
2259 return false;
2261 default:
2262 return false;
2266 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2267 /// them to a standard G_ADD with a COPY on the source.
2269 /// The motivation behind this is to expose the add semantics to the imported
2270 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2271 /// because the selector works bottom up, uses before defs. By the time we
2272 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2273 /// fold it into an addressing mode and failed to do so.
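///
/// e.g. %dst:gpr(p0) = G_PTR_ADD %base(p0), %off(s64) becomes, roughly:
///   %basei:gpr(s64) = G_PTRTOINT %base(p0)
///   %dst:gpr(s64)   = G_ADD %basei, %off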
2274 bool AArch64InstructionSelector::convertPtrAddToAdd(
2275 MachineInstr &I, MachineRegisterInfo &MRI) {
2276 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2277 Register DstReg = I.getOperand(0).getReg();
2278 Register AddOp1Reg = I.getOperand(1).getReg();
2279 const LLT PtrTy = MRI.getType(DstReg);
2280 if (PtrTy.getAddressSpace() != 0)
2281 return false;
2283 const LLT CastPtrTy =
2284 PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2285 auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2286 // Set regbanks on the registers.
2287 if (PtrTy.isVector())
2288 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2289 else
2290 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2292 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2293 // %dst(intty) = G_ADD %intbase, off
2294 I.setDesc(TII.get(TargetOpcode::G_ADD));
2295 MRI.setType(DstReg, CastPtrTy);
2296 I.getOperand(1).setReg(PtrToInt.getReg(0));
2297 if (!select(*PtrToInt)) {
2298 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd\n");
2299 return false;
2302 // Also take the opportunity here to try to do some optimization.
2303 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2304 Register NegatedReg;
2305 if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2306 return true;
2307 I.getOperand(2).setReg(NegatedReg);
2308 I.setDesc(TII.get(TargetOpcode::G_SUB));
2309 return true;
2312 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2313 MachineRegisterInfo &MRI) {
2314 // We try to match the immediate variant of LSL, which is actually an alias
2315 // for a special case of UBFM. Otherwise, we fall back to the imported
2316 // selector which will match the register variant.
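// e.g. for a 64-bit shift, lsl x0, x1, #4 is the alias of
// ubfm x0, x1, #60, #59.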
2317 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2318 const auto &MO = I.getOperand(2);
2319 auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2320 if (!VRegAndVal)
2321 return false;
2323 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2324 if (DstTy.isVector())
2325 return false;
2326 bool Is64Bit = DstTy.getSizeInBits() == 64;
2327 auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2328 auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2330 if (!Imm1Fn || !Imm2Fn)
2331 return false;
2333 auto NewI =
2334 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2335 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2337 for (auto &RenderFn : *Imm1Fn)
2338 RenderFn(NewI);
2339 for (auto &RenderFn : *Imm2Fn)
2340 RenderFn(NewI);
2342 I.eraseFromParent();
2343 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2346 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2347 MachineInstr &I, MachineRegisterInfo &MRI) {
2348 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2349 // If we're storing a scalar, it doesn't matter what register bank that
2350 // scalar is on. All that matters is the size.
2352 // So, if we see something like this (with a 32-bit scalar as an example):
2354 // %x:gpr(s32) = ... something ...
2355 // %y:fpr(s32) = COPY %x:gpr(s32)
2356 // G_STORE %y:fpr(s32)
2358 // We can fix this up into something like this:
2360 // G_STORE %x:gpr(s32)
2362 // And then continue the selection process normally.
2363 Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2364 if (!DefDstReg.isValid())
2365 return false;
2366 LLT DefDstTy = MRI.getType(DefDstReg);
2367 Register StoreSrcReg = I.getOperand(0).getReg();
2368 LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2370 // If we get something strange like a physical register, then we shouldn't
2371 // go any further.
2372 if (!DefDstTy.isValid())
2373 return false;
2375 // Are the source and dst types the same size?
2376 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2377 return false;
2379 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2380 RBI.getRegBank(DefDstReg, MRI, TRI))
2381 return false;
2383 // We have a cross-bank copy, which is entering a store. Let's fold it.
2384 I.getOperand(0).setReg(DefDstReg);
2385 return true;
2388 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2389 assert(I.getParent() && "Instruction should be in a basic block!");
2390 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2392 MachineBasicBlock &MBB = *I.getParent();
2393 MachineFunction &MF = *MBB.getParent();
2394 MachineRegisterInfo &MRI = MF.getRegInfo();
2396 switch (I.getOpcode()) {
2397 case AArch64::G_DUP: {
2398 // Before selecting a DUP instruction, check if it is better selected as a
2399 // MOV or load from a constant pool.
2400 Register Src = I.getOperand(1).getReg();
2401 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(Src, MRI);
2402 if (!ValAndVReg)
2403 return false;
2404 LLVMContext &Ctx = MF.getFunction().getContext();
2405 Register Dst = I.getOperand(0).getReg();
2406 auto *CV = ConstantDataVector::getSplat(
2407 MRI.getType(Dst).getNumElements(),
2408 ConstantInt::get(
2409 Type::getIntNTy(Ctx, MRI.getType(Dst).getScalarSizeInBits()),
2410 ValAndVReg->Value.trunc(MRI.getType(Dst).getScalarSizeInBits())));
2411 if (!emitConstantVector(Dst, CV, MIB, MRI))
2412 return false;
2413 I.eraseFromParent();
2414 return true;
2416 case TargetOpcode::G_SEXT:
2417 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2418 // over a normal extend.
2419 if (selectUSMovFromExtend(I, MRI))
2420 return true;
2421 return false;
2422 case TargetOpcode::G_BR:
2423 return false;
2424 case TargetOpcode::G_SHL:
2425 return earlySelectSHL(I, MRI);
2426 case TargetOpcode::G_CONSTANT: {
2427 bool IsZero = false;
2428 if (I.getOperand(1).isCImm())
2429 IsZero = I.getOperand(1).getCImm()->isZero();
2430 else if (I.getOperand(1).isImm())
2431 IsZero = I.getOperand(1).getImm() == 0;
2433 if (!IsZero)
2434 return false;
2436 Register DefReg = I.getOperand(0).getReg();
2437 LLT Ty = MRI.getType(DefReg);
2438 if (Ty.getSizeInBits() == 64) {
2439 I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2440 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2441 } else if (Ty.getSizeInBits() == 32) {
2442 I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2443 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2444 } else
2445 return false;
2447 I.setDesc(TII.get(TargetOpcode::COPY));
2448 return true;
2451 case TargetOpcode::G_ADD: {
2452 // Check if this is being fed by a G_ICMP on either side.
2454 // (cmp pred, x, y) + z
2456 // In the above case, when the cmp is true, we increment z by 1. So, we can
2457 // fold the add into the cset for the cmp by using cinc.
2459 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
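// e.g. %a = G_ADD %z, (G_ICMP eq, %x, %y) can be emitted, roughly, as:
//   SUBS wzr, %x, %y        ; compare, setting flags
//   CSINC %a, %z, %z, ne    ; %z + 1 when eq holds, %z otherwise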
2460 Register AddDst = I.getOperand(0).getReg();
2461 Register AddLHS = I.getOperand(1).getReg();
2462 Register AddRHS = I.getOperand(2).getReg();
2463 // Only handle scalars.
2464 LLT Ty = MRI.getType(AddLHS);
2465 if (Ty.isVector())
2466 return false;
2467 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2468 // bits.
2469 unsigned Size = Ty.getSizeInBits();
2470 if (Size != 32 && Size != 64)
2471 return false;
2472 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2473 if (!MRI.hasOneNonDBGUse(Reg))
2474 return nullptr;
2475 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2476 // compare.
2477 if (Size == 32)
2478 return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2479 // We model scalar compares using 32-bit destinations right now.
2480 // If it's a 64-bit compare, it'll have 64-bit sources.
2481 Register ZExt;
2482 if (!mi_match(Reg, MRI,
2483 m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2484 return nullptr;
2485 auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2486 if (!Cmp ||
2487 MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2488 return nullptr;
2489 return Cmp;
2491 // Try to match
2492 // z + (cmp pred, x, y)
2493 MachineInstr *Cmp = MatchCmp(AddRHS);
2494 if (!Cmp) {
2495 // (cmp pred, x, y) + z
2496 std::swap(AddLHS, AddRHS);
2497 Cmp = MatchCmp(AddRHS);
2498 if (!Cmp)
2499 return false;
2501 auto &PredOp = Cmp->getOperand(1);
2502 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2503 const AArch64CC::CondCode InvCC =
2504 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2505 MIB.setInstrAndDebugLoc(I);
2506 emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2507 /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2508 emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2509 I.eraseFromParent();
2510 return true;
2512 case TargetOpcode::G_OR: {
2513 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2514 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2515 // shifting and masking that we can replace with a BFI (encoded as a BFM).
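// e.g. with Size = 32 and ShiftImm = 8 the pattern
//   %dst = G_OR (G_SHL %x, 8), (G_AND %y, 0xff)
// becomes BFMWri %dst, %y, %x, #24, #23, i.e. a BFI inserting the low 24
// bits of %x into bits [31:8] of %y.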
2516 Register Dst = I.getOperand(0).getReg();
2517 LLT Ty = MRI.getType(Dst);
2519 if (!Ty.isScalar())
2520 return false;
2522 unsigned Size = Ty.getSizeInBits();
2523 if (Size != 32 && Size != 64)
2524 return false;
2526 Register ShiftSrc;
2527 int64_t ShiftImm;
2528 Register MaskSrc;
2529 int64_t MaskImm;
2530 if (!mi_match(
2531 Dst, MRI,
2532 m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2533 m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2534 return false;
2536 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2537 return false;
2539 int64_t Immr = Size - ShiftImm;
2540 int64_t Imms = Size - ShiftImm - 1;
2541 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2542 emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2543 I.eraseFromParent();
2544 return true;
2546 case TargetOpcode::G_FENCE: {
2547 if (I.getOperand(1).getImm() == 0)
2548 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2549 else
2550 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2551 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2552 I.eraseFromParent();
2553 return true;
2555 default:
2556 return false;
2560 bool AArch64InstructionSelector::select(MachineInstr &I) {
2561 assert(I.getParent() && "Instruction should be in a basic block!");
2562 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2564 MachineBasicBlock &MBB = *I.getParent();
2565 MachineFunction &MF = *MBB.getParent();
2566 MachineRegisterInfo &MRI = MF.getRegInfo();
2568 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2569 if (Subtarget->requiresStrictAlign()) {
2570 // We don't support this feature yet.
2571 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2572 return false;
2575 MIB.setInstrAndDebugLoc(I);
2577 unsigned Opcode = I.getOpcode();
2578 // G_PHI requires the same handling as PHI.
2579 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2580 // Certain non-generic instructions also need some special handling.
2582 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2583 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2585 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2586 const Register DefReg = I.getOperand(0).getReg();
2587 const LLT DefTy = MRI.getType(DefReg);
2589 const RegClassOrRegBank &RegClassOrBank =
2590 MRI.getRegClassOrRegBank(DefReg);
2592 const TargetRegisterClass *DefRC =
2593 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
2594 if (!DefRC) {
2595 if (!DefTy.isValid()) {
2596 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2597 return false;
2599 const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
2600 DefRC = getRegClassForTypeOnBank(DefTy, RB);
2601 if (!DefRC) {
2602 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2603 return false;
2607 I.setDesc(TII.get(TargetOpcode::PHI));
2609 return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2612 if (I.isCopy())
2613 return selectCopy(I, TII, MRI, TRI, RBI);
2615 if (I.isDebugInstr())
2616 return selectDebugInstr(I, MRI, RBI);
2618 return true;
2622 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2623 LLVM_DEBUG(
2624 dbgs() << "Generic instruction has unexpected implicit operands\n");
2625 return false;
2628 // Try to do some lowering before we start instruction selecting. These
2629 // lowerings are purely transformations on the input G_MIR and so selection
2630 // must continue after any modification of the instruction.
2631 if (preISelLower(I)) {
2632 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2635 // There may be patterns that the importer can't handle optimally, but still
2636 // selects to a suboptimal sequence, so our custom C++ selection code later
2637 // never gets a chance to work on them. Therefore, we have an early
2638 // selection attempt here to give priority to certain selection routines
2639 // over the imported ones.
2640 if (earlySelect(I))
2641 return true;
2643 if (selectImpl(I, *CoverageInfo))
2644 return true;
2646 LLT Ty =
2647 I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2649 switch (Opcode) {
2650 case TargetOpcode::G_SBFX:
2651 case TargetOpcode::G_UBFX: {
2652 static const unsigned OpcTable[2][2] = {
2653 {AArch64::UBFMWri, AArch64::UBFMXri},
2654 {AArch64::SBFMWri, AArch64::SBFMXri}};
2655 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2656 unsigned Size = Ty.getSizeInBits();
2657 unsigned Opc = OpcTable[IsSigned][Size == 64];
2658 auto Cst1 =
2659 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2660 assert(Cst1 && "Should have gotten a constant for src 1?");
2661 auto Cst2 =
2662 getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2663 assert(Cst2 && "Should have gotten a constant for src 2?");
2664 auto LSB = Cst1->Value.getZExtValue();
2665 auto Width = Cst2->Value.getZExtValue();
2666 auto BitfieldInst =
2667 MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2668 .addImm(LSB)
2669 .addImm(LSB + Width - 1);
2670 I.eraseFromParent();
2671 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2673 case TargetOpcode::G_BRCOND:
2674 return selectCompareBranch(I, MF, MRI);
2676 case TargetOpcode::G_BRINDIRECT: {
2677 const Function &Fn = MF.getFunction();
2678 if (std::optional<uint16_t> BADisc =
2679 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(Fn)) {
2680 auto MI = MIB.buildInstr(AArch64::BRA, {}, {I.getOperand(0).getReg()});
2681 MI.addImm(AArch64PACKey::IA);
2682 MI.addImm(*BADisc);
2683 MI.addReg(/*AddrDisc=*/AArch64::XZR);
2684 I.eraseFromParent();
2685 return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
2687 I.setDesc(TII.get(AArch64::BR));
2688 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2691 case TargetOpcode::G_BRJT:
2692 return selectBrJT(I, MRI);
2694 case AArch64::G_ADD_LOW: {
2695 // This op may have been separated from its ADRP companion by the localizer
2696 // or some other code motion pass. Given that many CPUs will try to
2697 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2698 // which will later be expanded into an ADRP+ADD pair after scheduling.
2699 MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2700 if (BaseMI->getOpcode() != AArch64::ADRP) {
2701 I.setDesc(TII.get(AArch64::ADDXri));
2702 I.addOperand(MachineOperand::CreateImm(0));
2703 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2705 assert(TM.getCodeModel() == CodeModel::Small &&
2706 "Expected small code model");
2707 auto Op1 = BaseMI->getOperand(1);
2708 auto Op2 = I.getOperand(2);
2709 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2710 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2711 Op1.getTargetFlags())
2712 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2713 Op2.getTargetFlags());
2714 I.eraseFromParent();
2715 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2718 case TargetOpcode::G_FCONSTANT:
2719 case TargetOpcode::G_CONSTANT: {
2720 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2722 const LLT s8 = LLT::scalar(8);
2723 const LLT s16 = LLT::scalar(16);
2724 const LLT s32 = LLT::scalar(32);
2725 const LLT s64 = LLT::scalar(64);
2726 const LLT s128 = LLT::scalar(128);
2727 const LLT p0 = LLT::pointer(0, 64);
2729 const Register DefReg = I.getOperand(0).getReg();
2730 const LLT DefTy = MRI.getType(DefReg);
2731 const unsigned DefSize = DefTy.getSizeInBits();
2732 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2734 // FIXME: Redundant check, but even less readable when factored out.
2735 if (isFP) {
2736 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2737 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2738 << " constant, expected: " << s16 << " or " << s32
2739 << " or " << s64 << " or " << s128 << '\n');
2740 return false;
2743 if (RB.getID() != AArch64::FPRRegBankID) {
2744 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2745 << " constant on bank: " << RB
2746 << ", expected: FPR\n");
2747 return false;
2750 // The 0.0 case is covered by tablegen. Reject it here so we can be sure
2751 // tablegen works correctly and isn't rescued by this code. The exception is
2752 // FP128, whose 0.0 is not covered by tablegen, so we handle that
2753 // scenario here.
2754 if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2755 return false;
2756 } else {
2757 // s32 and s64 are covered by tablegen.
2758 if (Ty != p0 && Ty != s8 && Ty != s16) {
2759 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2760 << " constant, expected: " << s32 << ", " << s64
2761 << ", or " << p0 << '\n');
2762 return false;
2765 if (RB.getID() != AArch64::GPRRegBankID) {
2766 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2767 << " constant on bank: " << RB
2768 << ", expected: GPR\n");
2769 return false;
2773 if (isFP) {
2774 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2775 // For s16 and s128 values, and for s32/s64 values that aren't legal FMOV immediates, emit a constant pool load.
2776 switch (DefSize) {
2777 default:
2778 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2779 case 32:
2780 case 64: {
2781 bool OptForSize = shouldOptForSize(&MF);
2782 const auto &TLI = MF.getSubtarget().getTargetLowering();
2783 // If TLI says that this fpimm is illegal, then we'll expand to a
2784 // constant pool load.
2785 if (TLI->isFPImmLegal(I.getOperand(1).getFPImm()->getValueAPF(),
2786 EVT::getFloatingPointVT(DefSize), OptForSize))
2787 break;
2788 [[fallthrough]];
2790 case 16:
2791 case 128: {
2792 auto *FPImm = I.getOperand(1).getFPImm();
2793 auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2794 if (!LoadMI) {
2795 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2796 return false;
2798 MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2799 I.eraseFromParent();
2800 return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2804 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2805 // Either emit a FMOV, or emit a copy to emit a normal mov.
2806 const Register DefGPRReg = MRI.createVirtualRegister(
2807 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2808 MachineOperand &RegOp = I.getOperand(0);
2809 RegOp.setReg(DefGPRReg);
2810 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2811 MIB.buildCopy({DefReg}, {DefGPRReg});
2813 if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2814 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2815 return false;
2818 MachineOperand &ImmOp = I.getOperand(1);
2819 // FIXME: Is going through int64_t always correct?
2820 ImmOp.ChangeToImmediate(
2821 ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2822 } else if (I.getOperand(1).isCImm()) {
2823 uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2824 I.getOperand(1).ChangeToImmediate(Val);
2825 } else if (I.getOperand(1).isImm()) {
2826 uint64_t Val = I.getOperand(1).getImm();
2827 I.getOperand(1).ChangeToImmediate(Val);
2830 const unsigned MovOpc =
2831 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2832 I.setDesc(TII.get(MovOpc));
2833 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2834 return true;
2836 case TargetOpcode::G_EXTRACT: {
2837 Register DstReg = I.getOperand(0).getReg();
2838 Register SrcReg = I.getOperand(1).getReg();
2839 LLT SrcTy = MRI.getType(SrcReg);
2840 LLT DstTy = MRI.getType(DstReg);
2841 (void)DstTy;
2842 unsigned SrcSize = SrcTy.getSizeInBits();
2844 if (SrcTy.getSizeInBits() > 64) {
2845 // This should be an extract of an s128, which is like a vector extract.
2846 if (SrcTy.getSizeInBits() != 128)
2847 return false;
2848 // Only support extracting 64 bits from an s128 at the moment.
2849 if (DstTy.getSizeInBits() != 64)
2850 return false;
2852 unsigned Offset = I.getOperand(2).getImm();
2853 if (Offset % 64 != 0)
2854 return false;
2856 // Check we have the right regbank always.
2857 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2858 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2859 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2861 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2862 auto NewI =
2863 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2864 .addUse(SrcReg, 0,
2865 Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2866 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2867 AArch64::GPR64RegClass, NewI->getOperand(0));
2868 I.eraseFromParent();
2869 return true;
2872 // Emit the same code as a vector extract.
2873 // Offset must be a multiple of 64.
2874 unsigned LaneIdx = Offset / 64;
2875 MachineInstr *Extract = emitExtractVectorElt(
2876 DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2877 if (!Extract)
2878 return false;
2879 I.eraseFromParent();
2880 return true;
2883 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2884 MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2885 Ty.getSizeInBits() - 1);
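// With immr = Offset and imms = Offset + DstSize - 1 this is the UBFX
// alias, extracting DstSize bits starting at bit Offset.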
2887 if (SrcSize < 64) {
2888 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2889 "unexpected G_EXTRACT types");
2890 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2893 DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2894 MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2895 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2896 .addReg(DstReg, 0, AArch64::sub_32);
2897 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2898 AArch64::GPR32RegClass, MRI);
2899 I.getOperand(0).setReg(DstReg);
2901 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2904 case TargetOpcode::G_INSERT: {
2905 LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2906 LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2907 unsigned DstSize = DstTy.getSizeInBits();
2908 // Larger inserts are vectors; same-size ones should be something else by
2909 // now (split up or turned into COPYs).
2910 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2911 return false;
2913 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2914 unsigned LSB = I.getOperand(3).getImm();
2915 unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2916 I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2917 MachineInstrBuilder(MF, I).addImm(Width - 1);
2919 if (DstSize < 64) {
2920 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2921 "unexpected G_INSERT types");
2922 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2925 Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2926 BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2927 TII.get(AArch64::SUBREG_TO_REG))
2928 .addDef(SrcReg)
2929 .addImm(0)
2930 .addUse(I.getOperand(2).getReg())
2931 .addImm(AArch64::sub_32);
2932 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2933 AArch64::GPR32RegClass, MRI);
2934 I.getOperand(2).setReg(SrcReg);
2936 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2938 case TargetOpcode::G_FRAME_INDEX: {
2939 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2940 if (Ty != LLT::pointer(0, 64)) {
2941 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2942 << ", expected: " << LLT::pointer(0, 64) << '\n');
2943 return false;
2945 I.setDesc(TII.get(AArch64::ADDXri));
2947 // MOs for a #0 shifted immediate.
2948 I.addOperand(MachineOperand::CreateImm(0));
2949 I.addOperand(MachineOperand::CreateImm(0));
2951 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2954 case TargetOpcode::G_GLOBAL_VALUE: {
2955 const GlobalValue *GV = nullptr;
2956 unsigned OpFlags;
2957 if (I.getOperand(1).isSymbol()) {
2958 OpFlags = I.getOperand(1).getTargetFlags();
2959 // Currently only used by "RtLibUseGOT".
2960 assert(OpFlags == AArch64II::MO_GOT);
2961 } else {
2962 GV = I.getOperand(1).getGlobal();
2963 if (GV->isThreadLocal())
2964 return selectTLSGlobalValue(I, MRI);
2965 OpFlags = STI.ClassifyGlobalReference(GV, TM);
2968 if (OpFlags & AArch64II::MO_GOT) {
2969 I.setDesc(TII.get(MF.getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
2970 ? AArch64::LOADgotAUTH
2971 : AArch64::LOADgot));
2972 I.getOperand(1).setTargetFlags(OpFlags);
2973 } else if (TM.getCodeModel() == CodeModel::Large &&
2974 !TM.isPositionIndependent()) {
2975 // Materialize the global using movz/movk instructions.
2976 materializeLargeCMVal(I, GV, OpFlags);
2977 I.eraseFromParent();
2978 return true;
2979 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2980 I.setDesc(TII.get(AArch64::ADR));
2981 I.getOperand(1).setTargetFlags(OpFlags);
2982 } else {
2983 I.setDesc(TII.get(AArch64::MOVaddr));
2984 I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2985 MachineInstrBuilder MIB(MF, I);
2986 MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2987 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2989 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2992 case TargetOpcode::G_PTRAUTH_GLOBAL_VALUE:
2993 return selectPtrAuthGlobalValue(I, MRI);
2995 case TargetOpcode::G_ZEXTLOAD:
2996 case TargetOpcode::G_LOAD:
2997 case TargetOpcode::G_STORE: {
2998 GLoadStore &LdSt = cast<GLoadStore>(I);
2999 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
3000 LLT PtrTy = MRI.getType(LdSt.getPointerReg());
3002 if (PtrTy != LLT::pointer(0, 64)) {
3003 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
3004 << ", expected: " << LLT::pointer(0, 64) << '\n');
3005 return false;
3008 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
3009 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
3010 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
3012 // Need special instructions for atomics that affect ordering.
3013 if (Order != AtomicOrdering::NotAtomic &&
3014 Order != AtomicOrdering::Unordered &&
3015 Order != AtomicOrdering::Monotonic) {
3016 assert(!isa<GZExtLoad>(LdSt));
3017 assert(MemSizeInBytes <= 8 &&
3018 "128-bit atomics should already be custom-legalized");
3020 if (isa<GLoad>(LdSt)) {
3021 static constexpr unsigned LDAPROpcodes[] = {
3022 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
3023 static constexpr unsigned LDAROpcodes[] = {
3024 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
3025 ArrayRef<unsigned> Opcodes =
3026 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
3027 ? LDAPROpcodes
3028 : LDAROpcodes;
3029 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3030 } else {
3031 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
3032 AArch64::STLRW, AArch64::STLRX};
3033 Register ValReg = LdSt.getReg(0);
3034 if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
3035 // Emit a subreg copy of 32 bits.
3036 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3037 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
3038 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
3039 I.getOperand(0).setReg(NewVal);
3041 I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
3043 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3044 return true;
3047 #ifndef NDEBUG
3048 const Register PtrReg = LdSt.getPointerReg();
3049 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
3050 // Check that the pointer register is valid.
3051 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
3052 "Load/Store pointer operand isn't a GPR");
3053 assert(MRI.getType(PtrReg).isPointer() &&
3054 "Load/Store pointer operand isn't a pointer");
3055 #endif
3057 const Register ValReg = LdSt.getReg(0);
3058 const LLT ValTy = MRI.getType(ValReg);
3059 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
3061 // The code below doesn't support truncating stores, so we need to split it
3062 // again.
3063 if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3064 unsigned SubReg;
3065 LLT MemTy = LdSt.getMMO().getMemoryType();
3066 auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3067 if (!getSubRegForClass(RC, TRI, SubReg))
3068 return false;
3070 // Generate a subreg copy.
3071 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
3072 .addReg(ValReg, 0, SubReg)
3073 .getReg(0);
3074 RBI.constrainGenericRegister(Copy, *RC, MRI);
3075 LdSt.getOperand(0).setReg(Copy);
3076 } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
3077 // If this is an any-extending load from the FPR bank, split it into a regular
3078 // load + extend.
3079 if (RB.getID() == AArch64::FPRRegBankID) {
3080 unsigned SubReg;
3081 LLT MemTy = LdSt.getMMO().getMemoryType();
3082 auto *RC = getRegClassForTypeOnBank(MemTy, RB);
3083 if (!getSubRegForClass(RC, TRI, SubReg))
3084 return false;
3085 Register OldDst = LdSt.getReg(0);
3086 Register NewDst =
3087 MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
3088 LdSt.getOperand(0).setReg(NewDst);
3089 MRI.setRegBank(NewDst, RB);
3090 // Generate a SUBREG_TO_REG to extend it.
3091 MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
3092 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
3093 .addImm(0)
3094 .addUse(NewDst)
3095 .addImm(SubReg);
3096 auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
3097 RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
3098 MIB.setInstr(LdSt);
3102 // Helper lambda for partially selecting I. Either returns the original
3103 // instruction with an updated opcode, or a new instruction.
3104 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
3105 bool IsStore = isa<GStore>(I);
3106 const unsigned NewOpc =
3107 selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
3108 if (NewOpc == I.getOpcode())
3109 return nullptr;
3110 // Check if we can fold anything into the addressing mode.
3111 auto AddrModeFns =
3112 selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
3113 if (!AddrModeFns) {
3114 // Can't fold anything. Use the original instruction.
3115 I.setDesc(TII.get(NewOpc));
3116 I.addOperand(MachineOperand::CreateImm(0));
3117 return &I;
3120 // Folded something. Create a new instruction and return it.
3121 auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
3122 Register CurValReg = I.getOperand(0).getReg();
3123 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
3124 NewInst.cloneMemRefs(I);
3125 for (auto &Fn : *AddrModeFns)
3126 Fn(NewInst);
3127 I.eraseFromParent();
3128 return &*NewInst;
3131 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
3132 if (!LoadStore)
3133 return false;
3135 // If we're storing a 0, use WZR/XZR.
3136 if (Opcode == TargetOpcode::G_STORE) {
3137 auto CVal = getIConstantVRegValWithLookThrough(
3138 LoadStore->getOperand(0).getReg(), MRI);
3139 if (CVal && CVal->Value == 0) {
3140 switch (LoadStore->getOpcode()) {
3141 case AArch64::STRWui:
3142 case AArch64::STRHHui:
3143 case AArch64::STRBBui:
3144 LoadStore->getOperand(0).setReg(AArch64::WZR);
3145 break;
3146 case AArch64::STRXui:
3147 LoadStore->getOperand(0).setReg(AArch64::XZR);
3148 break;
3153 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3154 ValTy == LLT::scalar(64) && MemSizeInBits == 32)) {
3155 // The any/zextload from a smaller type to i32 should be handled by the
3156 // importer.
3157 if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
3158 return false;
3159     // If we have an extending load, change the load's result to a narrower
3160     // 32-bit register and zero-extend it with SUBREG_TO_REG.
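    // Illustrative sketch (not from the original source):
    //   %dst:gpr(s64) = G_ZEXTLOAD %p :: (load (s32))
    // is selected as a 32-bit load (which implicitly zeroes the upper bits)
    // followed by a SUBREG_TO_REG forming the 64-bit value:
    //   %ld:gpr32 = LDRWui %p, 0
    //   %dst:gpr64all = SUBREG_TO_REG 0, %ld, %subreg.sub_32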
3161 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3162 Register DstReg = LoadStore->getOperand(0).getReg();
3163 LoadStore->getOperand(0).setReg(LdReg);
3165 MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
3166 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3167 .addImm(0)
3168 .addUse(LdReg)
3169 .addImm(AArch64::sub_32);
3170 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3171 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3172 MRI);
3174 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3177 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3178 case TargetOpcode::G_INDEXED_SEXTLOAD:
3179 return selectIndexedExtLoad(I, MRI);
3180 case TargetOpcode::G_INDEXED_LOAD:
3181 return selectIndexedLoad(I, MRI);
3182 case TargetOpcode::G_INDEXED_STORE:
3183 return selectIndexedStore(cast<GIndexedStore>(I), MRI);
3185 case TargetOpcode::G_LSHR:
3186 case TargetOpcode::G_ASHR:
3187 if (MRI.getType(I.getOperand(0).getReg()).isVector())
3188 return selectVectorAshrLshr(I, MRI);
3189 [[fallthrough]];
3190 case TargetOpcode::G_SHL:
3191 if (Opcode == TargetOpcode::G_SHL &&
3192 MRI.getType(I.getOperand(0).getReg()).isVector())
3193 return selectVectorSHL(I, MRI);
3195     // These shifts were legalized to have 64-bit shift amounts because we
3196     // want to take advantage of the selection patterns that assume the
3197     // immediates are s64s. However, selectBinaryOp assumes both operands
3198     // have the same bit size.
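    // Illustrative sketch (not from the original source): for a 32-bit shift
    // with a 64-bit amount,
    //   %d:gpr(s32) = G_LSHR %x:gpr(s32), %amt:gpr(s64)
    // the amount is truncated with a sub_32 subregister COPY so that both
    // operands are 32 bits before the imported patterns run.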
3200 Register SrcReg = I.getOperand(1).getReg();
3201 Register ShiftReg = I.getOperand(2).getReg();
3202 const LLT ShiftTy = MRI.getType(ShiftReg);
3203 const LLT SrcTy = MRI.getType(SrcReg);
3204 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3205 ShiftTy.getSizeInBits() == 64) {
3206 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3207 // Insert a subregister copy to implement a 64->32 trunc
3208 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3209 .addReg(ShiftReg, 0, AArch64::sub_32);
3210 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3211 I.getOperand(2).setReg(Trunc.getReg(0));
3214 [[fallthrough]];
3215 case TargetOpcode::G_OR: {
3216 // Reject the various things we don't support yet.
3217 if (unsupportedBinOp(I, RBI, MRI, TRI))
3218 return false;
3220 const unsigned OpSize = Ty.getSizeInBits();
3222 const Register DefReg = I.getOperand(0).getReg();
3223 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3225 const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3226 if (NewOpc == I.getOpcode())
3227 return false;
3229 I.setDesc(TII.get(NewOpc));
3230 // FIXME: Should the type be always reset in setDesc?
3232 // Now that we selected an opcode, we need to constrain the register
3233 // operands to use appropriate classes.
3234 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3237 case TargetOpcode::G_PTR_ADD: {
3238 emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3239 I.eraseFromParent();
3240 return true;
3243 case TargetOpcode::G_SADDE:
3244 case TargetOpcode::G_UADDE:
3245 case TargetOpcode::G_SSUBE:
3246 case TargetOpcode::G_USUBE:
3247 case TargetOpcode::G_SADDO:
3248 case TargetOpcode::G_UADDO:
3249 case TargetOpcode::G_SSUBO:
3250 case TargetOpcode::G_USUBO:
3251 return selectOverflowOp(I, MRI);
3253 case TargetOpcode::G_PTRMASK: {
3254 Register MaskReg = I.getOperand(2).getReg();
3255 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3256 // TODO: Implement arbitrary cases
3257 if (!MaskVal || !isShiftedMask_64(*MaskVal))
3258 return false;
3260 uint64_t Mask = *MaskVal;
3261 I.setDesc(TII.get(AArch64::ANDXri));
3262 I.getOperand(2).ChangeToImmediate(
3263 AArch64_AM::encodeLogicalImmediate(Mask, 64));
3265 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3267 case TargetOpcode::G_PTRTOINT:
3268 case TargetOpcode::G_TRUNC: {
3269 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3270 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3272 const Register DstReg = I.getOperand(0).getReg();
3273 const Register SrcReg = I.getOperand(1).getReg();
3275 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3276 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3278 if (DstRB.getID() != SrcRB.getID()) {
3279 LLVM_DEBUG(
3280 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3281 return false;
3284 if (DstRB.getID() == AArch64::GPRRegBankID) {
3285 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3286 if (!DstRC)
3287 return false;
3289 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3290 if (!SrcRC)
3291 return false;
3293 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3294 !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3295 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3296 return false;
3299 if (DstRC == SrcRC) {
3300 // Nothing to be done
3301 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3302 SrcTy == LLT::scalar(64)) {
3303 llvm_unreachable("TableGen can import this case");
3304 return false;
3305 } else if (DstRC == &AArch64::GPR32RegClass &&
3306 SrcRC == &AArch64::GPR64RegClass) {
3307 I.getOperand(1).setSubReg(AArch64::sub_32);
3308 } else {
3309 LLVM_DEBUG(
3310 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3311 return false;
3314 I.setDesc(TII.get(TargetOpcode::COPY));
3315 return true;
3316 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3317 if (DstTy == LLT::fixed_vector(4, 16) &&
3318 SrcTy == LLT::fixed_vector(4, 32)) {
3319 I.setDesc(TII.get(AArch64::XTNv4i16));
3320 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3321 return true;
3324 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3325 MachineInstr *Extract = emitExtractVectorElt(
3326 DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3327 if (!Extract)
3328 return false;
3329 I.eraseFromParent();
3330 return true;
3333 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3334 if (Opcode == TargetOpcode::G_PTRTOINT) {
3335 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3336 I.setDesc(TII.get(TargetOpcode::COPY));
3337 return selectCopy(I, TII, MRI, TRI, RBI);
3341 return false;
3344 case TargetOpcode::G_ANYEXT: {
3345 if (selectUSMovFromExtend(I, MRI))
3346 return true;
3348 const Register DstReg = I.getOperand(0).getReg();
3349 const Register SrcReg = I.getOperand(1).getReg();
3351 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3352 if (RBDst.getID() != AArch64::GPRRegBankID) {
3353 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3354 << ", expected: GPR\n");
3355 return false;
3358 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3359 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3360 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3361 << ", expected: GPR\n");
3362 return false;
3365 const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3367 if (DstSize == 0) {
3368 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3369 return false;
3372 if (DstSize != 64 && DstSize > 32) {
3373 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3374 << ", expected: 32 or 64\n");
3375 return false;
3377     // At this point G_ANYEXT is just like a plain COPY, but we need
3378     // to explicitly form the 64-bit value if one is required.
3379 if (DstSize > 32) {
3380 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3381 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3382 .addDef(ExtSrc)
3383 .addImm(0)
3384 .addUse(SrcReg)
3385 .addImm(AArch64::sub_32);
3386 I.getOperand(1).setReg(ExtSrc);
3388 return selectCopy(I, TII, MRI, TRI, RBI);
3391 case TargetOpcode::G_ZEXT:
3392 case TargetOpcode::G_SEXT_INREG:
3393 case TargetOpcode::G_SEXT: {
3394 if (selectUSMovFromExtend(I, MRI))
3395 return true;
3397 unsigned Opcode = I.getOpcode();
3398 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3399 const Register DefReg = I.getOperand(0).getReg();
3400 Register SrcReg = I.getOperand(1).getReg();
3401 const LLT DstTy = MRI.getType(DefReg);
3402 const LLT SrcTy = MRI.getType(SrcReg);
3403 unsigned DstSize = DstTy.getSizeInBits();
3404 unsigned SrcSize = SrcTy.getSizeInBits();
3406 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3407 // extended is encoded in the imm.
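    // Illustrative sketch (not from the original source):
    //   %d:gpr(s64) = G_SEXT_INREG %x:gpr(s64), 8
    // is selected below as a bitfield move:
    //   %d = SBFMXri %x, 0, 7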
3408 if (Opcode == TargetOpcode::G_SEXT_INREG)
3409 SrcSize = I.getOperand(2).getImm();
3411 if (DstTy.isVector())
3412 return false; // Should be handled by imported patterns.
3414 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3415 AArch64::GPRRegBankID &&
3416 "Unexpected ext regbank");
3418 MachineInstr *ExtI;
3420     // First check whether we're extending the result of a load with a dest type
3421     // smaller than 32 bits; if so, this zext is redundant. GPR32 is the smallest
3422     // GPR register on AArch64, and all smaller loads automatically
3423     // zero-extend the upper bits. E.g.
3424     // %v(s8) = G_LOAD %p :: (load 1)
3425 // %v2(s32) = G_ZEXT %v(s8)
3426 if (!IsSigned) {
3427 auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3428 bool IsGPR =
3429 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3430 if (LoadMI && IsGPR) {
3431 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3432 unsigned BytesLoaded = MemOp->getSize().getValue();
3433 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3434 return selectCopy(I, TII, MRI, TRI, RBI);
3437 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3438 // + SUBREG_TO_REG.
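      // Illustrative sketch (not from the original source):
      //   %tmp:gpr32 = ORRWrs $wzr, %src, 0   // a plain 32-bit mov
      //   %def:gpr64 = SUBREG_TO_REG 0, %tmp, %subreg.sub_32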
3439 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3440 Register SubregToRegSrc =
3441 MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3442 const Register ZReg = AArch64::WZR;
3443 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3444 .addImm(0);
3446 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3447 .addImm(0)
3448 .addUse(SubregToRegSrc)
3449 .addImm(AArch64::sub_32);
3451 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3452 MRI)) {
3453 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3454 return false;
3457 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3458 MRI)) {
3459 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3460 return false;
3463 I.eraseFromParent();
3464 return true;
3468 if (DstSize == 64) {
3469 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3470 // FIXME: Can we avoid manually doing this?
3471 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3472 MRI)) {
3473 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3474 << " operand\n");
3475 return false;
3477 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3478 {&AArch64::GPR64RegClass}, {})
3479 .addImm(0)
3480 .addUse(SrcReg)
3481 .addImm(AArch64::sub_32)
3482 .getReg(0);
3485 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3486 {DefReg}, {SrcReg})
3487 .addImm(0)
3488 .addImm(SrcSize - 1);
3489 } else if (DstSize <= 32) {
3490 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3491 {DefReg}, {SrcReg})
3492 .addImm(0)
3493 .addImm(SrcSize - 1);
3494 } else {
3495 return false;
3498 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3499 I.eraseFromParent();
3500 return true;
3503 case TargetOpcode::G_SITOFP:
3504 case TargetOpcode::G_UITOFP:
3505 case TargetOpcode::G_FPTOSI:
3506 case TargetOpcode::G_FPTOUI: {
3507 const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3508 SrcTy = MRI.getType(I.getOperand(1).getReg());
3509 const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3510 if (NewOpc == Opcode)
3511 return false;
3513 I.setDesc(TII.get(NewOpc));
3514 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3515 I.setFlags(MachineInstr::NoFPExcept);
3517 return true;
3520 case TargetOpcode::G_FREEZE:
3521 return selectCopy(I, TII, MRI, TRI, RBI);
3523 case TargetOpcode::G_INTTOPTR:
3524 // The importer is currently unable to import pointer types since they
3525 // didn't exist in SelectionDAG.
3526 return selectCopy(I, TII, MRI, TRI, RBI);
3528 case TargetOpcode::G_BITCAST:
3529 // Imported SelectionDAG rules can handle every bitcast except those that
3530 // bitcast from a type to the same type. Ideally, these shouldn't occur
3531 // but we might not run an optimizer that deletes them. The other exception
3532 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3533 // of them.
3534 return selectCopy(I, TII, MRI, TRI, RBI);
3536 case TargetOpcode::G_SELECT: {
3537 auto &Sel = cast<GSelect>(I);
3538 const Register CondReg = Sel.getCondReg();
3539 const Register TReg = Sel.getTrueReg();
3540 const Register FReg = Sel.getFalseReg();
3542 if (tryOptSelect(Sel))
3543 return true;
3545 // Make sure to use an unused vreg instead of wzr, so that the peephole
3546 // optimizations will be able to optimize these.
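    // Illustrative sketch (not from the original source): the condition bit is
    // tested into a fresh (dead) vreg rather than into $wzr, e.g.
    //   %dead:gpr32 = ANDSWri %cond, <logical-imm 1>, implicit-def $nzcv
    // and the select itself is then emitted as a CSEL on the NE condition.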
3547 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3548 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3549 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3550 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3551 if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3552 return false;
3553 Sel.eraseFromParent();
3554 return true;
3556 case TargetOpcode::G_ICMP: {
3557 if (Ty.isVector())
3558 return false;
3560 if (Ty != LLT::scalar(32)) {
3561 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3562 << ", expected: " << LLT::scalar(32) << '\n');
3563 return false;
3566 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3567 const AArch64CC::CondCode InvCC =
3568 changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3569 emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3570 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3571 /*Src2=*/AArch64::WZR, InvCC, MIB);
3572 I.eraseFromParent();
3573 return true;
3576 case TargetOpcode::G_FCMP: {
3577 CmpInst::Predicate Pred =
3578 static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3579 if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3580 Pred) ||
3581 !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3582 return false;
3583 I.eraseFromParent();
3584 return true;
3586 case TargetOpcode::G_VASTART:
3587 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3588 : selectVaStartAAPCS(I, MF, MRI);
3589 case TargetOpcode::G_INTRINSIC:
3590 return selectIntrinsic(I, MRI);
3591 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3592 return selectIntrinsicWithSideEffects(I, MRI);
3593 case TargetOpcode::G_IMPLICIT_DEF: {
3594 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3595 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3596 const Register DstReg = I.getOperand(0).getReg();
3597 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3598 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3599 RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3600 return true;
3602 case TargetOpcode::G_BLOCK_ADDR: {
3603 Function *BAFn = I.getOperand(1).getBlockAddress()->getFunction();
3604 if (std::optional<uint16_t> BADisc =
3605 STI.getPtrAuthBlockAddressDiscriminatorIfEnabled(*BAFn)) {
3606 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
3607 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
3608 MIB.buildInstr(AArch64::MOVaddrPAC)
3609 .addBlockAddress(I.getOperand(1).getBlockAddress())
3610 .addImm(AArch64PACKey::IA)
3611 .addReg(/*AddrDisc=*/AArch64::XZR)
3612 .addImm(*BADisc)
3613 .constrainAllUses(TII, TRI, RBI);
3614 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X16));
3615 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
3616 AArch64::GPR64RegClass, MRI);
3617 I.eraseFromParent();
3618 return true;
3620 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3621 materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3622 I.eraseFromParent();
3623 return true;
3624 } else {
3625 I.setDesc(TII.get(AArch64::MOVaddrBA));
3626 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3627 I.getOperand(0).getReg())
3628 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3629 /* Offset */ 0, AArch64II::MO_PAGE)
3630 .addBlockAddress(
3631 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3632 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3633 I.eraseFromParent();
3634 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3637 case AArch64::G_DUP: {
3638     // When the scalar of G_DUP is an s8/s16 gpr, it can't be selected by the
3639     // imported patterns, so do it manually here. Avoiding the s16 gpr in the
3640     // first place is difficult because at RegBankSelect (RBS) we may end up
3641     // pessimizing the fpr case if we decide to add an anyextend to fix this.
3642     // Manual selection is the most robust solution for now.
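    // Illustrative sketch (not from the original source):
    //   %v:fpr(<8 x s8>) = G_DUP %s:gpr(s8)
    // is selected as DUPv8i8gpr, which reads the scalar from a W register.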
3643 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3644 AArch64::GPRRegBankID)
3645 return false; // We expect the fpr regbank case to be imported.
3646 LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3647 if (VecTy == LLT::fixed_vector(8, 8))
3648 I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3649 else if (VecTy == LLT::fixed_vector(16, 8))
3650 I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3651 else if (VecTy == LLT::fixed_vector(4, 16))
3652 I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3653 else if (VecTy == LLT::fixed_vector(8, 16))
3654 I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3655 else
3656 return false;
3657 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3659 case TargetOpcode::G_BUILD_VECTOR:
3660 return selectBuildVector(I, MRI);
3661 case TargetOpcode::G_MERGE_VALUES:
3662 return selectMergeValues(I, MRI);
3663 case TargetOpcode::G_UNMERGE_VALUES:
3664 return selectUnmergeValues(I, MRI);
3665 case TargetOpcode::G_SHUFFLE_VECTOR:
3666 return selectShuffleVector(I, MRI);
3667 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3668 return selectExtractElt(I, MRI);
3669 case TargetOpcode::G_CONCAT_VECTORS:
3670 return selectConcatVectors(I, MRI);
3671 case TargetOpcode::G_JUMP_TABLE:
3672 return selectJumpTable(I, MRI);
3673 case TargetOpcode::G_MEMCPY:
3674 case TargetOpcode::G_MEMCPY_INLINE:
3675 case TargetOpcode::G_MEMMOVE:
3676 case TargetOpcode::G_MEMSET:
3677 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3678 return selectMOPS(I, MRI);
3681 return false;
3684 bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3685 MachineIRBuilderState OldMIBState = MIB.getState();
3686 bool Success = select(I);
3687 MIB.setState(OldMIBState);
3688 return Success;
3691 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3692 MachineRegisterInfo &MRI) {
3693 unsigned Mopcode;
3694 switch (GI.getOpcode()) {
3695 case TargetOpcode::G_MEMCPY:
3696 case TargetOpcode::G_MEMCPY_INLINE:
3697 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3698 break;
3699 case TargetOpcode::G_MEMMOVE:
3700 Mopcode = AArch64::MOPSMemoryMovePseudo;
3701 break;
3702 case TargetOpcode::G_MEMSET:
3703 // For tagged memset see llvm.aarch64.mops.memset.tag
3704 Mopcode = AArch64::MOPSMemorySetPseudo;
3705 break;
3708 auto &DstPtr = GI.getOperand(0);
3709 auto &SrcOrVal = GI.getOperand(1);
3710 auto &Size = GI.getOperand(2);
3712 // Create copies of the registers that can be clobbered.
3713 const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3714 const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3715 const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3717 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3718 const auto &SrcValRegClass =
3719 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3721 // Constrain to specific registers
3722 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3723 RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3724 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3726 MIB.buildCopy(DstPtrCopy, DstPtr);
3727 MIB.buildCopy(SrcValCopy, SrcOrVal);
3728 MIB.buildCopy(SizeCopy, Size);
3730   // The new instruction uses the copied registers because it must update them.
3731   // The defs are not used since they don't exist in G_MEM*; they are still
3732   // tied.
3733 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3734 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3735 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3736 if (IsSet) {
3737 MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3738 {DstPtrCopy, SizeCopy, SrcValCopy});
3739 } else {
3740 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3741 MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3742 {DstPtrCopy, SrcValCopy, SizeCopy});
3745 GI.eraseFromParent();
3746 return true;
3749 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3750 MachineRegisterInfo &MRI) {
3751 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3752 Register JTAddr = I.getOperand(0).getReg();
3753 unsigned JTI = I.getOperand(1).getIndex();
3754 Register Index = I.getOperand(2).getReg();
3756 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3758 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
3759 // sequence later, to guarantee the integrity of the intermediate values.
3760 if (MF->getFunction().hasFnAttribute("aarch64-jump-table-hardening")) {
3761 CodeModel::Model CM = TM.getCodeModel();
3762 if (STI.isTargetMachO()) {
3763 if (CM != CodeModel::Small && CM != CodeModel::Large)
3764 report_fatal_error("Unsupported code-model for hardened jump-table");
3765 } else {
3766 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
3767 assert(STI.isTargetELF() &&
3768 "jump table hardening only supported on MachO/ELF");
3769 if (CM != CodeModel::Small)
3770 report_fatal_error("Unsupported code-model for hardened jump-table");
3773 MIB.buildCopy({AArch64::X16}, I.getOperand(2).getReg());
3774 MIB.buildInstr(AArch64::BR_JumpTable)
3775 .addJumpTableIndex(I.getOperand(1).getIndex());
3776 I.eraseFromParent();
3777 return true;
3780 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3781 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3783 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3784 {TargetReg, ScratchReg}, {JTAddr, Index})
3785 .addJumpTableIndex(JTI);
3786 // Save the jump table info.
3787 MIB.buildInstr(TargetOpcode::JUMP_TABLE_DEBUG_INFO, {},
3788 {static_cast<int64_t>(JTI)});
3789 // Build the indirect branch.
3790 MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3791 I.eraseFromParent();
3792 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3795 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3796 MachineRegisterInfo &MRI) {
3797 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3798 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3800 Register DstReg = I.getOperand(0).getReg();
3801 unsigned JTI = I.getOperand(1).getIndex();
3802 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
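  // Illustrative sketch (not from the original source): MOVaddrJT later
  // expands to roughly
  //   adrp xN, .LJTI0_0
  //   add  xN, xN, :lo12:.LJTI0_0
  // where .LJTI0_0 stands for the jump table's label (the name here is just an
  // example).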
3803 auto MovMI =
3804 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3805 .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3806 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3807 I.eraseFromParent();
3808 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3811 bool AArch64InstructionSelector::selectTLSGlobalValue(
3812 MachineInstr &I, MachineRegisterInfo &MRI) {
3813 if (!STI.isTargetMachO())
3814 return false;
3815 MachineFunction &MF = *I.getParent()->getParent();
3816 MF.getFrameInfo().setAdjustsStack(true);
3818 const auto &GlobalOp = I.getOperand(1);
3819 assert(GlobalOp.getOffset() == 0 &&
3820 "Shouldn't have an offset on TLS globals!");
3821 const GlobalValue &GV = *GlobalOp.getGlobal();
3823 auto LoadGOT =
3824 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3825 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3827 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3828 {LoadGOT.getReg(0)})
3829 .addImm(0);
3831 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3832 // TLS calls preserve all registers except those that absolutely must be
3833 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3834 // silly).
3835 unsigned Opcode = getBLRCallOpcode(MF);
3837 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
3838 if (MF.getFunction().hasFnAttribute("ptrauth-calls")) {
3839 assert(Opcode == AArch64::BLR);
3840 Opcode = AArch64::BLRAAZ;
3843 MIB.buildInstr(Opcode, {}, {Load})
3844 .addUse(AArch64::X0, RegState::Implicit)
3845 .addDef(AArch64::X0, RegState::Implicit)
3846 .addRegMask(TRI.getTLSCallPreservedMask());
3848 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3849 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3850 MRI);
3851 I.eraseFromParent();
3852 return true;
3855 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3856 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3857 MachineIRBuilder &MIRBuilder) const {
3858 auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3860 auto BuildFn = [&](unsigned SubregIndex) {
3861 auto Ins =
3862 MIRBuilder
3863 .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3864 .addImm(SubregIndex);
3865 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3866 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3867 return &*Ins;
3870 switch (EltSize) {
3871 case 8:
3872 return BuildFn(AArch64::bsub);
3873 case 16:
3874 return BuildFn(AArch64::hsub);
3875 case 32:
3876 return BuildFn(AArch64::ssub);
3877 case 64:
3878 return BuildFn(AArch64::dsub);
3879 default:
3880 return nullptr;
3884 MachineInstr *
3885 AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3886 MachineIRBuilder &MIB,
3887 MachineRegisterInfo &MRI) const {
3888 LLT DstTy = MRI.getType(DstReg);
3889 const TargetRegisterClass *RC =
3890 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3891 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3892 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3893 return nullptr;
3895 unsigned SubReg = 0;
3896 if (!getSubRegForClass(RC, TRI, SubReg))
3897 return nullptr;
3898 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3899     LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3900                       << DstTy.getSizeInBits() << ")\n");
3901 return nullptr;
3903 auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3904 .addReg(SrcReg, 0, SubReg);
3905 RBI.constrainGenericRegister(DstReg, *RC, MRI);
3906 return Copy;
3909 bool AArch64InstructionSelector::selectMergeValues(
3910 MachineInstr &I, MachineRegisterInfo &MRI) {
3911 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3912 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3913 const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3914 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3915 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3917 if (I.getNumOperands() != 3)
3918 return false;
3920 // Merging 2 s64s into an s128.
3921 if (DstTy == LLT::scalar(128)) {
3922 if (SrcTy.getSizeInBits() != 64)
3923 return false;
3924 Register DstReg = I.getOperand(0).getReg();
3925 Register Src1Reg = I.getOperand(1).getReg();
3926 Register Src2Reg = I.getOperand(2).getReg();
3927 auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3928 MachineInstr *InsMI = emitLaneInsert(std::nullopt, Tmp.getReg(0), Src1Reg,
3929 /* LaneIdx */ 0, RB, MIB);
3930 if (!InsMI)
3931 return false;
3932 MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3933 Src2Reg, /* LaneIdx */ 1, RB, MIB);
3934 if (!Ins2MI)
3935 return false;
3936 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3937 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3938 I.eraseFromParent();
3939 return true;
3942 if (RB.getID() != AArch64::GPRRegBankID)
3943 return false;
3945 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3946 return false;
3948 auto *DstRC = &AArch64::GPR64RegClass;
3949 Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3950 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3951 TII.get(TargetOpcode::SUBREG_TO_REG))
3952 .addDef(SubToRegDef)
3953 .addImm(0)
3954 .addUse(I.getOperand(1).getReg())
3955 .addImm(AArch64::sub_32);
3956 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3957 // Need to anyext the second scalar before we can use bfm
3958 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3959 TII.get(TargetOpcode::SUBREG_TO_REG))
3960 .addDef(SubToRegDef2)
3961 .addImm(0)
3962 .addUse(I.getOperand(2).getReg())
3963 .addImm(AArch64::sub_32);
3964 MachineInstr &BFM =
3965 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3966 .addDef(I.getOperand(0).getReg())
3967 .addUse(SubToRegDef)
3968 .addUse(SubToRegDef2)
3969 .addImm(32)
3970 .addImm(31);
3971 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3972 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3973 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3974 I.eraseFromParent();
3975 return true;
3978 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3979 const unsigned EltSize) {
3980 // Choose a lane copy opcode and subregister based off of the size of the
3981 // vector's elements.
3982 switch (EltSize) {
3983 case 8:
3984 CopyOpc = AArch64::DUPi8;
3985 ExtractSubReg = AArch64::bsub;
3986 break;
3987 case 16:
3988 CopyOpc = AArch64::DUPi16;
3989 ExtractSubReg = AArch64::hsub;
3990 break;
3991 case 32:
3992 CopyOpc = AArch64::DUPi32;
3993 ExtractSubReg = AArch64::ssub;
3994 break;
3995 case 64:
3996 CopyOpc = AArch64::DUPi64;
3997 ExtractSubReg = AArch64::dsub;
3998 break;
3999 default:
4000 // Unknown size, bail out.
4001 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4002 return false;
4004 return true;
4007 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4008 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4009 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4010 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4011 unsigned CopyOpc = 0;
4012 unsigned ExtractSubReg = 0;
4013 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4014 LLVM_DEBUG(
4015 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4016 return nullptr;
4019 const TargetRegisterClass *DstRC =
4020 getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4021 if (!DstRC) {
4022 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4023 return nullptr;
4026 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4027 const LLT &VecTy = MRI.getType(VecReg);
4028 const TargetRegisterClass *VecRC =
4029 getRegClassForTypeOnBank(VecTy, VecRB, true);
4030 if (!VecRC) {
4031 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4032 return nullptr;
4035 // The register that we're going to copy into.
4036 Register InsertReg = VecReg;
4037 if (!DstReg)
4038 DstReg = MRI.createVirtualRegister(DstRC);
4039 // If the lane index is 0, we just use a subregister COPY.
4040 if (LaneIdx == 0) {
4041 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4042 .addReg(VecReg, 0, ExtractSubReg);
4043 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4044 return &*Copy;
4047 // Lane copies require 128-bit wide registers. If we're dealing with an
4048 // unpacked vector, then we need to move up to that width. Insert an implicit
4049 // def and a subregister insert to get us there.
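  // Illustrative sketch (not from the original source): extracting lane 1 of
  // an unpacked 64-bit vector first widens it to 128 bits, roughly
  //   %undef:fpr128 = IMPLICIT_DEF
  //   %wide:fpr128 = INSERT_SUBREG %undef, %vec:fpr64, %subreg.dsub
  //   %dst:fpr32 = DUPi32 %wide, 1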
4050 if (VecTy.getSizeInBits() != 128) {
4051 MachineInstr *ScalarToVector = emitScalarToVector(
4052 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4053 if (!ScalarToVector)
4054 return nullptr;
4055 InsertReg = ScalarToVector->getOperand(0).getReg();
4058 MachineInstr *LaneCopyMI =
4059 MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4060 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4062 // Make sure that we actually constrain the initial copy.
4063 RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4064 return LaneCopyMI;
4067 bool AArch64InstructionSelector::selectExtractElt(
4068 MachineInstr &I, MachineRegisterInfo &MRI) {
4069 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4070 "unexpected opcode!");
4071 Register DstReg = I.getOperand(0).getReg();
4072 const LLT NarrowTy = MRI.getType(DstReg);
4073 const Register SrcReg = I.getOperand(1).getReg();
4074 const LLT WideTy = MRI.getType(SrcReg);
4075 (void)WideTy;
4076 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4077 "source register size too small!");
4078 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4080 // Need the lane index to determine the correct copy opcode.
4081 MachineOperand &LaneIdxOp = I.getOperand(2);
4082 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4084 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4085 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4086 return false;
4089 // Find the index to extract from.
4090 auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4091 if (!VRegAndVal)
4092 return false;
4093 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4096 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4097 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4098 LaneIdx, MIB);
4099 if (!Extract)
4100 return false;
4102 I.eraseFromParent();
4103 return true;
4106 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4107 MachineInstr &I, MachineRegisterInfo &MRI) {
4108 unsigned NumElts = I.getNumOperands() - 1;
4109 Register SrcReg = I.getOperand(NumElts).getReg();
4110 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4111 const LLT SrcTy = MRI.getType(SrcReg);
4113 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4114 if (SrcTy.getSizeInBits() > 128) {
4115 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4116 return false;
4119 // We implement a split vector operation by treating the sub-vectors as
4120 // scalars and extracting them.
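  // Illustrative sketch (not from the original source): unmerging
  //   %lo:fpr(<2 x s32>), %hi:fpr(<2 x s32>) = G_UNMERGE_VALUES %v:fpr(<4 x s32>)
  // extracts each half as a 64-bit "element": a dsub subregister COPY for
  // index 0 and a DUPi64 lane copy for index 1.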
4121 const RegisterBank &DstRB =
4122 *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4123 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4124 Register Dst = I.getOperand(OpIdx).getReg();
4125 MachineInstr *Extract =
4126 emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4127 if (!Extract)
4128 return false;
4130 I.eraseFromParent();
4131 return true;
4134 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4135 MachineRegisterInfo &MRI) {
4136 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4137 "unexpected opcode");
4139 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4140 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4141 AArch64::FPRRegBankID ||
4142 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4143 AArch64::FPRRegBankID) {
4144 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4145 "currently unsupported.\n");
4146 return false;
4149 // The last operand is the vector source register, and every other operand is
4150 // a register to unpack into.
4151 unsigned NumElts = I.getNumOperands() - 1;
4152 Register SrcReg = I.getOperand(NumElts).getReg();
4153 const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4154 const LLT WideTy = MRI.getType(SrcReg);
4155 (void)WideTy;
4156 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4157 "can only unmerge from vector or s128 types!");
4158 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4159 "source register size too small!");
4161 if (!NarrowTy.isScalar())
4162 return selectSplitVectorUnmerge(I, MRI);
4164 // Choose a lane copy opcode and subregister based off of the size of the
4165 // vector's elements.
4166 unsigned CopyOpc = 0;
4167 unsigned ExtractSubReg = 0;
4168 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4169 return false;
4171 // Set up for the lane copies.
4172 MachineBasicBlock &MBB = *I.getParent();
4174 // Stores the registers we'll be copying from.
4175 SmallVector<Register, 4> InsertRegs;
4177 // We'll use the first register twice, so we only need NumElts-1 registers.
4178 unsigned NumInsertRegs = NumElts - 1;
4180 // If our elements fit into exactly 128 bits, then we can copy from the source
4181 // directly. Otherwise, we need to do a bit of setup with some subregister
4182 // inserts.
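  // Illustrative sketch (not from the original source): unmerging a 128-bit
  // <4 x s32> can read lanes from the source register directly, whereas a
  // 64-bit <2 x s32> source is first widened via IMPLICIT_DEF + INSERT_SUBREG
  // so the DUPi* lane copies see a 128-bit register.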
4183 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4184 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4185 } else {
4186 // No. We have to perform subregister inserts. For each insert, create an
4187 // implicit def and a subregister insert, and save the register we create.
4188 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4189 LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4190 *RBI.getRegBank(SrcReg, MRI, TRI));
4191 unsigned SubReg = 0;
4192 bool Found = getSubRegForClass(RC, TRI, SubReg);
4193 (void)Found;
4194     assert(Found && "expected to find last operand's subreg idx");
4195 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4196 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4197 MachineInstr &ImpDefMI =
4198 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4199 ImpDefReg);
4201 // Now, create the subregister insert from SrcReg.
4202 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4203 MachineInstr &InsMI =
4204 *BuildMI(MBB, I, I.getDebugLoc(),
4205 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4206 .addUse(ImpDefReg)
4207 .addUse(SrcReg)
4208 .addImm(SubReg);
4210 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4211 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4213 // Save the register so that we can copy from it after.
4214 InsertRegs.push_back(InsertReg);
4218 // Now that we've created any necessary subregister inserts, we can
4219 // create the copies.
4221 // Perform the first copy separately as a subregister copy.
4222 Register CopyTo = I.getOperand(0).getReg();
4223 auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4224 .addReg(InsertRegs[0], 0, ExtractSubReg);
4225 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4227 // Now, perform the remaining copies as vector lane copies.
4228 unsigned LaneIdx = 1;
4229 for (Register InsReg : InsertRegs) {
4230 Register CopyTo = I.getOperand(LaneIdx).getReg();
4231 MachineInstr &CopyInst =
4232 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4233 .addUse(InsReg)
4234 .addImm(LaneIdx);
4235 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4236 ++LaneIdx;
4239 // Separately constrain the first copy's destination. Because of the
4240 // limitation in constrainOperandRegClass, we can't guarantee that this will
4241 // actually be constrained. So, do it ourselves using the second operand.
4242 const TargetRegisterClass *RC =
4243 MRI.getRegClassOrNull(I.getOperand(1).getReg());
4244 if (!RC) {
4245 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4246 return false;
4249 RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4250 I.eraseFromParent();
4251 return true;
4254 bool AArch64InstructionSelector::selectConcatVectors(
4255 MachineInstr &I, MachineRegisterInfo &MRI) {
4256 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4257 "Unexpected opcode");
4258 Register Dst = I.getOperand(0).getReg();
4259 Register Op1 = I.getOperand(1).getReg();
4260 Register Op2 = I.getOperand(2).getReg();
4261 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4262 if (!ConcatMI)
4263 return false;
4264 I.eraseFromParent();
4265 return true;
4268 unsigned
4269 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4270 MachineFunction &MF) const {
4271 Type *CPTy = CPVal->getType();
4272 Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4274 MachineConstantPool *MCP = MF.getConstantPool();
4275 return MCP->getConstantPoolIndex(CPVal, Alignment);
4278 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4279 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4280 const TargetRegisterClass *RC;
4281 unsigned Opc;
4282 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4283 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4284 switch (Size) {
4285 case 16:
4286 RC = &AArch64::FPR128RegClass;
4287 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4288 break;
4289 case 8:
4290 RC = &AArch64::FPR64RegClass;
4291 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4292 break;
4293 case 4:
4294 RC = &AArch64::FPR32RegClass;
4295 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4296 break;
4297 case 2:
4298 RC = &AArch64::FPR16RegClass;
4299 Opc = AArch64::LDRHui;
4300 break;
4301 default:
4302 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4303 << *CPVal->getType());
4304 return nullptr;
4307 MachineInstr *LoadMI = nullptr;
4308 auto &MF = MIRBuilder.getMF();
4309 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4310 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4311 // Use load(literal) for tiny code model.
4312 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {}).addConstantPoolIndex(CPIdx);
4313 } else {
4314 auto Adrp =
4315 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4316 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4318 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4319 .addConstantPoolIndex(
4320 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4322 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4325 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4326 LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4327 MachineMemOperand::MOLoad,
4328 Size, Align(Size)));
4329 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4330 return LoadMI;
4333 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4334 /// size and RB.
4335 static std::pair<unsigned, unsigned>
4336 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4337 unsigned Opc, SubregIdx;
4338 if (RB.getID() == AArch64::GPRRegBankID) {
4339 if (EltSize == 8) {
4340 Opc = AArch64::INSvi8gpr;
4341 SubregIdx = AArch64::bsub;
4342 } else if (EltSize == 16) {
4343 Opc = AArch64::INSvi16gpr;
4344 SubregIdx = AArch64::ssub;
4345 } else if (EltSize == 32) {
4346 Opc = AArch64::INSvi32gpr;
4347 SubregIdx = AArch64::ssub;
4348 } else if (EltSize == 64) {
4349 Opc = AArch64::INSvi64gpr;
4350 SubregIdx = AArch64::dsub;
4351 } else {
4352 llvm_unreachable("invalid elt size!");
4354 } else {
4355 if (EltSize == 8) {
4356 Opc = AArch64::INSvi8lane;
4357 SubregIdx = AArch64::bsub;
4358 } else if (EltSize == 16) {
4359 Opc = AArch64::INSvi16lane;
4360 SubregIdx = AArch64::hsub;
4361 } else if (EltSize == 32) {
4362 Opc = AArch64::INSvi32lane;
4363 SubregIdx = AArch64::ssub;
4364 } else if (EltSize == 64) {
4365 Opc = AArch64::INSvi64lane;
4366 SubregIdx = AArch64::dsub;
4367 } else {
4368 llvm_unreachable("invalid elt size!");
4371 return std::make_pair(Opc, SubregIdx);
4374 MachineInstr *AArch64InstructionSelector::emitInstr(
4375 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4376 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4377 const ComplexRendererFns &RenderFns) const {
4378 assert(Opcode && "Expected an opcode?");
4379 assert(!isPreISelGenericOpcode(Opcode) &&
4380 "Function should only be used to produce selected instructions!");
4381 auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4382 if (RenderFns)
4383 for (auto &Fn : *RenderFns)
4384 Fn(MI);
4385 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4386 return &*MI;
4389 MachineInstr *AArch64InstructionSelector::emitAddSub(
4390 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4391 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4392 MachineIRBuilder &MIRBuilder) const {
4393 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4394 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4395 auto Ty = MRI.getType(LHS.getReg());
4396 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4397 unsigned Size = Ty.getSizeInBits();
4398 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4399 bool Is32Bit = Size == 32;
4401 // INSTRri form with positive arithmetic immediate.
4402 if (auto Fns = selectArithImmed(RHS))
4403 return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4404 MIRBuilder, Fns);
4406 // INSTRri form with negative arithmetic immediate.
4407 if (auto Fns = selectNegArithImmed(RHS))
4408 return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4409 MIRBuilder, Fns);
4411 // INSTRrx form.
4412 if (auto Fns = selectArithExtendedRegister(RHS))
4413 return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4414 MIRBuilder, Fns);
4416 // INSTRrs form.
4417 if (auto Fns = selectShiftedRegister(RHS))
4418 return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4419 MIRBuilder, Fns);
4420 return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4421 MIRBuilder);
4424 MachineInstr *
4425 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4426 MachineOperand &RHS,
4427 MachineIRBuilder &MIRBuilder) const {
4428 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4429 {{AArch64::ADDXri, AArch64::ADDWri},
4430 {AArch64::ADDXrs, AArch64::ADDWrs},
4431 {AArch64::ADDXrr, AArch64::ADDWrr},
4432 {AArch64::SUBXri, AArch64::SUBWri},
4433 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4434 return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4437 MachineInstr *
4438 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4439 MachineOperand &RHS,
4440 MachineIRBuilder &MIRBuilder) const {
4441 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4442 {{AArch64::ADDSXri, AArch64::ADDSWri},
4443 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4444 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4445 {AArch64::SUBSXri, AArch64::SUBSWri},
4446 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4447 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4450 MachineInstr *
4451 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4452 MachineOperand &RHS,
4453 MachineIRBuilder &MIRBuilder) const {
4454 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4455 {{AArch64::SUBSXri, AArch64::SUBSWri},
4456 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4457 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4458 {AArch64::ADDSXri, AArch64::ADDSWri},
4459 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4460 return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4463 MachineInstr *
4464 AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4465 MachineOperand &RHS,
4466 MachineIRBuilder &MIRBuilder) const {
4467 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4468 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4469 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4470 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4471 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4474 MachineInstr *
4475 AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4476 MachineOperand &RHS,
4477 MachineIRBuilder &MIRBuilder) const {
4478 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4479 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4480 bool Is32Bit = (MRI->getType(LHS.getReg()).getSizeInBits() == 32);
4481 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4482 return emitInstr(OpcTable[Is32Bit], {Dst}, {LHS, RHS}, MIRBuilder);
4485 MachineInstr *
4486 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4487 MachineIRBuilder &MIRBuilder) const {
4488 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4489 bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4490 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4491 return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4494 MachineInstr *
4495 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4496 MachineIRBuilder &MIRBuilder) const {
4497 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4498 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4499 LLT Ty = MRI.getType(LHS.getReg());
4500 unsigned RegSize = Ty.getSizeInBits();
4501 bool Is32Bit = (RegSize == 32);
4502 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4503 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4504 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4505 // ANDS needs a logical immediate for its immediate form. Check if we can
4506 // fold one in.
4507 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4508 int64_t Imm = ValAndVReg->Value.getSExtValue();
4510 if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4511 auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4512 TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4513 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4514 return &*TstMI;
4518 if (auto Fns = selectLogicalShiftedRegister(RHS))
4519 return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4520 return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4523 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4524 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4525 MachineIRBuilder &MIRBuilder) const {
4526 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4527 assert(Predicate.isPredicate() && "Expected predicate?");
4528 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4529 LLT CmpTy = MRI.getType(LHS.getReg());
4530 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4531 unsigned Size = CmpTy.getSizeInBits();
4532 (void)Size;
4533 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4534 // Fold the compare into a cmn or tst if possible.
4535 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4536 return FoldCmp;
4537 auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4538 return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4541 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4542 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4543 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4544 #ifndef NDEBUG
4545 LLT Ty = MRI.getType(Dst);
4546 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4547 "Expected a 32-bit scalar register?");
4548 #endif
4549 const Register ZReg = AArch64::WZR;
4550 AArch64CC::CondCode CC1, CC2;
4551 changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4552 auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4553 if (CC2 == AArch64CC::AL)
4554 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4555 MIRBuilder);
4556 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4557 Register Def1Reg = MRI.createVirtualRegister(RC);
4558 Register Def2Reg = MRI.createVirtualRegister(RC);
4559 auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4560 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4561 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4562 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4563 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4564 return &*OrMI;
4567 MachineInstr *AArch64InstructionSelector::emitFPCompare(
4568 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4569 std::optional<CmpInst::Predicate> Pred) const {
4570 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4571 LLT Ty = MRI.getType(LHS);
4572 if (Ty.isVector())
4573 return nullptr;
4574 unsigned OpSize = Ty.getSizeInBits();
4575 assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4577 // If this is a compare against +0.0, then we don't have
4578 // to explicitly materialize a constant.
4579 const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4580 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4582 auto IsEqualityPred = [](CmpInst::Predicate P) {
4583 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4584 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4586 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4587 // Try commutating the operands.
4588 const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4589 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4590 ShouldUseImm = true;
4591 std::swap(LHS, RHS);
4594 unsigned CmpOpcTbl[2][3] = {
4595 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4596 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4597 unsigned CmpOpc =
4598 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4600 // Partially build the compare. Decide if we need to add a use for the
4601 // third operand based off whether or not we're comparing against 0.0.
4602 auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4603 CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4604 if (!ShouldUseImm)
4605 CmpMI.addUse(RHS);
4606 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4607 return &*CmpMI;
4610 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4611 std::optional<Register> Dst, Register Op1, Register Op2,
4612 MachineIRBuilder &MIRBuilder) const {
4613 // We implement a vector concat by:
4614 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4615 // 2. Insert the upper vector into the destination's upper element
4616 // TODO: some of this code is common with G_BUILD_VECTOR handling.
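  // Illustrative sketch (not from the original source): concatenating two
  // <2 x s32> values into <4 x s32> roughly becomes (operand order approximate)
  //   %w1:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op1, %subreg.dsub
  //   %w2:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op2, %subreg.dsub
  //   %dst:fpr128 = INSvi64lane %w1, 1, %w2, 0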
4617 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4619 const LLT Op1Ty = MRI.getType(Op1);
4620 const LLT Op2Ty = MRI.getType(Op2);
4622 if (Op1Ty != Op2Ty) {
4623 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4624 return nullptr;
4626 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4628 if (Op1Ty.getSizeInBits() >= 128) {
4629 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4630 return nullptr;
4633 // At the moment we just support 64 bit vector concats.
4634 if (Op1Ty.getSizeInBits() != 64) {
4635 LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4636 return nullptr;
4639 const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4640 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4641 const TargetRegisterClass *DstRC =
4642 getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4644 MachineInstr *WidenedOp1 =
4645 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4646 MachineInstr *WidenedOp2 =
4647 emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4648 if (!WidenedOp1 || !WidenedOp2) {
4649 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4650 return nullptr;
4653 // Now do the insert of the upper element.
4654 unsigned InsertOpc, InsSubRegIdx;
4655 std::tie(InsertOpc, InsSubRegIdx) =
4656 getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4658 if (!Dst)
4659 Dst = MRI.createVirtualRegister(DstRC);
4660 auto InsElt =
4661 MIRBuilder
4662 .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4663 .addImm(1) /* Lane index */
4664 .addUse(WidenedOp2->getOperand(0).getReg())
4665 .addImm(0);
4666 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4667 return &*InsElt;
4670 MachineInstr *
4671 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4672 Register Src2, AArch64CC::CondCode Pred,
4673 MachineIRBuilder &MIRBuilder) const {
4674 auto &MRI = *MIRBuilder.getMRI();
4675 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4676 // If we used a register class, then this won't necessarily have an LLT.
4677 // Compute the size based off whether or not we have a class or bank.
4678 unsigned Size;
4679 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
4680 Size = TRI.getRegSizeInBits(*RC);
4681 else
4682 Size = MRI.getType(Dst).getSizeInBits();
4683 // Some opcodes use s1.
4684 assert(Size <= 64 && "Expected 64 bits or less only!");
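// Anything narrower than 64 bits (including s1) selects the W-register form.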
4685 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4686 unsigned Opc = OpcTable[Size == 64];
4687 auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4688 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4689 return &*CSINC;
4692 MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4693 Register CarryReg) {
4694 MachineRegisterInfo *MRI = MIB.getMRI();
4695 unsigned Opcode = I.getOpcode();
4697 // If the instruction is a SUB, we need to negate the carry,
4698 // because borrowing is indicated by carry-flag == 0.
4699 bool NeedsNegatedCarry =
4700 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4702 // If the previous instruction will already produce the correct carry, do not
4703 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4704 // generated during legalization of wide add/sub. This optimization depends on
4705 // these sequences not being interrupted by other instructions.
4706 // We have to select the previous instruction before the carry-using
4707 // instruction is deleted by the calling function, otherwise the previous
4708 // instruction might become dead and would get deleted.
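// Concretely: the ADDS selected for a preceding G_UADDO already leaves the
// carry in NZCV, so a following G_UADDE can be selected as ADCS with no extra
// carry set-up.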
4709 MachineInstr *SrcMI = MRI->getVRegDef(CarryReg);
4710 if (SrcMI == I.getPrevNode()) {
4711 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(SrcMI)) {
4712 bool ProducesNegatedCarry = CarrySrcMI->isSub();
4713 if (NeedsNegatedCarry == ProducesNegatedCarry &&
4714 CarrySrcMI->isUnsigned() &&
4715 CarrySrcMI->getCarryOutReg() == CarryReg &&
4716 selectAndRestoreState(*SrcMI))
4717 return nullptr;
4721 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4723 if (NeedsNegatedCarry) {
4724 // (0 - Carry) sets !C in NZCV when Carry == 1
4725 Register ZReg = AArch64::WZR;
4726 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4729 // (Carry - 1) sets !C in NZCV when Carry == 0
4730 auto Fns = select12BitValueWithLeftShift(1);
4731 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4734 bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4735 MachineRegisterInfo &MRI) {
4736 auto &CarryMI = cast<GAddSubCarryOut>(I);
4738 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(&I)) {
4739 // Set NZCV carry according to carry-in VReg
4740 emitCarryIn(I, CarryInMI->getCarryInReg());
4743 // Emit the operation and get the correct condition code.
4744 auto OpAndCC = emitOverflowOp(I.getOpcode(), CarryMI.getDstReg(),
4745 CarryMI.getLHS(), CarryMI.getRHS(), MIB);
4747 Register CarryOutReg = CarryMI.getCarryOutReg();
4749 // Don't convert carry-out to VReg if it is never used
4750 if (!MRI.use_nodbg_empty(CarryOutReg)) {
4751 // Now, put the overflow result in the register given by the first operand
4752 // to the overflow op. CSINC increments the result when the predicate is
4753 // false, so to get the increment when it's true, we need to use the
4754 // inverse. In this case, we want to increment when carry is set.
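// E.g. for G_UADDO the overflow condition is HS, so we emit
// "csinc wCarryOut, wzr, wzr, lo", which produces 1 exactly when HS holds.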
4755 Register ZReg = AArch64::WZR;
4756 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4757 getInvertedCondCode(OpAndCC.second), MIB);
4760 I.eraseFromParent();
4761 return true;
4764 std::pair<MachineInstr *, AArch64CC::CondCode>
4765 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4766 MachineOperand &LHS,
4767 MachineOperand &RHS,
4768 MachineIRBuilder &MIRBuilder) const {
4769 switch (Opcode) {
4770 default:
4771 llvm_unreachable("Unexpected opcode!");
4772 case TargetOpcode::G_SADDO:
4773 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4774 case TargetOpcode::G_UADDO:
4775 return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4776 case TargetOpcode::G_SSUBO:
4777 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4778 case TargetOpcode::G_USUBO:
4779 return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4780 case TargetOpcode::G_SADDE:
4781 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4782 case TargetOpcode::G_UADDE:
4783 return std::make_pair(emitADCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4784 case TargetOpcode::G_SSUBE:
4785 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4786 case TargetOpcode::G_USUBE:
4787 return std::make_pair(emitSBCS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4791 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4792 /// expressed as a conjunction.
4793 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
4794 /// changing the conditions on the CMP tests.
4795 /// (this means we can call emitConjunctionRec() with
4796 /// Negate==true on this sub-tree)
4797 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
4798 /// cannot do the negation naturally. We are required to
4799 /// emit the subtree first in this case.
4800 /// \param WillNegate Is true if we are called when the result of this
4801 /// subexpression must be negated. This happens when the
4802 /// outer expression is an OR. We can use this fact to know
4803 /// that we have a double negation (or (or ...) ...) that
4804 /// can be implemented for free.
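/// For example, a G_AND of two G_ICMPs can be lowered to a compare followed
/// by a conditional compare (CCMP), with the final answer read out of NZCV;
/// this predicate checks whether \p Val is such a tree.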
4805 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4806 bool WillNegate, MachineRegisterInfo &MRI,
4807 unsigned Depth = 0) {
4808 if (!MRI.hasOneNonDBGUse(Val))
4809 return false;
4810 MachineInstr *ValDef = MRI.getVRegDef(Val);
4811 unsigned Opcode = ValDef->getOpcode();
4812 if (isa<GAnyCmp>(ValDef)) {
4813 CanNegate = true;
4814 MustBeFirst = false;
4815 return true;
4817 // Protect against exponential runtime and stack overflow.
4818 if (Depth > 6)
4819 return false;
4820 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4821 bool IsOR = Opcode == TargetOpcode::G_OR;
4822 Register O0 = ValDef->getOperand(1).getReg();
4823 Register O1 = ValDef->getOperand(2).getReg();
4824 bool CanNegateL;
4825 bool MustBeFirstL;
4826 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4827 return false;
4828 bool CanNegateR;
4829 bool MustBeFirstR;
4830 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4831 return false;
4833 if (MustBeFirstL && MustBeFirstR)
4834 return false;
4836 if (IsOR) {
4837 // For an OR expression we need to be able to naturally negate at least
4838 // one side or we cannot do the transformation at all.
4839 if (!CanNegateL && !CanNegateR)
4840 return false;
4841 // If the result of the OR will be negated and we can naturally negate
4842 // the leaves, then this sub-tree as a whole negates naturally.
4843 CanNegate = WillNegate && CanNegateL && CanNegateR;
4844 // If we cannot naturally negate the whole sub-tree, then this must be
4845 // emitted first.
4846 MustBeFirst = !CanNegate;
4847 } else {
4848 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4849 // We cannot naturally negate an AND operation.
4850 CanNegate = false;
4851 MustBeFirst = MustBeFirstL || MustBeFirstR;
4853 return true;
4855 return false;
4858 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4859 Register LHS, Register RHS, CmpInst::Predicate CC,
4860 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4861 MachineIRBuilder &MIB) const {
4862 auto &MRI = *MIB.getMRI();
4863 LLT OpTy = MRI.getType(LHS);
4864 unsigned CCmpOpc;
4865 std::optional<ValueAndVReg> C;
4866 if (CmpInst::isIntPredicate(CC)) {
4867 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4868 C = getIConstantVRegValWithLookThrough(RHS, MRI);
4869 if (!C || C->Value.sgt(31) || C->Value.slt(-31))
4870 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4871 else if (C->Value.ule(31))
4872 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4873 else
4874 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMNWi : AArch64::CCMNXi;
4875 } else {
4876 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4877 OpTy.getSizeInBits() == 64);
4878 switch (OpTy.getSizeInBits()) {
4879 case 16:
4880 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4881 CCmpOpc = AArch64::FCCMPHrr;
4882 break;
4883 case 32:
4884 CCmpOpc = AArch64::FCCMPSrr;
4885 break;
4886 case 64:
4887 CCmpOpc = AArch64::FCCMPDrr;
4888 break;
4889 default:
4890 return nullptr;
4893 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4894 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4895 auto CCmp =
4896 MIB.buildInstr(CCmpOpc, {}, {LHS});
4897 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4898 CCmp.addImm(C->Value.getZExtValue());
4899 else if (CCmpOpc == AArch64::CCMNWi || CCmpOpc == AArch64::CCMNXi)
4900 CCmp.addImm(C->Value.abs().getZExtValue());
4901 else
4902 CCmp.addReg(RHS);
4903 CCmp.addImm(NZCV).addImm(Predicate);
4904 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4905 return &*CCmp;
4908 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4909 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4910 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4911 // We're at a tree leaf, produce a conditional comparison operation.
4912 auto &MRI = *MIB.getMRI();
4913 MachineInstr *ValDef = MRI.getVRegDef(Val);
4914 unsigned Opcode = ValDef->getOpcode();
4915 if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4916 Register LHS = Cmp->getLHSReg();
4917 Register RHS = Cmp->getRHSReg();
4918 CmpInst::Predicate CC = Cmp->getCond();
4919 if (Negate)
4920 CC = CmpInst::getInversePredicate(CC);
4921 if (isa<GICmp>(Cmp)) {
4922 OutCC = changeICMPPredToAArch64CC(CC);
4923 } else {
4924 // Handle special FP cases.
4925 AArch64CC::CondCode ExtraCC;
4926 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4927 // Some floating point conditions can't be tested with a single condition
4928 // code. Construct an additional comparison in this case.
4929 if (ExtraCC != AArch64CC::AL) {
4930 MachineInstr *ExtraCmp;
4931 if (!CCOp)
4932 ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4933 else
4934 ExtraCmp =
4935 emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4936 CCOp = ExtraCmp->getOperand(0).getReg();
4937 Predicate = ExtraCC;
4941 // Produce a normal comparison if we are first in the chain
4942 if (!CCOp) {
4943 auto Dst = MRI.cloneVirtualRegister(LHS);
4944 if (isa<GICmp>(Cmp))
4945 return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4946 return emitFPCompare(Cmp->getOperand(2).getReg(),
4947 Cmp->getOperand(3).getReg(), MIB);
4949 // Otherwise produce a ccmp.
4950 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4952 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4954 bool IsOR = Opcode == TargetOpcode::G_OR;
4956 Register LHS = ValDef->getOperand(1).getReg();
4957 bool CanNegateL;
4958 bool MustBeFirstL;
4959 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4960 assert(ValidL && "Valid conjunction/disjunction tree");
4961 (void)ValidL;
4963 Register RHS = ValDef->getOperand(2).getReg();
4964 bool CanNegateR;
4965 bool MustBeFirstR;
4966 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4967 assert(ValidR && "Valid conjunction/disjunction tree");
4968 (void)ValidR;
4970 // Swap sub-tree that must come first to the right side.
4971 if (MustBeFirstL) {
4972 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4973 std::swap(LHS, RHS);
4974 std::swap(CanNegateL, CanNegateR);
4975 std::swap(MustBeFirstL, MustBeFirstR);
4978 bool NegateR;
4979 bool NegateAfterR;
4980 bool NegateL;
4981 bool NegateAfterAll;
4982 if (Opcode == TargetOpcode::G_OR) {
4983 // Swap the sub-tree that we can negate naturally to the left.
4984 if (!CanNegateL) {
4985 assert(CanNegateR && "at least one side must be negatable");
4986 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4987 assert(!Negate);
4988 std::swap(LHS, RHS);
4989 NegateR = false;
4990 NegateAfterR = true;
4991 } else {
4992 // Negate the right sub-tree if possible, otherwise negate the result.
4993 NegateR = CanNegateR;
4994 NegateAfterR = !CanNegateR;
4996 NegateL = true;
4997 NegateAfterAll = !Negate;
4998 } else {
4999 assert(Opcode == TargetOpcode::G_AND &&
5000 "Valid conjunction/disjunction tree");
5001 assert(!Negate && "Valid conjunction/disjunction tree");
5003 NegateL = false;
5004 NegateR = false;
5005 NegateAfterR = false;
5006 NegateAfterAll = false;
5009 // Emit sub-trees.
5010 AArch64CC::CondCode RHSCC;
5011 MachineInstr *CmpR =
5012 emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
5013 if (NegateAfterR)
5014 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
5015 MachineInstr *CmpL = emitConjunctionRec(
5016 LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
5017 if (NegateAfterAll)
5018 OutCC = AArch64CC::getInvertedCondCode(OutCC);
5019 return CmpL;
5022 MachineInstr *AArch64InstructionSelector::emitConjunction(
5023 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
5024 bool DummyCanNegate;
5025 bool DummyMustBeFirst;
5026 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
5027 *MIB.getMRI()))
5028 return nullptr;
5029 return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
5032 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5033 MachineInstr &CondMI) {
5034 AArch64CC::CondCode AArch64CC;
5035 MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
5036 if (!ConjMI)
5037 return false;
5039 emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
5040 SelI.eraseFromParent();
5041 return true;
5044 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5045 MachineRegisterInfo &MRI = *MIB.getMRI();
5046 // We want to recognize this pattern:
5048 // $z = G_FCMP pred, $x, $y
5049 // ...
5050 // $w = G_SELECT $z, $a, $b
5052 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5053 // some copies/truncs in between.)
5055 // If we see this, then we can emit something like this:
5057 // fcmp $x, $y
5058 // fcsel $w, $a, $b, pred
5060 // Rather than emitting both of the rather long sequences in the standard
5061 // G_FCMP/G_SELECT select methods.
5063 // First, check if the condition is defined by a compare.
5064 MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5066 // We can only fold if all of the defs have one use.
5067 Register CondDefReg = CondDef->getOperand(0).getReg();
5068 if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5069 // Unless it's another select.
5070 for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5071 if (CondDef == &UI)
5072 continue;
5073 if (UI.getOpcode() != TargetOpcode::G_SELECT)
5074 return false;
5078 // Is the condition defined by a compare?
5079 unsigned CondOpc = CondDef->getOpcode();
5080 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5081 if (tryOptSelectConjunction(I, *CondDef))
5082 return true;
5083 return false;
5086 AArch64CC::CondCode CondCode;
5087 if (CondOpc == TargetOpcode::G_ICMP) {
5088 auto Pred =
5089 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5090 CondCode = changeICMPPredToAArch64CC(Pred);
5091 emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5092 CondDef->getOperand(1), MIB);
5093 } else {
5094 // Get the condition code for the select.
5095 auto Pred =
5096 static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5097 AArch64CC::CondCode CondCode2;
5098 changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5100 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5101 // instructions to emit the comparison.
5102 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5103 // unnecessary.
5104 if (CondCode2 != AArch64CC::AL)
5105 return false;
5107 if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5108 CondDef->getOperand(3).getReg(), MIB)) {
5109 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5110 return false;
5114 // Emit the select.
5115 emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5116 I.getOperand(3).getReg(), CondCode, MIB);
5117 I.eraseFromParent();
5118 return true;
5121 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5122 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5123 MachineIRBuilder &MIRBuilder) const {
5124 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5125 "Unexpected MachineOperand");
5126 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5127 // We want to find this sort of thing:
5128 // x = G_SUB 0, y
5129 // G_ICMP z, x
5131 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5132 // e.g:
5134 // cmn z, y
5136 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5137 MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5138 MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5139 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5140 // Given this:
5142 // x = G_SUB 0, y
5143 // G_ICMP x, z
5145 // Produce this:
5147 // cmn y, z
5148 if (isCMN(LHSDef, P, MRI))
5149 return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5151 // Same idea here, but with the RHS of the compare instead:
5153 // Given this:
5155 // x = G_SUB 0, y
5156 // G_ICMP z, x
5158 // Produce this:
5160 // cmn z, y
5161 if (isCMN(RHSDef, P, MRI))
5162 return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5164 // Given this:
5166 // z = G_AND x, y
5167 // G_ICMP z, 0
5169 // Produce this if the compare is not unsigned:
5171 // tst x, y
5172 if (!CmpInst::isUnsigned(P) && LHSDef &&
5173 LHSDef->getOpcode() == TargetOpcode::G_AND) {
5174 // Make sure that the RHS is 0.
5175 auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5176 if (!ValAndVReg || ValAndVReg->Value != 0)
5177 return nullptr;
5179 return emitTST(LHSDef->getOperand(1),
5180 LHSDef->getOperand(2), MIRBuilder);
5183 return nullptr;
5186 bool AArch64InstructionSelector::selectShuffleVector(
5187 MachineInstr &I, MachineRegisterInfo &MRI) {
5188 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5189 Register Src1Reg = I.getOperand(1).getReg();
5190 const LLT Src1Ty = MRI.getType(Src1Reg);
5191 Register Src2Reg = I.getOperand(2).getReg();
5192 const LLT Src2Ty = MRI.getType(Src2Reg);
5193 ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5195 MachineBasicBlock &MBB = *I.getParent();
5196 MachineFunction &MF = *MBB.getParent();
5197 LLVMContext &Ctx = MF.getFunction().getContext();
5199 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5200 // it originated from a <1 x T> type. Those should have been lowered into
5201 // G_BUILD_VECTOR earlier.
5202 if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5203 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5204 return false;
5207 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5209 SmallVector<Constant *, 64> CstIdxs;
5210 for (int Val : Mask) {
5211 // For now, we'll just assume any undef indexes are 0. This should be
5212 // optimized in the future, e.g. to select DUP etc.
5213 Val = Val < 0 ? 0 : Val;
5214 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5215 unsigned Offset = Byte + Val * BytesPerElt;
5216 CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5220 // Use a constant pool to load the index vector for TBL.
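// E.g. for a <4 x s32> shuffle with mask <0, 4, 1, 5>, BytesPerElt is 4 and
// the index vector is <0..3, 16..19, 4..7, 20..23>, selecting bytes out of
// the concatenated source registers.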
5221 Constant *CPVal = ConstantVector::get(CstIdxs);
5222 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5223 if (!IndexLoad) {
5224 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5225 return false;
5228 if (DstTy.getSizeInBits() != 128) {
5229 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5230 // This case can be done with TBL1.
5231 MachineInstr *Concat =
5232 emitVectorConcat(std::nullopt, Src1Reg, Src2Reg, MIB);
5233 if (!Concat) {
5234 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5235 return false;
5238 // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5239 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5240 IndexLoad->getOperand(0).getReg(), MIB);
5242 auto TBL1 = MIB.buildInstr(
5243 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5244 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5245 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5247 auto Copy =
5248 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5249 .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5250 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5251 I.eraseFromParent();
5252 return true;
5255 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5256 // Q registers for regalloc.
5257 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5258 auto RegSeq = createQTuple(Regs, MIB);
5259 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5260 {RegSeq, IndexLoad->getOperand(0)});
5261 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5262 I.eraseFromParent();
5263 return true;
5266 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5267 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5268 unsigned LaneIdx, const RegisterBank &RB,
5269 MachineIRBuilder &MIRBuilder) const {
5270 MachineInstr *InsElt = nullptr;
5271 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5272 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5274 // Create a register to define with the insert if one wasn't passed in.
5275 if (!DstReg)
5276 DstReg = MRI.createVirtualRegister(DstRC);
5278 unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5279 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5281 if (RB.getID() == AArch64::FPRRegBankID) {
5282 auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5283 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5284 .addImm(LaneIdx)
5285 .addUse(InsSub->getOperand(0).getReg())
5286 .addImm(0);
5287 } else {
5288 InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5289 .addImm(LaneIdx)
5290 .addUse(EltReg);
5293 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5294 return InsElt;
5297 bool AArch64InstructionSelector::selectUSMovFromExtend(
5298 MachineInstr &MI, MachineRegisterInfo &MRI) {
5299 if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5300 MI.getOpcode() != TargetOpcode::G_ZEXT &&
5301 MI.getOpcode() != TargetOpcode::G_ANYEXT)
5302 return false;
5303 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5304 const Register DefReg = MI.getOperand(0).getReg();
5305 const LLT DstTy = MRI.getType(DefReg);
5306 unsigned DstSize = DstTy.getSizeInBits();
5308 if (DstSize != 32 && DstSize != 64)
5309 return false;
5311 MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5312 MI.getOperand(1).getReg(), MRI);
5313 int64_t Lane;
5314 if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5315 return false;
5316 Register Src0 = Extract->getOperand(1).getReg();
5318 const LLT VecTy = MRI.getType(Src0);
5319 if (VecTy.isScalableVector())
5320 return false;
5322 if (VecTy.getSizeInBits() != 128) {
5323 const MachineInstr *ScalarToVector = emitScalarToVector(
5324 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5325 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5326 Src0 = ScalarToVector->getOperand(0).getReg();
5329 unsigned Opcode;
5330 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5331 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5332 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5333 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5334 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5335 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5336 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5337 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5338 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5339 Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5340 else
5341 llvm_unreachable("Unexpected type combo for S/UMov!");
5343 // We may need to generate one of these, depending on the type and sign of the
5344 // input:
5345 // DstReg = SMOV Src0, Lane;
5346 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
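// (A UMOV to a W register implicitly zeroes bits 63:32, so the 64-bit
// zero-extend case only needs a SUBREG_TO_REG rather than a real extend.)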
5347 MachineInstr *ExtI = nullptr;
5348 if (DstSize == 64 && !IsSigned) {
5349 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5350 MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5351 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5352 .addImm(0)
5353 .addUse(NewReg)
5354 .addImm(AArch64::sub_32);
5355 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5356 } else
5357 ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5359 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5360 MI.eraseFromParent();
5361 return true;
5364 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5365 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5366 unsigned int Op;
5367 if (DstSize == 128) {
5368 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5369 return nullptr;
5370 Op = AArch64::MOVIv16b_ns;
5371 } else {
5372 Op = AArch64::MOVIv8b_ns;
5375 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5377 if (AArch64_AM::isAdvSIMDModImmType9(Val)) {
5378 Val = AArch64_AM::encodeAdvSIMDModImmType9(Val);
5379 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5380 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5381 return &*Mov;
5383 return nullptr;
5386 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5387 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5388 bool Inv) {
5390 unsigned int Op;
5391 if (DstSize == 128) {
5392 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5393 return nullptr;
5394 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5395 } else {
5396 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5399 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5400 uint64_t Shift;
5402 if (AArch64_AM::isAdvSIMDModImmType5(Val)) {
5403 Val = AArch64_AM::encodeAdvSIMDModImmType5(Val);
5404 Shift = 0;
5405 } else if (AArch64_AM::isAdvSIMDModImmType6(Val)) {
5406 Val = AArch64_AM::encodeAdvSIMDModImmType6(Val);
5407 Shift = 8;
5408 } else
5409 return nullptr;
5411 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5412 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5413 return &*Mov;
5416 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5417 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5418 bool Inv) {
5420 unsigned int Op;
5421 if (DstSize == 128) {
5422 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5423 return nullptr;
5424 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5425 } else {
5426 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5429 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5430 uint64_t Shift;
5432 if ((AArch64_AM::isAdvSIMDModImmType1(Val))) {
5433 Val = AArch64_AM::encodeAdvSIMDModImmType1(Val);
5434 Shift = 0;
5435 } else if ((AArch64_AM::isAdvSIMDModImmType2(Val))) {
5436 Val = AArch64_AM::encodeAdvSIMDModImmType2(Val);
5437 Shift = 8;
5438 } else if ((AArch64_AM::isAdvSIMDModImmType3(Val))) {
5439 Val = AArch64_AM::encodeAdvSIMDModImmType3(Val);
5440 Shift = 16;
5441 } else if ((AArch64_AM::isAdvSIMDModImmType4(Val))) {
5442 Val = AArch64_AM::encodeAdvSIMDModImmType4(Val);
5443 Shift = 24;
5444 } else
5445 return nullptr;
5447 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5448 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5449 return &*Mov;
5452 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5453 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5455 unsigned int Op;
5456 if (DstSize == 128) {
5457 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5458 return nullptr;
5459 Op = AArch64::MOVIv2d_ns;
5460 } else {
5461 Op = AArch64::MOVID;
5464 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5465 if (AArch64_AM::isAdvSIMDModImmType10(Val)) {
5466 Val = AArch64_AM::encodeAdvSIMDModImmType10(Val);
5467 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5468 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5469 return &*Mov;
5471 return nullptr;
5474 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5475 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5476 bool Inv) {
5478 unsigned int Op;
5479 if (DstSize == 128) {
5480 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5481 return nullptr;
5482 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5483 } else {
5484 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5487 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5488 uint64_t Shift;
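// The Shift values used below are shifter-operand encodings for the
// shifting-ones (MSL) form; 264 corresponds to MSL #8 and 272 to MSL #16.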
5490 if (AArch64_AM::isAdvSIMDModImmType7(Val)) {
5491 Val = AArch64_AM::encodeAdvSIMDModImmType7(Val);
5492 Shift = 264;
5493 } else if (AArch64_AM::isAdvSIMDModImmType8(Val)) {
5494 Val = AArch64_AM::encodeAdvSIMDModImmType8(Val);
5495 Shift = 272;
5496 } else
5497 return nullptr;
5499 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val).addImm(Shift);
5500 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5501 return &*Mov;
5504 MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5505 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5507 unsigned int Op;
5508 bool IsWide = false;
5509 if (DstSize == 128) {
5510 if (Bits.getHiBits(64) != Bits.getLoBits(64))
5511 return nullptr;
5512 Op = AArch64::FMOVv4f32_ns;
5513 IsWide = true;
5514 } else {
5515 Op = AArch64::FMOVv2f32_ns;
5518 uint64_t Val = Bits.zextOrTrunc(64).getZExtValue();
5520 if (AArch64_AM::isAdvSIMDModImmType11(Val)) {
5521 Val = AArch64_AM::encodeAdvSIMDModImmType11(Val);
5522 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Val)) {
5523 Val = AArch64_AM::encodeAdvSIMDModImmType12(Val);
5524 Op = AArch64::FMOVv2f64_ns;
5525 } else
5526 return nullptr;
5528 auto Mov = Builder.buildInstr(Op, {Dst}, {}).addImm(Val);
5529 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5530 return &*Mov;
5533 bool AArch64InstructionSelector::selectIndexedExtLoad(
5534 MachineInstr &MI, MachineRegisterInfo &MRI) {
5535 auto &ExtLd = cast<GIndexedAnyExtLoad>(MI);
5536 Register Dst = ExtLd.getDstReg();
5537 Register WriteBack = ExtLd.getWritebackReg();
5538 Register Base = ExtLd.getBaseReg();
5539 Register Offset = ExtLd.getOffsetReg();
5540 LLT Ty = MRI.getType(Dst);
5541 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5542 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5543 bool IsPre = ExtLd.isPre();
5544 bool IsSExt = isa<GIndexedSExtLoad>(ExtLd);
5545 bool InsertIntoXReg = false;
5546 bool IsDst64 = Ty.getSizeInBits() == 64;
5548 unsigned Opc = 0;
5549 LLT NewLdDstTy;
5550 LLT s32 = LLT::scalar(32);
5551 LLT s64 = LLT::scalar(64);
5553 if (MemSizeBits == 8) {
5554 if (IsSExt) {
5555 if (IsDst64)
5556 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5557 else
5558 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5559 NewLdDstTy = IsDst64 ? s64 : s32;
5560 } else {
5561 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5562 InsertIntoXReg = IsDst64;
5563 NewLdDstTy = s32;
5565 } else if (MemSizeBits == 16) {
5566 if (IsSExt) {
5567 if (IsDst64)
5568 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5569 else
5570 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5571 NewLdDstTy = IsDst64 ? s64 : s32;
5572 } else {
5573 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5574 InsertIntoXReg = IsDst64;
5575 NewLdDstTy = s32;
5577 } else if (MemSizeBits == 32) {
5578 if (IsSExt) {
5579 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5580 NewLdDstTy = s64;
5581 } else {
5582 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5583 InsertIntoXReg = IsDst64;
5584 NewLdDstTy = s32;
5586 } else {
5587 llvm_unreachable("Unexpected size for indexed load");
5590 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5591 return false; // We should be on gpr.
5593 auto Cst = getIConstantVRegVal(Offset, MRI);
5594 if (!Cst)
5595 return false; // Shouldn't happen, but just in case.
5597 auto LdMI = MIB.buildInstr(Opc, {WriteBack, NewLdDstTy}, {Base})
5598 .addImm(Cst->getSExtValue());
5599 LdMI.cloneMemRefs(ExtLd);
5600 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5601 // Make sure to select the load with the MemTy as the dest type, and then
5602 // insert into X reg if needed.
5603 if (InsertIntoXReg) {
5604 // Generate a SUBREG_TO_REG.
5605 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5606 .addImm(0)
5607 .addUse(LdMI.getReg(1))
5608 .addImm(AArch64::sub_32);
5609 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5610 MRI);
5611 } else {
5612 auto Copy = MIB.buildCopy(Dst, LdMI.getReg(1));
5613 selectCopy(*Copy, TII, MRI, TRI, RBI);
5615 MI.eraseFromParent();
5617 return true;
5620 bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5621 MachineRegisterInfo &MRI) {
5622 auto &Ld = cast<GIndexedLoad>(MI);
5623 Register Dst = Ld.getDstReg();
5624 Register WriteBack = Ld.getWritebackReg();
5625 Register Base = Ld.getBaseReg();
5626 Register Offset = Ld.getOffsetReg();
5627 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5628 "Unexpected type for indexed load");
5629 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5631 if (MemSize < MRI.getType(Dst).getSizeInBytes())
5632 return selectIndexedExtLoad(MI, MRI);
5634 unsigned Opc = 0;
5635 if (Ld.isPre()) {
5636 static constexpr unsigned GPROpcodes[] = {
5637 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5638 AArch64::LDRXpre};
5639 static constexpr unsigned FPROpcodes[] = {
5640 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5641 AArch64::LDRQpre};
5642 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5643 Opc = FPROpcodes[Log2_32(MemSize)];
5644 else
5645 Opc = GPROpcodes[Log2_32(MemSize)];
5646 } else {
5647 static constexpr unsigned GPROpcodes[] = {
5648 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5649 AArch64::LDRXpost};
5650 static constexpr unsigned FPROpcodes[] = {
5651 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5652 AArch64::LDRDpost, AArch64::LDRQpost};
5653 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5654 Opc = FPROpcodes[Log2_32(MemSize)];
5655 else
5656 Opc = GPROpcodes[Log2_32(MemSize)];
5658 auto Cst = getIConstantVRegVal(Offset, MRI);
5659 if (!Cst)
5660 return false; // Shouldn't happen, but just in case.
5661 auto LdMI =
5662 MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
5663 LdMI.cloneMemRefs(Ld);
5664 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5665 MI.eraseFromParent();
5666 return true;
5669 bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5670 MachineRegisterInfo &MRI) {
5671 Register Dst = I.getWritebackReg();
5672 Register Val = I.getValueReg();
5673 Register Base = I.getBaseReg();
5674 Register Offset = I.getOffsetReg();
5675 LLT ValTy = MRI.getType(Val);
5676 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5678 unsigned Opc = 0;
5679 if (I.isPre()) {
5680 static constexpr unsigned GPROpcodes[] = {
5681 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5682 AArch64::STRXpre};
5683 static constexpr unsigned FPROpcodes[] = {
5684 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5685 AArch64::STRQpre};
5687 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5688 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5689 else
5690 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5691 } else {
5692 static constexpr unsigned GPROpcodes[] = {
5693 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5694 AArch64::STRXpost};
5695 static constexpr unsigned FPROpcodes[] = {
5696 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5697 AArch64::STRDpost, AArch64::STRQpost};
5699 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5700 Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5701 else
5702 Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
5705 auto Cst = getIConstantVRegVal(Offset, MRI);
5706 if (!Cst)
5707 return false; // Shouldn't happen, but just in case.
5708 auto Str =
5709 MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
5710 Str.cloneMemRefs(I);
5711 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5712 I.eraseFromParent();
5713 return true;
5716 MachineInstr *
5717 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5718 MachineIRBuilder &MIRBuilder,
5719 MachineRegisterInfo &MRI) {
5720 LLT DstTy = MRI.getType(Dst);
5721 unsigned DstSize = DstTy.getSizeInBits();
5722 if (CV->isNullValue()) {
5723 if (DstSize == 128) {
5724 auto Mov =
5725 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5726 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5727 return &*Mov;
5730 if (DstSize == 64) {
5731 auto Mov =
5732 MIRBuilder
5733 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5734 .addImm(0);
5735 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5736 .addReg(Mov.getReg(0), 0, AArch64::dsub);
5737 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5738 return &*Copy;
5742 if (CV->getSplatValue()) {
5743 APInt DefBits = APInt::getSplat(
5744 DstSize, CV->getUniqueInteger().trunc(DstTy.getScalarSizeInBits()));
5745 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5746 MachineInstr *NewOp;
5747 bool Inv = false;
5748 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, DefBits, MIRBuilder)) ||
5749 (NewOp =
5750 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5751 (NewOp =
5752 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5753 (NewOp =
5754 tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5755 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, DefBits, MIRBuilder)) ||
5756 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, DefBits, MIRBuilder)))
5757 return NewOp;
5759 DefBits = ~DefBits;
5760 Inv = true;
5761 if ((NewOp =
5762 tryAdvSIMDModImm32(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5763 (NewOp =
5764 tryAdvSIMDModImm321s(Dst, DstSize, DefBits, MIRBuilder, Inv)) ||
5765 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, DefBits, MIRBuilder, Inv)))
5766 return NewOp;
5767 return nullptr;
5770 if (auto *NewOp = TryMOVIWithBits(DefBits))
5771 return NewOp;
5773 // See if a fneg of the constant can be materialized with a MOVI, etc
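// For instance, if only the per-element sign bits keep DefBits from matching
// a MOVI/MVNI pattern (e.g. elements of 0x80000001), flipping them yields an
// encodable constant (0x00000001) that we build with MOVI and then FNEG.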
5774 auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5775 unsigned NegOpc) -> MachineInstr * {
5776 // FNegate each sub-element of the constant
5777 APInt Neg = APInt::getHighBitsSet(NumBits, 1).zext(DstSize);
5778 APInt NegBits(DstSize, 0);
5779 unsigned NumElts = DstSize / NumBits;
5780 for (unsigned i = 0; i < NumElts; i++)
5781 NegBits |= Neg << (NumBits * i);
5782 NegBits = DefBits ^ NegBits;
5784 // Try to create the new constants with MOVI, and if so generate a fneg
5785 // for it.
5786 if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5787 Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
5788 NewOp->getOperand(0).setReg(NewDst);
5789 return MIRBuilder.buildInstr(NegOpc, {Dst}, {NewDst});
5791 return nullptr;
5793 MachineInstr *R;
5794 if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) ||
5795 (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) ||
5796 (STI.hasFullFP16() &&
5797 (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16))))
5798 return R;
5801 auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5802 if (!CPLoad) {
5803 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5804 return nullptr;
5807 auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5808 RBI.constrainGenericRegister(
5809 Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5810 return &*Copy;
5813 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5814 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5815 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5816 unsigned DstSize = DstTy.getSizeInBits();
5817 assert(DstSize <= 128 && "Unexpected build_vec type!");
5818 if (DstSize < 32)
5819 return false;
5820 // Check if we're building a constant vector, in which case we want to
5821 // generate a constant pool load instead of a vector insert sequence.
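// A single constant-pool load (or, better still, a MOVI emitted by
// emitConstantVector) is generally cheaper than a chain of lane inserts.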
5822 SmallVector<Constant *, 16> Csts;
5823 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5824 // Try to find G_CONSTANT or G_FCONSTANT
5825 auto *OpMI =
5826 getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5827 if (OpMI)
5828 Csts.emplace_back(
5829 const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5830 else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5831 I.getOperand(Idx).getReg(), MRI)))
5832 Csts.emplace_back(
5833 const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5834 else
5835 return false;
5837 Constant *CV = ConstantVector::get(Csts);
5838 if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5839 return false;
5840 I.eraseFromParent();
5841 return true;
5844 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5845 MachineInstr &I, MachineRegisterInfo &MRI) {
5846 // Given:
5847 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5849 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5850 Register Dst = I.getOperand(0).getReg();
5851 Register EltReg = I.getOperand(1).getReg();
5852 LLT EltTy = MRI.getType(EltReg);
5853 // If the index isn't on the same bank as its elements, then this can't be a
5854 // SUBREG_TO_REG.
5855 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5856 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5857 if (EltRB != DstRB)
5858 return false;
5859 if (any_of(drop_begin(I.operands(), 2), [&MRI](const MachineOperand &Op) {
5860 return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(), MRI);
5862 return false;
5863 unsigned SubReg;
5864 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5865 if (!EltRC)
5866 return false;
5867 const TargetRegisterClass *DstRC =
5868 getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5869 if (!DstRC)
5870 return false;
5871 if (!getSubRegForClass(EltRC, TRI, SubReg))
5872 return false;
5873 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5874 .addImm(0)
5875 .addUse(EltReg)
5876 .addImm(SubReg);
5877 I.eraseFromParent();
5878 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5879 return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5882 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5883 MachineRegisterInfo &MRI) {
5884 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5885 // Until we port more of the optimized selections, for now just use a vector
5886 // insert sequence.
5887 const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5888 const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5889 unsigned EltSize = EltTy.getSizeInBits();
5891 if (tryOptConstantBuildVec(I, DstTy, MRI))
5892 return true;
5893 if (tryOptBuildVecToSubregToReg(I, MRI))
5894 return true;
5896 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5897 return false; // Don't support all element types yet.
5898 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5900 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5901 MachineInstr *ScalarToVec =
5902 emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5903 I.getOperand(1).getReg(), MIB);
5904 if (!ScalarToVec)
5905 return false;
5907 Register DstVec = ScalarToVec->getOperand(0).getReg();
5908 unsigned DstSize = DstTy.getSizeInBits();
5910 // Keep track of the last MI we inserted. Later on, we might be able to save
5911 // a copy using it.
5912 MachineInstr *PrevMI = ScalarToVec;
5913 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5914 // Note that if we don't do a subregister copy, we can end up making an
5915 // extra register.
5916 Register OpReg = I.getOperand(i).getReg();
5917 // Do not emit inserts for undefs
5918 if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
5919 PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
5920 DstVec = PrevMI->getOperand(0).getReg();
5924 // If DstTy's size in bits is less than 128, then emit a subregister copy
5925 // from DstVec to the last register we've defined.
5926 if (DstSize < 128) {
5927 // Force this to be FPR using the destination vector.
5928 const TargetRegisterClass *RC =
5929 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5930 if (!RC)
5931 return false;
5932 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5933 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5934 return false;
5937 unsigned SubReg = 0;
5938 if (!getSubRegForClass(RC, TRI, SubReg))
5939 return false;
5940 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5941 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5942 << ")\n");
5943 return false;
5946 Register Reg = MRI.createVirtualRegister(RC);
5947 Register DstReg = I.getOperand(0).getReg();
5949 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5950 MachineOperand &RegOp = I.getOperand(1);
5951 RegOp.setReg(Reg);
5952 RBI.constrainGenericRegister(DstReg, *RC, MRI);
5953 } else {
5954 // We either have a vector with all elements (except the first one) undef or
5955 // at least one non-undef non-first element. In the first case, we need to
5956 // constrain the output register ourselves as we may have generated an
5957 // INSERT_SUBREG operation which is a generic operation for which the
5958 // output regclass cannot be automatically chosen.
5960 // In the second case, there is no need to do this as it may generate an
5961 // instruction like INSvi32gpr where the regclass can be automatically
5962 // chosen.
5964 // Also, we save a copy by re-using the destination register on the final
5965 // insert.
5966 PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5967 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5969 Register DstReg = PrevMI->getOperand(0).getReg();
5970 if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5971 const TargetRegisterClass *RC =
5972 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5973 RBI.constrainGenericRegister(DstReg, *RC, MRI);
5977 I.eraseFromParent();
5978 return true;
5981 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5982 unsigned NumVecs,
5983 MachineInstr &I) {
5984 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5985 assert(Opc && "Expected an opcode?");
5986 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5987 auto &MRI = *MIB.getMRI();
5988 LLT Ty = MRI.getType(I.getOperand(0).getReg());
5989 unsigned Size = Ty.getSizeInBits();
5990 assert((Size == 64 || Size == 128) &&
5991 "Destination must be 64 bits or 128 bits?");
5992 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5993 auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5994 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5995 auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5996 Load.cloneMemRefs(I);
5997 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5998 Register SelectedLoadDst = Load->getOperand(0).getReg();
5999 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6000 auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
6001 .addReg(SelectedLoadDst, 0, SubReg + Idx);
6002 // Emit the subreg copies and immediately select them.
6003 // FIXME: We should refactor our copy code into an emitCopy helper and
6004 // clean up uses of this pattern elsewhere in the selector.
6005 selectCopy(*Vec, TII, MRI, TRI, RBI);
6007 return true;
6010 bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
6011 unsigned Opc, unsigned NumVecs, MachineInstr &I) {
6012 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
6013 assert(Opc && "Expected an opcode?");
6014 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
6015 auto &MRI = *MIB.getMRI();
6016 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6017 bool Narrow = Ty.getSizeInBits() == 64;
6019 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
6020 SmallVector<Register, 4> Regs(NumVecs);
6021 std::transform(FirstSrcRegIt, FirstSrcRegIt + NumVecs, Regs.begin(),
6022 [](auto MO) { return MO.getReg(); });
6024 if (Narrow) {
6025 transform(Regs, Regs.begin(), [this](Register Reg) {
6026 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6027 ->getOperand(0)
6028 .getReg();
6030 Ty = Ty.multiplyElements(2);
6033 Register Tuple = createQTuple(Regs, MIB);
6034 auto LaneNo = getIConstantVRegVal((FirstSrcRegIt + NumVecs)->getReg(), MRI);
6035 if (!LaneNo)
6036 return false;
6038 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6039 auto Load = MIB.buildInstr(Opc, {Ty}, {})
6040 .addReg(Tuple)
6041 .addImm(LaneNo->getZExtValue())
6042 .addReg(Ptr);
6043 Load.cloneMemRefs(I);
6044 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6045 Register SelectedLoadDst = Load->getOperand(0).getReg();
6046 unsigned SubReg = AArch64::qsub0;
6047 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6048 auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6049 {Narrow ? DstOp(&AArch64::FPR128RegClass)
6050 : DstOp(I.getOperand(Idx).getReg())},
6052 .addReg(SelectedLoadDst, 0, SubReg + Idx);
6053 Register WideReg = Vec.getReg(0);
6054 // Emit the subreg copies and immediately select them.
6055 selectCopy(*Vec, TII, MRI, TRI, RBI);
6056 if (Narrow &&
6057 !emitNarrowVector(I.getOperand(Idx).getReg(), WideReg, MIB, MRI))
6058 return false;
6060 return true;
6063 void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6064 unsigned NumVecs,
6065 unsigned Opc) {
6066 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6067 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6068 Register Ptr = I.getOperand(1 + NumVecs).getReg();
6070 SmallVector<Register, 2> Regs(NumVecs);
6071 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6072 Regs.begin(), [](auto MO) { return MO.getReg(); });
6074 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6075 : createDTuple(Regs, MIB);
6076 auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
6077 Store.cloneMemRefs(I);
6078 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6081 bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6082 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6083 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6084 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6085 bool Narrow = Ty.getSizeInBits() == 64;
6087 SmallVector<Register, 2> Regs(NumVecs);
6088 std::transform(I.operands_begin() + 1, I.operands_begin() + 1 + NumVecs,
6089 Regs.begin(), [](auto MO) { return MO.getReg(); });
6091 if (Narrow)
6092 transform(Regs, Regs.begin(), [this](Register Reg) {
6093 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6094 ->getOperand(0)
6095 .getReg();
6098 Register Tuple = createQTuple(Regs, MIB);
6100 auto LaneNo = getIConstantVRegVal(I.getOperand(1 + NumVecs).getReg(), MRI);
6101 if (!LaneNo)
6102 return false;
6103 Register Ptr = I.getOperand(1 + NumVecs + 1).getReg();
6104 auto Store = MIB.buildInstr(Opc, {}, {})
6105 .addReg(Tuple)
6106 .addImm(LaneNo->getZExtValue())
6107 .addReg(Ptr);
6108 Store.cloneMemRefs(I);
6109 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6110 return true;
6113 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6114 MachineInstr &I, MachineRegisterInfo &MRI) {
6115 // Find the intrinsic ID.
6116 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6118 const LLT S8 = LLT::scalar(8);
6119 const LLT S16 = LLT::scalar(16);
6120 const LLT S32 = LLT::scalar(32);
6121 const LLT S64 = LLT::scalar(64);
6122 const LLT P0 = LLT::pointer(0, 64);
6123 // Select the instruction.
6124 switch (IntrinID) {
6125 default:
6126 return false;
6127 case Intrinsic::aarch64_ldxp:
6128 case Intrinsic::aarch64_ldaxp: {
6129 auto NewI = MIB.buildInstr(
6130 IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6131 {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6132 {I.getOperand(3)});
6133 NewI.cloneMemRefs(I);
6134 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6135 break;
6137 case Intrinsic::aarch64_neon_ld1x2: {
6138 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6139 unsigned Opc = 0;
6140 if (Ty == LLT::fixed_vector(8, S8))
6141 Opc = AArch64::LD1Twov8b;
6142 else if (Ty == LLT::fixed_vector(16, S8))
6143 Opc = AArch64::LD1Twov16b;
6144 else if (Ty == LLT::fixed_vector(4, S16))
6145 Opc = AArch64::LD1Twov4h;
6146 else if (Ty == LLT::fixed_vector(8, S16))
6147 Opc = AArch64::LD1Twov8h;
6148 else if (Ty == LLT::fixed_vector(2, S32))
6149 Opc = AArch64::LD1Twov2s;
6150 else if (Ty == LLT::fixed_vector(4, S32))
6151 Opc = AArch64::LD1Twov4s;
6152 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6153 Opc = AArch64::LD1Twov2d;
6154 else if (Ty == S64 || Ty == P0)
6155 Opc = AArch64::LD1Twov1d;
6156 else
6157 llvm_unreachable("Unexpected type for ld1x2!");
6158 selectVectorLoadIntrinsic(Opc, 2, I);
6159 break;
6161 case Intrinsic::aarch64_neon_ld1x3: {
6162 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6163 unsigned Opc = 0;
6164 if (Ty == LLT::fixed_vector(8, S8))
6165 Opc = AArch64::LD1Threev8b;
6166 else if (Ty == LLT::fixed_vector(16, S8))
6167 Opc = AArch64::LD1Threev16b;
6168 else if (Ty == LLT::fixed_vector(4, S16))
6169 Opc = AArch64::LD1Threev4h;
6170 else if (Ty == LLT::fixed_vector(8, S16))
6171 Opc = AArch64::LD1Threev8h;
6172 else if (Ty == LLT::fixed_vector(2, S32))
6173 Opc = AArch64::LD1Threev2s;
6174 else if (Ty == LLT::fixed_vector(4, S32))
6175 Opc = AArch64::LD1Threev4s;
6176 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6177 Opc = AArch64::LD1Threev2d;
6178 else if (Ty == S64 || Ty == P0)
6179 Opc = AArch64::LD1Threev1d;
6180 else
6181 llvm_unreachable("Unexpected type for ld1x3!");
6182 selectVectorLoadIntrinsic(Opc, 3, I);
6183 break;
6185 case Intrinsic::aarch64_neon_ld1x4: {
6186 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6187 unsigned Opc = 0;
6188 if (Ty == LLT::fixed_vector(8, S8))
6189 Opc = AArch64::LD1Fourv8b;
6190 else if (Ty == LLT::fixed_vector(16, S8))
6191 Opc = AArch64::LD1Fourv16b;
6192 else if (Ty == LLT::fixed_vector(4, S16))
6193 Opc = AArch64::LD1Fourv4h;
6194 else if (Ty == LLT::fixed_vector(8, S16))
6195 Opc = AArch64::LD1Fourv8h;
6196 else if (Ty == LLT::fixed_vector(2, S32))
6197 Opc = AArch64::LD1Fourv2s;
6198 else if (Ty == LLT::fixed_vector(4, S32))
6199 Opc = AArch64::LD1Fourv4s;
6200 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6201 Opc = AArch64::LD1Fourv2d;
6202 else if (Ty == S64 || Ty == P0)
6203 Opc = AArch64::LD1Fourv1d;
6204 else
6205 llvm_unreachable("Unexpected type for ld1x4!");
6206 selectVectorLoadIntrinsic(Opc, 4, I);
6207 break;
6209 case Intrinsic::aarch64_neon_ld2: {
6210 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6211 unsigned Opc = 0;
6212 if (Ty == LLT::fixed_vector(8, S8))
6213 Opc = AArch64::LD2Twov8b;
6214 else if (Ty == LLT::fixed_vector(16, S8))
6215 Opc = AArch64::LD2Twov16b;
6216 else if (Ty == LLT::fixed_vector(4, S16))
6217 Opc = AArch64::LD2Twov4h;
6218 else if (Ty == LLT::fixed_vector(8, S16))
6219 Opc = AArch64::LD2Twov8h;
6220 else if (Ty == LLT::fixed_vector(2, S32))
6221 Opc = AArch64::LD2Twov2s;
6222 else if (Ty == LLT::fixed_vector(4, S32))
6223 Opc = AArch64::LD2Twov4s;
6224 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6225 Opc = AArch64::LD2Twov2d;
6226 else if (Ty == S64 || Ty == P0)
6227 Opc = AArch64::LD1Twov1d;
6228 else
6229 llvm_unreachable("Unexpected type for ld2!");
6230 selectVectorLoadIntrinsic(Opc, 2, I);
6231 break;
6233 case Intrinsic::aarch64_neon_ld2lane: {
6234 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6235 unsigned Opc;
6236 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6237 Opc = AArch64::LD2i8;
6238 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6239 Opc = AArch64::LD2i16;
6240 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6241 Opc = AArch64::LD2i32;
6242 else if (Ty == LLT::fixed_vector(2, S64) ||
6243 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6244 Opc = AArch64::LD2i64;
6245 else
6246 llvm_unreachable("Unexpected type for ld2lane!");
6247 if (!selectVectorLoadLaneIntrinsic(Opc, 2, I))
6248 return false;
6249 break;
6251 case Intrinsic::aarch64_neon_ld2r: {
6252 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6253 unsigned Opc = 0;
6254 if (Ty == LLT::fixed_vector(8, S8))
6255 Opc = AArch64::LD2Rv8b;
6256 else if (Ty == LLT::fixed_vector(16, S8))
6257 Opc = AArch64::LD2Rv16b;
6258 else if (Ty == LLT::fixed_vector(4, S16))
6259 Opc = AArch64::LD2Rv4h;
6260 else if (Ty == LLT::fixed_vector(8, S16))
6261 Opc = AArch64::LD2Rv8h;
6262 else if (Ty == LLT::fixed_vector(2, S32))
6263 Opc = AArch64::LD2Rv2s;
6264 else if (Ty == LLT::fixed_vector(4, S32))
6265 Opc = AArch64::LD2Rv4s;
6266 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6267 Opc = AArch64::LD2Rv2d;
6268 else if (Ty == S64 || Ty == P0)
6269 Opc = AArch64::LD2Rv1d;
6270 else
6271 llvm_unreachable("Unexpected type for ld2r!");
6272 selectVectorLoadIntrinsic(Opc, 2, I);
6273 break;
6275 case Intrinsic::aarch64_neon_ld3: {
6276 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6277 unsigned Opc = 0;
6278 if (Ty == LLT::fixed_vector(8, S8))
6279 Opc = AArch64::LD3Threev8b;
6280 else if (Ty == LLT::fixed_vector(16, S8))
6281 Opc = AArch64::LD3Threev16b;
6282 else if (Ty == LLT::fixed_vector(4, S16))
6283 Opc = AArch64::LD3Threev4h;
6284 else if (Ty == LLT::fixed_vector(8, S16))
6285 Opc = AArch64::LD3Threev8h;
6286 else if (Ty == LLT::fixed_vector(2, S32))
6287 Opc = AArch64::LD3Threev2s;
6288 else if (Ty == LLT::fixed_vector(4, S32))
6289 Opc = AArch64::LD3Threev4s;
6290 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6291 Opc = AArch64::LD3Threev2d;
6292 else if (Ty == S64 || Ty == P0)
6293 Opc = AArch64::LD1Threev1d;
6294 else
6295 llvm_unreachable("Unexpected type for ld3!");
6296 selectVectorLoadIntrinsic(Opc, 3, I);
6297 break;
6299 case Intrinsic::aarch64_neon_ld3lane: {
6300 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6301 unsigned Opc;
6302 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6303 Opc = AArch64::LD3i8;
6304 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6305 Opc = AArch64::LD3i16;
6306 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6307 Opc = AArch64::LD3i32;
6308 else if (Ty == LLT::fixed_vector(2, S64) ||
6309 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6310 Opc = AArch64::LD3i64;
6311 else
6312 llvm_unreachable("Unexpected type for ld3lane!");
6313 if (!selectVectorLoadLaneIntrinsic(Opc, 3, I))
6314 return false;
6315 break;
6317 case Intrinsic::aarch64_neon_ld3r: {
6318 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6319 unsigned Opc = 0;
6320 if (Ty == LLT::fixed_vector(8, S8))
6321 Opc = AArch64::LD3Rv8b;
6322 else if (Ty == LLT::fixed_vector(16, S8))
6323 Opc = AArch64::LD3Rv16b;
6324 else if (Ty == LLT::fixed_vector(4, S16))
6325 Opc = AArch64::LD3Rv4h;
6326 else if (Ty == LLT::fixed_vector(8, S16))
6327 Opc = AArch64::LD3Rv8h;
6328 else if (Ty == LLT::fixed_vector(2, S32))
6329 Opc = AArch64::LD3Rv2s;
6330 else if (Ty == LLT::fixed_vector(4, S32))
6331 Opc = AArch64::LD3Rv4s;
6332 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6333 Opc = AArch64::LD3Rv2d;
6334 else if (Ty == S64 || Ty == P0)
6335 Opc = AArch64::LD3Rv1d;
6336 else
6337 llvm_unreachable("Unexpected type for ld3r!");
6338 selectVectorLoadIntrinsic(Opc, 3, I);
6339 break;
6341 case Intrinsic::aarch64_neon_ld4: {
6342 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6343 unsigned Opc = 0;
6344 if (Ty == LLT::fixed_vector(8, S8))
6345 Opc = AArch64::LD4Fourv8b;
6346 else if (Ty == LLT::fixed_vector(16, S8))
6347 Opc = AArch64::LD4Fourv16b;
6348 else if (Ty == LLT::fixed_vector(4, S16))
6349 Opc = AArch64::LD4Fourv4h;
6350 else if (Ty == LLT::fixed_vector(8, S16))
6351 Opc = AArch64::LD4Fourv8h;
6352 else if (Ty == LLT::fixed_vector(2, S32))
6353 Opc = AArch64::LD4Fourv2s;
6354 else if (Ty == LLT::fixed_vector(4, S32))
6355 Opc = AArch64::LD4Fourv4s;
6356 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6357 Opc = AArch64::LD4Fourv2d;
6358 else if (Ty == S64 || Ty == P0)
6359 Opc = AArch64::LD1Fourv1d;
6360 else
6361 llvm_unreachable("Unexpected type for ld4!");
6362 selectVectorLoadIntrinsic(Opc, 4, I);
6363 break;
6365 case Intrinsic::aarch64_neon_ld4lane: {
6366 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6367 unsigned Opc;
6368 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6369 Opc = AArch64::LD4i8;
6370 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6371 Opc = AArch64::LD4i16;
6372 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6373 Opc = AArch64::LD4i32;
6374 else if (Ty == LLT::fixed_vector(2, S64) ||
6375 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6376 Opc = AArch64::LD4i64;
6377 else
6378 llvm_unreachable("Unexpected type for ld4lane!");
6379 if (!selectVectorLoadLaneIntrinsic(Opc, 4, I))
6380 return false;
6381 break;
6383 case Intrinsic::aarch64_neon_ld4r: {
6384 LLT Ty = MRI.getType(I.getOperand(0).getReg());
6385 unsigned Opc = 0;
6386 if (Ty == LLT::fixed_vector(8, S8))
6387 Opc = AArch64::LD4Rv8b;
6388 else if (Ty == LLT::fixed_vector(16, S8))
6389 Opc = AArch64::LD4Rv16b;
6390 else if (Ty == LLT::fixed_vector(4, S16))
6391 Opc = AArch64::LD4Rv4h;
6392 else if (Ty == LLT::fixed_vector(8, S16))
6393 Opc = AArch64::LD4Rv8h;
6394 else if (Ty == LLT::fixed_vector(2, S32))
6395 Opc = AArch64::LD4Rv2s;
6396 else if (Ty == LLT::fixed_vector(4, S32))
6397 Opc = AArch64::LD4Rv4s;
6398 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6399 Opc = AArch64::LD4Rv2d;
6400 else if (Ty == S64 || Ty == P0)
6401 Opc = AArch64::LD4Rv1d;
6402 else
6403 llvm_unreachable("Unexpected type for ld4r!");
6404 selectVectorLoadIntrinsic(Opc, 4, I);
6405 break;
6407 case Intrinsic::aarch64_neon_st1x2: {
6408 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6409 unsigned Opc;
6410 if (Ty == LLT::fixed_vector(8, S8))
6411 Opc = AArch64::ST1Twov8b;
6412 else if (Ty == LLT::fixed_vector(16, S8))
6413 Opc = AArch64::ST1Twov16b;
6414 else if (Ty == LLT::fixed_vector(4, S16))
6415 Opc = AArch64::ST1Twov4h;
6416 else if (Ty == LLT::fixed_vector(8, S16))
6417 Opc = AArch64::ST1Twov8h;
6418 else if (Ty == LLT::fixed_vector(2, S32))
6419 Opc = AArch64::ST1Twov2s;
6420 else if (Ty == LLT::fixed_vector(4, S32))
6421 Opc = AArch64::ST1Twov4s;
6422 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6423 Opc = AArch64::ST1Twov2d;
6424 else if (Ty == S64 || Ty == P0)
6425 Opc = AArch64::ST1Twov1d;
6426 else
6427 llvm_unreachable("Unexpected type for st1x2!");
6428 selectVectorStoreIntrinsic(I, 2, Opc);
6429 break;
6431 case Intrinsic::aarch64_neon_st1x3: {
6432 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6433 unsigned Opc;
6434 if (Ty == LLT::fixed_vector(8, S8))
6435 Opc = AArch64::ST1Threev8b;
6436 else if (Ty == LLT::fixed_vector(16, S8))
6437 Opc = AArch64::ST1Threev16b;
6438 else if (Ty == LLT::fixed_vector(4, S16))
6439 Opc = AArch64::ST1Threev4h;
6440 else if (Ty == LLT::fixed_vector(8, S16))
6441 Opc = AArch64::ST1Threev8h;
6442 else if (Ty == LLT::fixed_vector(2, S32))
6443 Opc = AArch64::ST1Threev2s;
6444 else if (Ty == LLT::fixed_vector(4, S32))
6445 Opc = AArch64::ST1Threev4s;
6446 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6447 Opc = AArch64::ST1Threev2d;
6448 else if (Ty == S64 || Ty == P0)
6449 Opc = AArch64::ST1Threev1d;
6450 else
6451 llvm_unreachable("Unexpected type for st1x3!");
6452 selectVectorStoreIntrinsic(I, 3, Opc);
6453 break;
6455 case Intrinsic::aarch64_neon_st1x4: {
6456 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6457 unsigned Opc;
6458 if (Ty == LLT::fixed_vector(8, S8))
6459 Opc = AArch64::ST1Fourv8b;
6460 else if (Ty == LLT::fixed_vector(16, S8))
6461 Opc = AArch64::ST1Fourv16b;
6462 else if (Ty == LLT::fixed_vector(4, S16))
6463 Opc = AArch64::ST1Fourv4h;
6464 else if (Ty == LLT::fixed_vector(8, S16))
6465 Opc = AArch64::ST1Fourv8h;
6466 else if (Ty == LLT::fixed_vector(2, S32))
6467 Opc = AArch64::ST1Fourv2s;
6468 else if (Ty == LLT::fixed_vector(4, S32))
6469 Opc = AArch64::ST1Fourv4s;
6470 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6471 Opc = AArch64::ST1Fourv2d;
6472 else if (Ty == S64 || Ty == P0)
6473 Opc = AArch64::ST1Fourv1d;
6474 else
6475 llvm_unreachable("Unexpected type for st1x4!");
6476 selectVectorStoreIntrinsic(I, 4, Opc);
6477 break;
6479 case Intrinsic::aarch64_neon_st2: {
6480 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6481 unsigned Opc;
6482 if (Ty == LLT::fixed_vector(8, S8))
6483 Opc = AArch64::ST2Twov8b;
6484 else if (Ty == LLT::fixed_vector(16, S8))
6485 Opc = AArch64::ST2Twov16b;
6486 else if (Ty == LLT::fixed_vector(4, S16))
6487 Opc = AArch64::ST2Twov4h;
6488 else if (Ty == LLT::fixed_vector(8, S16))
6489 Opc = AArch64::ST2Twov8h;
6490 else if (Ty == LLT::fixed_vector(2, S32))
6491 Opc = AArch64::ST2Twov2s;
6492 else if (Ty == LLT::fixed_vector(4, S32))
6493 Opc = AArch64::ST2Twov4s;
6494 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6495 Opc = AArch64::ST2Twov2d;
6496 else if (Ty == S64 || Ty == P0)
6497 Opc = AArch64::ST1Twov1d;
6498 else
6499 llvm_unreachable("Unexpected type for st2!");
6500 selectVectorStoreIntrinsic(I, 2, Opc);
6501 break;
6503 case Intrinsic::aarch64_neon_st3: {
6504 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6505 unsigned Opc;
6506 if (Ty == LLT::fixed_vector(8, S8))
6507 Opc = AArch64::ST3Threev8b;
6508 else if (Ty == LLT::fixed_vector(16, S8))
6509 Opc = AArch64::ST3Threev16b;
6510 else if (Ty == LLT::fixed_vector(4, S16))
6511 Opc = AArch64::ST3Threev4h;
6512 else if (Ty == LLT::fixed_vector(8, S16))
6513 Opc = AArch64::ST3Threev8h;
6514 else if (Ty == LLT::fixed_vector(2, S32))
6515 Opc = AArch64::ST3Threev2s;
6516 else if (Ty == LLT::fixed_vector(4, S32))
6517 Opc = AArch64::ST3Threev4s;
6518 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6519 Opc = AArch64::ST3Threev2d;
6520 else if (Ty == S64 || Ty == P0)
6521 Opc = AArch64::ST1Threev1d;
6522 else
6523 llvm_unreachable("Unexpected type for st3!");
6524 selectVectorStoreIntrinsic(I, 3, Opc);
6525 break;
6527 case Intrinsic::aarch64_neon_st4: {
6528 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6529 unsigned Opc;
6530 if (Ty == LLT::fixed_vector(8, S8))
6531 Opc = AArch64::ST4Fourv8b;
6532 else if (Ty == LLT::fixed_vector(16, S8))
6533 Opc = AArch64::ST4Fourv16b;
6534 else if (Ty == LLT::fixed_vector(4, S16))
6535 Opc = AArch64::ST4Fourv4h;
6536 else if (Ty == LLT::fixed_vector(8, S16))
6537 Opc = AArch64::ST4Fourv8h;
6538 else if (Ty == LLT::fixed_vector(2, S32))
6539 Opc = AArch64::ST4Fourv2s;
6540 else if (Ty == LLT::fixed_vector(4, S32))
6541 Opc = AArch64::ST4Fourv4s;
6542 else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
6543 Opc = AArch64::ST4Fourv2d;
6544 else if (Ty == S64 || Ty == P0)
6545 Opc = AArch64::ST1Fourv1d;
6546 else
6547 llvm_unreachable("Unexpected type for st4!");
6548 selectVectorStoreIntrinsic(I, 4, Opc);
6549 break;
6551 case Intrinsic::aarch64_neon_st2lane: {
6552 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6553 unsigned Opc;
6554 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6555 Opc = AArch64::ST2i8;
6556 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6557 Opc = AArch64::ST2i16;
6558 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6559 Opc = AArch64::ST2i32;
6560 else if (Ty == LLT::fixed_vector(2, S64) ||
6561 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6562 Opc = AArch64::ST2i64;
6563 else
6564 llvm_unreachable("Unexpected type for st2lane!");
6565 if (!selectVectorStoreLaneIntrinsic(I, 2, Opc))
6566 return false;
6567 break;
6569 case Intrinsic::aarch64_neon_st3lane: {
6570 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6571 unsigned Opc;
6572 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6573 Opc = AArch64::ST3i8;
6574 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6575 Opc = AArch64::ST3i16;
6576 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6577 Opc = AArch64::ST3i32;
6578 else if (Ty == LLT::fixed_vector(2, S64) ||
6579 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6580 Opc = AArch64::ST3i64;
6581 else
6582 llvm_unreachable("Unexpected type for st3lane!");
6583 if (!selectVectorStoreLaneIntrinsic(I, 3, Opc))
6584 return false;
6585 break;
6587 case Intrinsic::aarch64_neon_st4lane: {
6588 LLT Ty = MRI.getType(I.getOperand(1).getReg());
6589 unsigned Opc;
6590 if (Ty == LLT::fixed_vector(8, S8) || Ty == LLT::fixed_vector(16, S8))
6591 Opc = AArch64::ST4i8;
6592 else if (Ty == LLT::fixed_vector(4, S16) || Ty == LLT::fixed_vector(8, S16))
6593 Opc = AArch64::ST4i16;
6594 else if (Ty == LLT::fixed_vector(2, S32) || Ty == LLT::fixed_vector(4, S32))
6595 Opc = AArch64::ST4i32;
6596 else if (Ty == LLT::fixed_vector(2, S64) ||
6597 Ty == LLT::fixed_vector(2, P0) || Ty == S64 || Ty == P0)
6598 Opc = AArch64::ST4i64;
6599 else
6600 llvm_unreachable("Unexpected type for st4lane!");
6601 if (!selectVectorStoreLaneIntrinsic(I, 4, Opc))
6602 return false;
6603 break;
6605 case Intrinsic::aarch64_mops_memset_tag: {
6606 // Transform
6607 // %dst:gpr(p0) = \
6608 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6609 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6610 // where %dst is updated, into
6611 // %Rd:GPR64common, %Rn:GPR64 = \
6612 // MOPSMemorySetTaggingPseudo \
6613 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6614 // where Rd and Rn are tied.
6615 // It is expected that %val has been extended to s64 in legalization.
6616 // Note that the order of the size/value operands is swapped.
6618 Register DstDef = I.getOperand(0).getReg();
6619 // I.getOperand(1) is the intrinsic function
6620 Register DstUse = I.getOperand(2).getReg();
6621 Register ValUse = I.getOperand(3).getReg();
6622 Register SizeUse = I.getOperand(4).getReg();
6624 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6625 // Therefore an additional virtual register is required for the updated size
6626 // operand. This value is not accessible via the semantics of the intrinsic.
6627 Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
6629 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6630 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6631 Memset.cloneMemRefs(I);
6632 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6633 break;
6637 I.eraseFromParent();
6638 return true;
6641 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6642 MachineRegisterInfo &MRI) {
6643 unsigned IntrinID = cast<GIntrinsic>(I).getIntrinsicID();
6645 switch (IntrinID) {
6646 default:
6647 break;
6648 case Intrinsic::aarch64_crypto_sha1h: {
6649 Register DstReg = I.getOperand(0).getReg();
6650 Register SrcReg = I.getOperand(2).getReg();
6652 // FIXME: Should this be an assert?
6653 if (MRI.getType(DstReg).getSizeInBits() != 32 ||
6654 MRI.getType(SrcReg).getSizeInBits() != 32)
6655 return false;
6657 // The operation has to happen on FPRs. Set up some new FPR registers for
6658 // the source and destination if they are on GPRs.
6659 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6660 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6661 MIB.buildCopy({SrcReg}, {I.getOperand(2)});
6663 // Make sure the copy ends up getting constrained properly.
6664 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6665 AArch64::GPR32RegClass, MRI);
6668 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6669 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6671 // Actually insert the instruction.
6672 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6673 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6675 // Did we create a new register for the destination?
6676 if (DstReg != I.getOperand(0).getReg()) {
6677 // Yep. Copy the result of the instruction back into the original
6678 // destination.
6679 MIB.buildCopy({I.getOperand(0)}, {DstReg});
6680 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6681 AArch64::GPR32RegClass, MRI);
6684 I.eraseFromParent();
6685 return true;
6687 case Intrinsic::ptrauth_resign: {
6688 Register DstReg = I.getOperand(0).getReg();
6689 Register ValReg = I.getOperand(2).getReg();
6690 uint64_t AUTKey = I.getOperand(3).getImm();
6691 Register AUTDisc = I.getOperand(4).getReg();
6692 uint64_t PACKey = I.getOperand(5).getImm();
6693 Register PACDisc = I.getOperand(6).getReg();
6695 Register AUTAddrDisc = AUTDisc;
6696 uint16_t AUTConstDiscC = 0;
6697 std::tie(AUTConstDiscC, AUTAddrDisc) =
6698 extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6700 Register PACAddrDisc = PACDisc;
6701 uint16_t PACConstDiscC = 0;
6702 std::tie(PACConstDiscC, PACAddrDisc) =
6703 extractPtrauthBlendDiscriminators(PACDisc, MRI);
6705 MIB.buildCopy({AArch64::X16}, {ValReg});
6706 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6707 MIB.buildInstr(AArch64::AUTPAC)
6708 .addImm(AUTKey)
6709 .addImm(AUTConstDiscC)
6710 .addUse(AUTAddrDisc)
6711 .addImm(PACKey)
6712 .addImm(PACConstDiscC)
6713 .addUse(PACAddrDisc)
6714 .constrainAllUses(TII, TRI, RBI);
6715 MIB.buildCopy({DstReg}, Register(AArch64::X16));
6717 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6718 I.eraseFromParent();
6719 return true;
6721 case Intrinsic::ptrauth_auth: {
6722 Register DstReg = I.getOperand(0).getReg();
6723 Register ValReg = I.getOperand(2).getReg();
6724 uint64_t AUTKey = I.getOperand(3).getImm();
6725 Register AUTDisc = I.getOperand(4).getReg();
6727 Register AUTAddrDisc = AUTDisc;
6728 uint16_t AUTConstDiscC = 0;
6729 std::tie(AUTConstDiscC, AUTAddrDisc) =
6730 extractPtrauthBlendDiscriminators(AUTDisc, MRI);
6732 MIB.buildCopy({AArch64::X16}, {ValReg});
6733 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6734 MIB.buildInstr(AArch64::AUT)
6735 .addImm(AUTKey)
6736 .addImm(AUTConstDiscC)
6737 .addUse(AUTAddrDisc)
6738 .constrainAllUses(TII, TRI, RBI);
6739 MIB.buildCopy({DstReg}, Register(AArch64::X16));
6741 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6742 I.eraseFromParent();
6743 return true;
6745 case Intrinsic::frameaddress:
6746 case Intrinsic::returnaddress: {
6747 MachineFunction &MF = *I.getParent()->getParent();
6748 MachineFrameInfo &MFI = MF.getFrameInfo();
6750 unsigned Depth = I.getOperand(2).getImm();
6751 Register DstReg = I.getOperand(0).getReg();
6752 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6754 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6755 if (!MFReturnAddr) {
6756 // Insert the copy from LR/X30 into the entry block, before it can be
6757 // clobbered by anything.
6758 MFI.setReturnAddressIsTaken(true);
6759 MFReturnAddr = getFunctionLiveInPhysReg(
6760 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6763 if (STI.hasPAuth()) {
6764 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6765 } else {
6766 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6767 MIB.buildInstr(AArch64::XPACLRI);
6768 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6771 I.eraseFromParent();
6772 return true;
6775 MFI.setFrameAddressIsTaken(true);
6776 Register FrameAddr(AArch64::FP);
6777 while (Depth--) {
6778 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6779 auto Ldr =
6780 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6781 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6782 FrameAddr = NextFrame;
6785 if (IntrinID == Intrinsic::frameaddress)
6786 MIB.buildCopy({DstReg}, {FrameAddr});
6787 else {
6788 MFI.setReturnAddressIsTaken(true);
6790 if (STI.hasPAuth()) {
6791 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6792 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6793 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6794 } else {
6795 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6796 .addImm(1);
6797 MIB.buildInstr(AArch64::XPACLRI);
6798 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6802 I.eraseFromParent();
6803 return true;
6805 case Intrinsic::aarch64_neon_tbl2:
6806 SelectTable(I, MRI, 2, AArch64::TBLv8i8Two, AArch64::TBLv16i8Two, false);
6807 return true;
6808 case Intrinsic::aarch64_neon_tbl3:
6809 SelectTable(I, MRI, 3, AArch64::TBLv8i8Three, AArch64::TBLv16i8Three,
6810 false);
6811 return true;
6812 case Intrinsic::aarch64_neon_tbl4:
6813 SelectTable(I, MRI, 4, AArch64::TBLv8i8Four, AArch64::TBLv16i8Four, false);
6814 return true;
6815 case Intrinsic::aarch64_neon_tbx2:
6816 SelectTable(I, MRI, 2, AArch64::TBXv8i8Two, AArch64::TBXv16i8Two, true);
6817 return true;
6818 case Intrinsic::aarch64_neon_tbx3:
6819 SelectTable(I, MRI, 3, AArch64::TBXv8i8Three, AArch64::TBXv16i8Three, true);
6820 return true;
6821 case Intrinsic::aarch64_neon_tbx4:
6822 SelectTable(I, MRI, 4, AArch64::TBXv8i8Four, AArch64::TBXv16i8Four, true);
6823 return true;
6824 case Intrinsic::swift_async_context_addr:
6825 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6826 {Register(AArch64::FP)})
6827 .addImm(8)
6828 .addImm(0);
6829 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6831 MF->getFrameInfo().setFrameAddressIsTaken(true);
6832 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6833 I.eraseFromParent();
6834 return true;
6836 return false;
6839 // G_PTRAUTH_GLOBAL_VALUE lowering
6841 // We have 3 lowering alternatives to choose from:
6842 // - MOVaddrPAC: similar to MOVaddr, with added PAC.
6843 // If the GV doesn't need a GOT load (i.e., is locally defined)
6844 // materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
6846 // - LOADgotPAC: similar to LOADgot, with added PAC.
6847 // If the GV needs a GOT load, materialize the pointer using the usual
6848 // GOT adrp+ldr, +pac. Pointers in the GOT are assumed to be not signed; the GOT
6849 // section is assumed to be read-only (for example, via the relro mechanism). See
6850 // LowerMOVaddrPAC.
6852 // - LOADauthptrstatic: similar to LOADgot, but use a
6853 // special stub slot instead of a GOT slot.
6854 // Load a signed pointer for symbol 'sym' from a stub slot named
6855 // 'sym$auth_ptr$key$disc', filled by the dynamic linker during relocation
6856 // resolution. This usually lowers to adrp+ldr, but also emits an entry into
6857 // .data with an
6858 // @AUTH relocation. See LowerLOADauthptrstatic.
6860 // All 3 are pseudos that are expanded late to longer sequences: this lets us
6861 // provide integrity guarantees on the to-be-signed intermediate values.
6863 // LOADauthptrstatic is undesirable because it requires a large section filled
6864 // with often similarly-signed pointers, making it a good harvesting target.
6865 // Thus, it's only used for ptrauth references to extern_weak to avoid null
6866 // checks.
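//
// For example (roughly; the precise sequence is produced by the pseudo
// expansion), a locally-defined global taken through MOVaddrPAC becomes
//   adrp x16, sym ; add x16, x16, :lo12:sym ; pac* x16, <disc>
// with the result copied out of x16, while LOADgotPAC signs the result of
// the usual GOT adrp+ldr instead.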
6868 bool AArch64InstructionSelector::selectPtrAuthGlobalValue(
6869 MachineInstr &I, MachineRegisterInfo &MRI) const {
6870 Register DefReg = I.getOperand(0).getReg();
6871 Register Addr = I.getOperand(1).getReg();
6872 uint64_t Key = I.getOperand(2).getImm();
6873 Register AddrDisc = I.getOperand(3).getReg();
6874 uint64_t Disc = I.getOperand(4).getImm();
6875 int64_t Offset = 0;
6877 if (Key > AArch64PACKey::LAST)
6878 report_fatal_error("key in ptrauth global out of range [0, " +
6879 Twine((int)AArch64PACKey::LAST) + "]");
6881 // Blend only works if the integer discriminator is 16-bit wide.
6882 if (!isUInt<16>(Disc))
6883 report_fatal_error(
6884 "constant discriminator in ptrauth global out of range [0, 0xffff]");
6886 // Choosing between 3 lowering alternatives is target-specific.
6887 if (!STI.isTargetELF() && !STI.isTargetMachO())
6888 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
6890 if (!MRI.hasOneDef(Addr))
6891 return false;
6893 // First match any offset we take from the real global.
6894 const MachineInstr *DefMI = &*MRI.def_instr_begin(Addr);
6895 if (DefMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
6896 Register OffsetReg = DefMI->getOperand(2).getReg();
6897 if (!MRI.hasOneDef(OffsetReg))
6898 return false;
6899 const MachineInstr &OffsetMI = *MRI.def_instr_begin(OffsetReg);
6900 if (OffsetMI.getOpcode() != TargetOpcode::G_CONSTANT)
6901 return false;
6903 Addr = DefMI->getOperand(1).getReg();
6904 if (!MRI.hasOneDef(Addr))
6905 return false;
6907 DefMI = &*MRI.def_instr_begin(Addr);
6908 Offset = OffsetMI.getOperand(1).getCImm()->getSExtValue();
6911 // We should be left with a genuine unauthenticated GlobalValue.
6912 const GlobalValue *GV;
6913 if (DefMI->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
6914 GV = DefMI->getOperand(1).getGlobal();
6915 Offset += DefMI->getOperand(1).getOffset();
6916 } else if (DefMI->getOpcode() == AArch64::G_ADD_LOW) {
6917 GV = DefMI->getOperand(2).getGlobal();
6918 Offset += DefMI->getOperand(2).getOffset();
6919 } else {
6920 return false;
6923 MachineIRBuilder MIB(I);
6925 // Classify the reference to determine whether it needs a GOT load.
6926 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
6927 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
6928 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
6929 "unsupported non-GOT op flags on ptrauth global reference");
6930 assert((!GV->hasExternalWeakLinkage() || NeedsGOTLoad) &&
6931 "unsupported non-GOT reference to weak ptrauth global");
6933 std::optional<APInt> AddrDiscVal = getIConstantVRegVal(AddrDisc, MRI);
6934 bool HasAddrDisc = !AddrDiscVal || *AddrDiscVal != 0;
6936 // Non-extern_weak:
6937 // - No GOT load needed -> MOVaddrPAC
6938 // - GOT load for non-extern_weak -> LOADgotPAC
6939 // Note that we disallow extern_weak refs to avoid null checks later.
6940 if (!GV->hasExternalWeakLinkage()) {
6941 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X16}, {});
6942 MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {AArch64::X17}, {});
6943 MIB.buildInstr(NeedsGOTLoad ? AArch64::LOADgotPAC : AArch64::MOVaddrPAC)
6944 .addGlobalAddress(GV, Offset)
6945 .addImm(Key)
6946 .addReg(HasAddrDisc ? AddrDisc : AArch64::XZR)
6947 .addImm(Disc)
6948 .constrainAllUses(TII, TRI, RBI);
6949 MIB.buildCopy(DefReg, Register(AArch64::X16));
6950 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6951 I.eraseFromParent();
6952 return true;
6955 // extern_weak -> LOADauthptrstatic
6957 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
6958 // offset alone as a pointer if the symbol wasn't available, which would
6959 // probably break null checks in users. Ptrauth complicates things further:
6960 // error out.
6961 if (Offset != 0)
6962 report_fatal_error(
6963 "unsupported non-zero offset in weak ptrauth global reference");
6965 if (HasAddrDisc)
6966 report_fatal_error("unsupported weak addr-div ptrauth global");
6968 MIB.buildInstr(AArch64::LOADauthptrstatic, {DefReg}, {})
6969 .addGlobalAddress(GV, Offset)
6970 .addImm(Key)
6971 .addImm(Disc);
6972 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
6974 I.eraseFromParent();
6975 return true;
6978 void AArch64InstructionSelector::SelectTable(MachineInstr &I,
6979 MachineRegisterInfo &MRI,
6980 unsigned NumVec, unsigned Opc1,
6981 unsigned Opc2, bool isExt) {
6982 Register DstReg = I.getOperand(0).getReg();
6983 unsigned Opc = MRI.getType(DstReg) == LLT::fixed_vector(8, 8) ? Opc1 : Opc2;
6985 // Create the REG_SEQUENCE
6986 SmallVector<Register, 4> Regs;
6987 for (unsigned i = 0; i < NumVec; i++)
6988 Regs.push_back(I.getOperand(i + 2 + isExt).getReg());
6989 Register RegSeq = createQTuple(Regs, MIB);
6991 Register IdxReg = I.getOperand(2 + NumVec + isExt).getReg();
6992 MachineInstrBuilder Instr;
6993 if (isExt) {
6994 Register Reg = I.getOperand(2).getReg();
6995 Instr = MIB.buildInstr(Opc, {DstReg}, {Reg, RegSeq, IdxReg});
6996 } else
6997 Instr = MIB.buildInstr(Opc, {DstReg}, {RegSeq, IdxReg});
6998 constrainSelectedInstRegOperands(*Instr, TII, TRI, RBI);
6999 I.eraseFromParent();
7002 InstructionSelector::ComplexRendererFns
7003 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
7004 auto MaybeImmed = getImmedFromMO(Root);
7005 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7006 return std::nullopt;
7007 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
7008 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7011 InstructionSelector::ComplexRendererFns
7012 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
7013 auto MaybeImmed = getImmedFromMO(Root);
7014 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
7015 return std::nullopt;
7016 uint64_t Enc = 31 - *MaybeImmed;
7017 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7020 InstructionSelector::ComplexRendererFns
7021 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
7022 auto MaybeImmed = getImmedFromMO(Root);
7023 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7024 return std::nullopt;
7025 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
7026 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
7029 InstructionSelector::ComplexRendererFns
7030 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
7031 auto MaybeImmed = getImmedFromMO(Root);
7032 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
7033 return std::nullopt;
7034 uint64_t Enc = 63 - *MaybeImmed;
7035 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
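// For example (illustrative), selecting "lsl w0, w1, #3" as a UBFM-style
// bitfield move uses immr = (32 - 3) & 0x1f = 29 (selectShiftA_32) and
// imms = 31 - 3 = 28 (selectShiftB_32), i.e. "ubfm w0, w1, #29, #28".
// The _64 variants above do the same arithmetic modulo 64.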
7038 /// Helper to select an immediate value that can be represented as a 12-bit
7039 /// value shifted left by either 0 or 12. If it is possible to do so, return
7040 /// the immediate and shift value. If not, return std::nullopt.
7042 /// Used by selectArithImmed and selectNegArithImmed.
7043 InstructionSelector::ComplexRendererFns
7044 AArch64InstructionSelector::select12BitValueWithLeftShift(
7045 uint64_t Immed) const {
7046 unsigned ShiftAmt;
7047 if (Immed >> 12 == 0) {
7048 ShiftAmt = 0;
7049 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
7050 ShiftAmt = 12;
7051 Immed = Immed >> 12;
7052 } else
7053 return std::nullopt;
7055 unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
7056 return {{
7057 [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
7058 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
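// For example, 0x123 is selected as (imm = 0x123, LSL #0), 0x123000 as
// (imm = 0x123, LSL #12), while 0x123001 has bits in both halves and is
// rejected with std::nullopt.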
7062 /// SelectArithImmed - Select an immediate value that can be represented as
7063 /// a 12-bit value shifted left by either 0 or 12. If so, return true with
7064 /// Val set to the 12-bit value and Shift set to the shifter operand.
7065 InstructionSelector::ComplexRendererFns
7066 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
7067 // This function is called from the addsub_shifted_imm ComplexPattern,
7068 // which lists [imm] as the list of opcodes it's interested in; however,
7069 // we still need to check whether the operand is actually an immediate
7070 // here because the ComplexPattern opcode list is only used in
7071 // root-level opcode matching.
7072 auto MaybeImmed = getImmedFromMO(Root);
7073 if (MaybeImmed == std::nullopt)
7074 return std::nullopt;
7075 return select12BitValueWithLeftShift(*MaybeImmed);
7078 /// SelectNegArithImmed - As above, but negates the value before trying to
7079 /// select it.
7080 InstructionSelector::ComplexRendererFns
7081 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
7082 // We need a register here, because we need to know if we have a 64 or 32
7083 // bit immediate.
7084 if (!Root.isReg())
7085 return std::nullopt;
7086 auto MaybeImmed = getImmedFromMO(Root);
7087 if (MaybeImmed == std::nullopt)
7088 return std::nullopt;
7089 uint64_t Immed = *MaybeImmed;
7091 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
7092 // have the opposite effect on the C flag, so this pattern mustn't match under
7093 // those circumstances.
7094 if (Immed == 0)
7095 return std::nullopt;
7097 // Check whether we're dealing with a 32-bit or a 64-bit type on the
7098 // root.
7099 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7100 if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
7101 Immed = ~((uint32_t)Immed) + 1;
7102 else
7103 Immed = ~Immed + 1ULL;
7105 if (Immed & 0xFFFFFFFFFF000000ULL)
7106 return std::nullopt;
7108 Immed &= 0xFFFFFFULL;
7109 return select12BitValueWithLeftShift(Immed);
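// For example (illustrative), comparing a 32-bit value against -5 gives
// Immed = 0xfffffffb; negating yields 5, which fits the 12-bit form, so the
// compare can be selected as a CMN/ADDS with #5 instead of materializing -5.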
7112 /// Checks if we are sure that folding MI into load/store addressing mode is
7113 /// beneficial or not.
7115 /// Returns:
7116 /// - true if folding MI would be beneficial.
7117 /// - false if folding MI would be bad.
7118 /// - std::nullopt if it is not sure whether folding MI is beneficial.
7120 /// \p MI can be the offset operand of G_PTR_ADD, e.g. G_SHL in the example:
7122 /// %13:gpr(s64) = G_CONSTANT i64 1
7123 /// %8:gpr(s64) = G_SHL %6, %13(s64)
7124 /// %9:gpr(p0) = G_PTR_ADD %0, %8(s64)
7125 /// %12:gpr(s32) = G_LOAD %9(p0) :: (load (s16))
7126 std::optional<bool> AArch64InstructionSelector::isWorthFoldingIntoAddrMode(
7127 MachineInstr &MI, const MachineRegisterInfo &MRI) const {
7128 if (MI.getOpcode() == AArch64::G_SHL) {
7129 // Address operands with shifts are free, except when running on subtargets
7130 // with AddrLSLSlow14.
7131 if (const auto ValAndVReg = getIConstantVRegValWithLookThrough(
7132 MI.getOperand(2).getReg(), MRI)) {
7133 const APInt ShiftVal = ValAndVReg->Value;
7135 // Don't fold if we know this will be slow.
7136 return !(STI.hasAddrLSLSlow14() && (ShiftVal == 1 || ShiftVal == 4));
7139 return std::nullopt;
7142 /// Return true if it is worth folding MI into an extended register. That is,
7143 /// if it's safe to pull it into the addressing mode of a load or store as a
7144 /// shift.
7145 /// \p IsAddrOperand whether the def of MI is used as an address operand
7146 /// (e.g. feeding into an LDR/STR).
7147 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
7148 MachineInstr &MI, const MachineRegisterInfo &MRI,
7149 bool IsAddrOperand) const {
7151 // Always fold if there is one use, or if we're optimizing for size.
7152 Register DefReg = MI.getOperand(0).getReg();
7153 if (MRI.hasOneNonDBGUse(DefReg) ||
7154 MI.getParent()->getParent()->getFunction().hasOptSize())
7155 return true;
7157 if (IsAddrOperand) {
7158 // If we are already sure that folding MI is good or bad, return the result.
7159 if (const auto Worth = isWorthFoldingIntoAddrMode(MI, MRI))
7160 return *Worth;
7162 // Fold G_PTR_ADD if its offset operand can be folded
7163 if (MI.getOpcode() == AArch64::G_PTR_ADD) {
7164 MachineInstr *OffsetInst =
7165 getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
7167 // Note, we already know G_PTR_ADD is used by at least two instructions.
7168 // If we are also sure about whether folding is beneficial or not,
7169 // return the result.
7170 if (const auto Worth = isWorthFoldingIntoAddrMode(*OffsetInst, MRI))
7171 return *Worth;
7175 // FIXME: Consider checking HasALULSLFast as appropriate.
7177 // We have a fastpath, so folding a shift in and potentially computing it
7178 // many times may be beneficial. Check if this is only used in memory ops.
7179 // If it is, then we should fold.
7180 return all_of(MRI.use_nodbg_instructions(DefReg),
7181 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
7184 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
7185 switch (Type) {
7186 case AArch64_AM::SXTB:
7187 case AArch64_AM::SXTH:
7188 case AArch64_AM::SXTW:
7189 return true;
7190 default:
7191 return false;
7195 InstructionSelector::ComplexRendererFns
7196 AArch64InstructionSelector::selectExtendedSHL(
7197 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
7198 unsigned SizeInBytes, bool WantsExt) const {
7199 assert(Base.isReg() && "Expected base to be a register operand");
7200 assert(Offset.isReg() && "Expected offset to be a register operand");
7202 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7203 MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
7205 unsigned OffsetOpc = OffsetInst->getOpcode();
7206 bool LookedThroughZExt = false;
7207 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
7208 // Try to look through a ZEXT.
7209 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
7210 return std::nullopt;
7212 OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
7213 OffsetOpc = OffsetInst->getOpcode();
7214 LookedThroughZExt = true;
7216 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
7217 return std::nullopt;
7219 // Make sure that the memory op is a valid size.
7220 int64_t LegalShiftVal = Log2_32(SizeInBytes);
7221 if (LegalShiftVal == 0)
7222 return std::nullopt;
7223 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7224 return std::nullopt;
7226 // Now, try to find the specific G_CONSTANT. Start by assuming that the
7227 // register we will offset is the LHS, and the register containing the
7228 // constant is the RHS.
7229 Register OffsetReg = OffsetInst->getOperand(1).getReg();
7230 Register ConstantReg = OffsetInst->getOperand(2).getReg();
7231 auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7232 if (!ValAndVReg) {
7233 // We didn't get a constant on the RHS. If the opcode is a shift, then
7234 // we're done.
7235 if (OffsetOpc == TargetOpcode::G_SHL)
7236 return std::nullopt;
7238 // If we have a G_MUL, we can use either register. Try looking at the RHS.
7239 std::swap(OffsetReg, ConstantReg);
7240 ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
7241 if (!ValAndVReg)
7242 return std::nullopt;
7245 // The value must fit into 3 bits, and must be positive. Make sure that is
7246 // true.
7247 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
7249 // Since we're going to pull this into a shift, the constant value must be
7250 // a power of 2. If we got a multiply, then we need to check this.
7251 if (OffsetOpc == TargetOpcode::G_MUL) {
7252 if (!llvm::has_single_bit<uint32_t>(ImmVal))
7253 return std::nullopt;
7255 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
7256 ImmVal = Log2_32(ImmVal);
7259 if ((ImmVal & 0x7) != ImmVal)
7260 return std::nullopt;
7262 // We are only allowed to shift by LegalShiftVal. This shift value is built
7263 // into the instruction, so we can't just use whatever we want.
7264 if (ImmVal != LegalShiftVal)
7265 return std::nullopt;
7267 unsigned SignExtend = 0;
7268 if (WantsExt) {
7269 // Check if the offset is defined by an extend, unless we looked through a
7270 // G_ZEXT earlier.
7271 if (!LookedThroughZExt) {
7272 MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
7273 auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
7274 if (Ext == AArch64_AM::InvalidShiftExtend)
7275 return std::nullopt;
7277 SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
7278 // We only support SXTW for signed extension here.
7279 if (SignExtend && Ext != AArch64_AM::SXTW)
7280 return std::nullopt;
7281 OffsetReg = ExtInst->getOperand(1).getReg();
7284 // Need a 32-bit wide register here.
7285 MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
7286 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
7289 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
7290 // offset. Signify that we are shifting by setting the shift flag to 1.
7291 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
7292 [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
7293 [=](MachineInstrBuilder &MIB) {
7294 // Need to add both immediates here to make sure that they are both
7295 // added to the instruction.
7296 MIB.addImm(SignExtend);
7297 MIB.addImm(1);
7298 }}};
7301 /// This is used for computing addresses like this:
7303 /// ldr x1, [x2, x3, lsl #3]
7305 /// Where x2 is the base register, and x3 is an offset register. The shift-left
7306 /// is a constant value specific to this load instruction. That is, we'll never
7307 /// see anything other than a 3 here (which corresponds to the size of the
7308 /// element being loaded.)
7309 InstructionSelector::ComplexRendererFns
7310 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
7311 MachineOperand &Root, unsigned SizeInBytes) const {
7312 if (!Root.isReg())
7313 return std::nullopt;
7314 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7316 // We want to find something like this:
7318 // val = G_CONSTANT LegalShiftVal
7319 // shift = G_SHL off_reg val
7320 // ptr = G_PTR_ADD base_reg shift
7321 // x = G_LOAD ptr
7323 // And fold it into this addressing mode:
7325 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7327 // Check if we can find the G_PTR_ADD.
7328 MachineInstr *PtrAdd =
7329 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7330 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7331 return std::nullopt;
7333 // Now, try to match an opcode which will match our specific offset.
7334 // We want a G_SHL or a G_MUL.
7335 MachineInstr *OffsetInst =
7336 getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
7337 return selectExtendedSHL(Root, PtrAdd->getOperand(1),
7338 OffsetInst->getOperand(0), SizeInBytes,
7339 /*WantsExt=*/false);
7342 /// This is used for computing addresses like this:
7344 /// ldr x1, [x2, x3]
7346 /// Where x2 is the base register, and x3 is an offset register.
7348 /// When possible (or profitable) to fold a G_PTR_ADD into the address
7349 /// calculation, this will do so. Otherwise, it will return std::nullopt.
7350 InstructionSelector::ComplexRendererFns
7351 AArch64InstructionSelector::selectAddrModeRegisterOffset(
7352 MachineOperand &Root) const {
7353 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7355 // We need a GEP.
7356 MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
7357 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7358 return std::nullopt;
7360 // If this is used more than once, let's not bother folding.
7361 // TODO: Check if they are memory ops. If they are, then we can still fold
7362 // without having to recompute anything.
7363 if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
7364 return std::nullopt;
7366 // Base is the GEP's LHS, offset is its RHS.
7367 return {{[=](MachineInstrBuilder &MIB) {
7368 MIB.addUse(Gep->getOperand(1).getReg());
7370 [=](MachineInstrBuilder &MIB) {
7371 MIB.addUse(Gep->getOperand(2).getReg());
7373 [=](MachineInstrBuilder &MIB) {
7374 // Need to add both immediates here to make sure that they are both
7375 // added to the instruction.
7376 MIB.addImm(0);
7377 MIB.addImm(0);
7378 }}};
7381 /// This is intended to be equivalent to selectAddrModeXRO in
7382 /// AArch64ISelDAGToDAG. It's used for selecting X register offset loads.
7383 InstructionSelector::ComplexRendererFns
7384 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7385 unsigned SizeInBytes) const {
7386 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7387 if (!Root.isReg())
7388 return std::nullopt;
7389 MachineInstr *PtrAdd =
7390 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7391 if (!PtrAdd)
7392 return std::nullopt;
7394 // Check for an immediate which cannot be encoded in the [base + imm]
7395 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7396 // end up with code like:
7398 // mov x0, wide
7399 // add x1, base, x0
7400 // ldr x2, [x1]
7402 // In this situation, we can use the [base, xreg] addressing mode to save an
7403 // add/sub:
7405 // mov x0, wide
7406 // ldr x2, [base, x0]
7407 auto ValAndVReg =
7408 getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
7409 if (ValAndVReg) {
7410 unsigned Scale = Log2_32(SizeInBytes);
7411 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7413 // Skip immediates that can be selected in the load/store addressing
7414 // mode.
7415 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7416 ImmOff < (0x1000 << Scale))
7417 return std::nullopt;
7419 // Helper lambda to decide whether or not it is preferable to emit an add.
7420 auto isPreferredADD = [](int64_t ImmOff) {
7421 // Constants in [0x0, 0xfff] can be encoded in an add.
7422 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7423 return true;
7425 // Can it be encoded in an add lsl #12?
7426 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7427 return false;
7429 // It can be encoded in an add lsl #12, but we may not want to. If it is
7430 // possible to select this as a single movz, then prefer that. A single
7431 // movz is faster than an add with a shift.
7432 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7433 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7436 // If the immediate can be encoded in a single add/sub, then bail out.
7437 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7438 return std::nullopt;
7441 // Try to fold shifts into the addressing mode.
7442 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7443 if (AddrModeFns)
7444 return AddrModeFns;
7446 // If that doesn't work, see if it's possible to fold in registers from
7447 // a GEP.
7448 return selectAddrModeRegisterOffset(Root);
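// For example (illustrative), an offset of 0x40004 with an 8-byte access is
// neither a legal scaled/unscaled immediate nor encodable as a single
// add/add-lsl-#12, so the wide constant stays in a register and the access
// is selected as "ldr xN, [xBase, xOff]" instead of an add followed by a
// load.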
7451 /// This is used for computing addresses like this:
7453 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7455 /// Where we have a 64-bit base register, a 32-bit offset register, and an
7456 /// extend (which may or may not be signed).
7457 InstructionSelector::ComplexRendererFns
7458 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7459 unsigned SizeInBytes) const {
7460 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7462 MachineInstr *PtrAdd =
7463 getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
7464 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI, true))
7465 return std::nullopt;
7467 MachineOperand &LHS = PtrAdd->getOperand(1);
7468 MachineOperand &RHS = PtrAdd->getOperand(2);
7469 MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
7471 // The first case is the same as selectAddrModeXRO, except we need an extend.
7472 // In this case, we try to find a shift and extend, and fold them into the
7473 // addressing mode.
7475 // E.g.
7477 // off_reg = G_Z/S/ANYEXT ext_reg
7478 // val = G_CONSTANT LegalShiftVal
7479 // shift = G_SHL off_reg val
7480 // ptr = G_PTR_ADD base_reg shift
7481 // x = G_LOAD ptr
7483 // In this case we can get a load like this:
7485 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7486 auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
7487 SizeInBytes, /*WantsExt=*/true);
7488 if (ExtendedShl)
7489 return ExtendedShl;
7491 // There was no shift. We can try to fold a G_Z/S/ANYEXT in on its own though.
7493 // e.g.
7494 // ldr something, [base_reg, ext_reg, sxtw]
7495 if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI, true))
7496 return std::nullopt;
7498 // Check if this is an extend. We'll get an extend type if it is.
7499 AArch64_AM::ShiftExtendType Ext =
7500 getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
7501 if (Ext == AArch64_AM::InvalidShiftExtend)
7502 return std::nullopt;
7504 // Need a 32-bit wide register.
7505 MachineIRBuilder MIB(*PtrAdd);
7506 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7507 AArch64::GPR32RegClass, MIB);
7508 unsigned SignExtend = Ext == AArch64_AM::SXTW;
7510 // Base is LHS, offset is ExtReg.
7511 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
7512 [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7513 [=](MachineInstrBuilder &MIB) {
7514 MIB.addImm(SignExtend);
7515 MIB.addImm(0);
7516 }}};
7519 /// Select a "register plus unscaled signed 9-bit immediate" address. This
7520 /// should only match when there is an offset that is not valid for a scaled
7521 /// immediate addressing mode. The "Size" argument is the size in bytes of the
7522 /// memory reference, which is needed here to know what is valid for a scaled
7523 /// immediate.
7524 InstructionSelector::ComplexRendererFns
7525 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7526 unsigned Size) const {
7527 MachineRegisterInfo &MRI =
7528 Root.getParent()->getParent()->getParent()->getRegInfo();
7530 if (!Root.isReg())
7531 return std::nullopt;
7533 if (!isBaseWithConstantOffset(Root, MRI))
7534 return std::nullopt;
7536 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7538 MachineOperand &OffImm = RootDef->getOperand(2);
7539 if (!OffImm.isReg())
7540 return std::nullopt;
7541 MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
7542 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7543 return std::nullopt;
7544 int64_t RHSC;
7545 MachineOperand &RHSOp1 = RHS->getOperand(1);
7546 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7547 return std::nullopt;
7548 RHSC = RHSOp1.getCImm()->getSExtValue();
7550 if (RHSC >= -256 && RHSC < 256) {
7551 MachineOperand &Base = RootDef->getOperand(1);
7552 return {{
7553 [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
7554 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
7557 return std::nullopt;
7560 InstructionSelector::ComplexRendererFns
7561 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7562 unsigned Size,
7563 MachineRegisterInfo &MRI) const {
7564 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7565 return std::nullopt;
7566 MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
7567 if (Adrp.getOpcode() != AArch64::ADRP)
7568 return std::nullopt;
7570 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7571 auto Offset = Adrp.getOperand(1).getOffset();
7572 if (Offset % Size != 0)
7573 return std::nullopt;
7575 auto GV = Adrp.getOperand(1).getGlobal();
7576 if (GV->isThreadLocal())
7577 return std::nullopt;
7579 auto &MF = *RootDef.getParent()->getParent();
7580 if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
7581 return std::nullopt;
7583 unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
7584 MachineIRBuilder MIRBuilder(RootDef);
7585 Register AdrpReg = Adrp.getOperand(0).getReg();
7586 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
7587 [=](MachineInstrBuilder &MIB) {
7588 MIB.addGlobalAddress(GV, Offset,
7589 OpFlags | AArch64II::MO_PAGEOFF |
7590 AArch64II::MO_NC);
7591 }}};
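// For example (illustrative), given "%a = ADRP @g" and "%p = G_ADD_LOW %a,
// @g", a load of %p can fold the G_ADD_LOW into its immediate operand,
// yielding "adrp x8, g" followed by "ldr x0, [x8, :lo12:g]".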
7594 /// Select a "register plus scaled unsigned 12-bit immediate" address. The
7595 /// "Size" argument is the size in bytes of the memory reference, which
7596 /// determines the scale.
7597 InstructionSelector::ComplexRendererFns
7598 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7599 unsigned Size) const {
7600 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7601 MachineRegisterInfo &MRI = MF.getRegInfo();
7603 if (!Root.isReg())
7604 return std::nullopt;
7606 MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
7607 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7608 return {{
7609 [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
7610 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7614 CodeModel::Model CM = MF.getTarget().getCodeModel();
7615 // Check if we can fold in the ADD of small code model ADRP + ADD address.
7616 if (CM == CodeModel::Small) {
7617 auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
7618 if (OpFns)
7619 return OpFns;
7622 if (isBaseWithConstantOffset(Root, MRI)) {
7623 MachineOperand &LHS = RootDef->getOperand(1);
7624 MachineOperand &RHS = RootDef->getOperand(2);
7625 MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
7626 MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
7628 int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
7629 unsigned Scale = Log2_32(Size);
7630 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7631 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7632 return {{
7633 [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
7634 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7637 return {{
7638 [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
7639 [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
7644 // Before falling back to our general case, check if the unscaled
7645 // instructions can handle this. If so, that's preferable.
7646 if (selectAddrModeUnscaled(Root, Size))
7647 return std::nullopt;
7649 return {{
7650 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
7651 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
7655 /// Given a shift instruction, return the correct shift type for that
7656 /// instruction.
7657 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7658 switch (MI.getOpcode()) {
7659 default:
7660 return AArch64_AM::InvalidShiftExtend;
7661 case TargetOpcode::G_SHL:
7662 return AArch64_AM::LSL;
7663 case TargetOpcode::G_LSHR:
7664 return AArch64_AM::LSR;
7665 case TargetOpcode::G_ASHR:
7666 return AArch64_AM::ASR;
7667 case TargetOpcode::G_ROTR:
7668 return AArch64_AM::ROR;
7672 /// Select a "shifted register" operand. If the value is not shifted, set the
7673 /// shift operand to a default value of "lsl 0".
7674 InstructionSelector::ComplexRendererFns
7675 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7676 bool AllowROR) const {
7677 if (!Root.isReg())
7678 return std::nullopt;
7679 MachineRegisterInfo &MRI =
7680 Root.getParent()->getParent()->getParent()->getRegInfo();
7682 // Check if the operand is defined by an instruction which corresponds to
7683 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7684 MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
7685 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
7686 if (ShType == AArch64_AM::InvalidShiftExtend)
7687 return std::nullopt;
7688 if (ShType == AArch64_AM::ROR && !AllowROR)
7689 return std::nullopt;
7690 if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI, false))
7691 return std::nullopt;
7693 // Need an immediate on the RHS.
7694 MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
7695 auto Immed = getImmedFromMO(ShiftRHS);
7696 if (!Immed)
7697 return std::nullopt;
7699 // We have something that we can fold. Fold in the shift's LHS and RHS into
7700 // the instruction.
7701 MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
7702 Register ShiftReg = ShiftLHS.getReg();
7704 unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
7705 unsigned Val = *Immed & (NumBits - 1);
7706 unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
7708 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
7709 [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
7710 }
7712 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7713 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7714 unsigned Opc = MI.getOpcode();
7716 // Handle explicit extend instructions first.
7717 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7718 unsigned Size;
7719 if (Opc == TargetOpcode::G_SEXT)
7720 Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7721 else
7722 Size = MI.getOperand(2).getImm();
7723 assert(Size != 64 && "Extend from 64 bits?");
7724 switch (Size) {
7725 case 8:
7726 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7727 case 16:
7728 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7729 case 32:
7730 return AArch64_AM::SXTW;
7731 default:
7732 return AArch64_AM::InvalidShiftExtend;
7733 }
7734 }
7736 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7737 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
7738 assert(Size != 64 && "Extend from 64 bits?");
7739 switch (Size) {
7740 case 8:
7741 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7742 case 16:
7743 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7744 case 32:
7745 return AArch64_AM::UXTW;
7746 default:
7747 return AArch64_AM::InvalidShiftExtend;
7748 }
7749 }
7751 // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7752 // on the RHS.
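// Schematically, "%m:gpr(s64) = G_AND %x, 0xFF" behaves like a zero-extend
// from 8 bits (UXTB); likewise 0xFFFF maps to UXTH and 0xFFFFFFFF to UXTW.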
7753 if (Opc != TargetOpcode::G_AND)
7754 return AArch64_AM::InvalidShiftExtend;
7756 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
7757 if (!MaybeAndMask)
7758 return AArch64_AM::InvalidShiftExtend;
7759 uint64_t AndMask = *MaybeAndMask;
7760 switch (AndMask) {
7761 default:
7762 return AArch64_AM::InvalidShiftExtend;
7763 case 0xFF:
7764 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7765 case 0xFFFF:
7766 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7767 case 0xFFFFFFFF:
7768 return AArch64_AM::UXTW;
7769 }
7770 }
7772 Register AArch64InstructionSelector::moveScalarRegClass(
7773 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7774 MachineRegisterInfo &MRI = *MIB.getMRI();
7775 auto Ty = MRI.getType(Reg);
7776 assert(!Ty.isVector() && "Expected scalars only!");
7777 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7778 return Reg;
7780 // Create a copy and immediately select it.
7781 // FIXME: We should have an emitCopy function?
7782 auto Copy = MIB.buildCopy({&RC}, {Reg});
7783 selectCopy(*Copy, TII, MRI, TRI, RBI);
7784 return Copy.getReg(0);
7785 }
7787 /// Select an "extended register" operand. This operand folds in an extend
7788 /// followed by an optional left shift.
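///
/// For example (schematic MIR, hypothetical values):
///   %ext:gpr(s64) = G_ZEXT %b:gpr(s8)
///   %root:gpr(s64) = G_SHL %ext, 2
/// is rendered as the single operand "%b, uxtb #2" of an extended-register
/// instruction such as ADDXrx.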
7789 InstructionSelector::ComplexRendererFns
7790 AArch64InstructionSelector::selectArithExtendedRegister(
7791 MachineOperand &Root) const {
7792 if (!Root.isReg())
7793 return std::nullopt;
7794 MachineRegisterInfo &MRI =
7795 Root.getParent()->getParent()->getParent()->getRegInfo();
7797 uint64_t ShiftVal = 0;
7798 Register ExtReg;
7799 AArch64_AM::ShiftExtendType Ext;
7800 MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
7801 if (!RootDef)
7802 return std::nullopt;
7804 if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI, false))
7805 return std::nullopt;
7807 // Check if we can fold a shift and an extend.
7808 if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7809 // Look for a constant on the RHS of the shift.
7810 MachineOperand &RHS = RootDef->getOperand(2);
7811 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
7812 if (!MaybeShiftVal)
7813 return std::nullopt;
7814 ShiftVal = *MaybeShiftVal;
7815 if (ShiftVal > 4)
7816 return std::nullopt;
7817 // Look for a valid extend instruction on the LHS of the shift.
7818 MachineOperand &LHS = RootDef->getOperand(1);
7819 MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
7820 if (!ExtDef)
7821 return std::nullopt;
7822 Ext = getExtendTypeForInst(*ExtDef, MRI);
7823 if (Ext == AArch64_AM::InvalidShiftExtend)
7824 return std::nullopt;
7825 ExtReg = ExtDef->getOperand(1).getReg();
7826 } else {
7827 // Didn't get a shift. Try just folding an extend.
7828 Ext = getExtendTypeForInst(*RootDef, MRI);
7829 if (Ext == AArch64_AM::InvalidShiftExtend)
7830 return std::nullopt;
7831 ExtReg = RootDef->getOperand(1).getReg();
7833 // If we have a 32 bit instruction which zeroes out the high half of a
7834 // register, we get an implicit zero extend for free. Check if we have one.
7835 // FIXME: We actually emit the extend right now even though we don't have
7836 // to.
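// (On AArch64, any write to a W register zeroes bits [63:32] of the
// corresponding X register, which is what makes the extend free here.)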
7837 if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
7838 MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
7839 if (isDef32(*ExtInst))
7840 return std::nullopt;
7841 }
7842 }
7844 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7845 // copy.
7846 MachineIRBuilder MIB(*RootDef);
7847 ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7849 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
7850 [=](MachineInstrBuilder &MIB) {
7851 MIB.addImm(getArithExtendImm(Ext, ShiftVal));
7852 }}};
7853 }
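/// Select the high half of a wider vector value, e.g. the second result of a
/// G_UNMERGE_VALUES or an extract of lane 1 from a <2 x s64> vector (looking
/// through little-endian bitcasts), so that instructions which read the high
/// half can be given the original wide register directly.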
7855 InstructionSelector::ComplexRendererFns
7856 AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7857 if (!Root.isReg())
7858 return std::nullopt;
7859 MachineRegisterInfo &MRI =
7860 Root.getParent()->getParent()->getParent()->getRegInfo();
7862 auto Extract = getDefSrcRegIgnoringCopies(Root.getReg(), MRI);
7863 while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7864 STI.isLittleEndian())
7865 Extract =
7866 getDefSrcRegIgnoringCopies(Extract->MI->getOperand(1).getReg(), MRI);
7867 if (!Extract)
7868 return std::nullopt;
7870 if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7871 if (Extract->Reg == Extract->MI->getOperand(1).getReg()) {
7872 Register ExtReg = Extract->MI->getOperand(2).getReg();
7873 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7874 }
7875 }
7876 if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7877 LLT SrcTy = MRI.getType(Extract->MI->getOperand(1).getReg());
7878 auto LaneIdx = getIConstantVRegValWithLookThrough(
7879 Extract->MI->getOperand(2).getReg(), MRI);
7880 if (LaneIdx && SrcTy == LLT::fixed_vector(2, 64) &&
7881 LaneIdx->Value.getSExtValue() == 1) {
7882 Register ExtReg = Extract->MI->getOperand(1).getReg();
7883 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }}};
7884 }
7885 }
7887 return std::nullopt;
7888 }
7890 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7891 const MachineInstr &MI,
7892 int OpIdx) const {
7893 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7894 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7895 "Expected G_CONSTANT");
7896 std::optional<int64_t> CstVal =
7897 getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
7898 assert(CstVal && "Expected constant value");
7899 MIB.addImm(*CstVal);
7900 }
7902 void AArch64InstructionSelector::renderLogicalImm32(
7903 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7904 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7905 "Expected G_CONSTANT");
7906 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7907 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
7908 MIB.addImm(Enc);
7909 }
7911 void AArch64InstructionSelector::renderLogicalImm64(
7912 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7913 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7914 "Expected G_CONSTANT");
7915 uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
7916 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
7917 MIB.addImm(Enc);
7918 }
7920 void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7921 const MachineInstr &MI,
7922 int OpIdx) const {
7923 assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7924 "Expected G_UBSANTRAP");
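// For example, 'U' is 0x55, so a G_UBSANTRAP with kind 0x01 yields the
// immediate 0x5501 (a BRK #0x5501 once selected).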
7925 MIB.addImm(MI.getOperand(0).getImm() | ('U' << 8));
7926 }
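// Note on the FP immediate renderers below: they assume the constant is
// FMOV-encodable, i.e. representable as +/-(1 + m/16) * 2^e with m in [0, 15]
// and e in [-3, 4]. For example 2.0, 0.5 and 31.0 are encodable while 0.1 is
// not; that check is expected to have been done by the importing pattern's
// immediate predicate before these renderers run.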
7928 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7929 const MachineInstr &MI,
7930 int OpIdx) const {
7931 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7932 "Expected G_FCONSTANT");
7933 MIB.addImm(
7934 AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7935 }
7937 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7938 const MachineInstr &MI,
7939 int OpIdx) const {
7940 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7941 "Expected G_FCONSTANT");
7942 MIB.addImm(
7943 AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7944 }
7946 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7947 const MachineInstr &MI,
7948 int OpIdx) const {
7949 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7950 "Expected G_FCONSTANT");
7951 MIB.addImm(
7952 AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
7953 }
7955 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7956 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7957 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7958 "Expected G_FCONSTANT");
7959 MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
7960 .getFPImm()
7961 ->getValueAPF()
7962 .bitcastToAPInt()
7963 .getZExtValue()));
7964 }
7966 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7967 const MachineInstr &MI, unsigned NumBytes) const {
7968 if (!MI.mayLoadOrStore())
7969 return false;
7970 assert(MI.hasOneMemOperand() &&
7971 "Expected load/store to have only one mem op!");
7972 return (*MI.memoperands_begin())->getSize() == NumBytes;
7973 }
7975 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7976 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7977 if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
7978 return false;
7980 // Only return true if we know the operation will zero-out the high half of
7981 // the 64-bit register. Truncates can be subregister copies, which don't
7982 // zero out the high bits. Copies and other copy-like instructions can be
7983 // fed by truncates, or could be lowered as subregister copies.
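// For example, a 32-bit G_ADD selects to an instruction writing a W register,
// and such writes zero bits [63:32]; a G_TRUNC of an s64 value, by contrast,
// may become a plain subregister copy whose high bits are whatever the source
// held.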
7984 switch (MI.getOpcode()) {
7985 default:
7986 return true;
7987 case TargetOpcode::COPY:
7988 case TargetOpcode::G_BITCAST:
7989 case TargetOpcode::G_TRUNC:
7990 case TargetOpcode::G_PHI:
7991 return false;
7992 }
7993 }
7996 // Perform fixups on the given PHI instruction's operands to force them all
7997 // to be the same as the destination regbank.
7998 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7999 const AArch64RegisterBankInfo &RBI) {
8000 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
8001 Register DstReg = MI.getOperand(0).getReg();
8002 const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
8003 assert(DstRB && "Expected PHI dst to have regbank assigned");
8004 MachineIRBuilder MIB(MI);
8006 // Go through each operand and ensure it has the same regbank.
8007 for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
8008 if (!MO.isReg())
8009 continue;
8010 Register OpReg = MO.getReg();
8011 const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
8012 if (RB != DstRB) {
8013 // Insert a cross-bank copy.
8014 auto *OpDef = MRI.getVRegDef(OpReg);
8015 const LLT &Ty = MRI.getType(OpReg);
8016 MachineBasicBlock &OpDefBB = *OpDef->getParent();
8018 // Any instruction we insert must appear after all PHIs in the block
8019 // for the block to be valid MIR.
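// (Valid MIR keeps all PHIs in one contiguous group at the top of the block,
// so if the defining instruction is itself a PHI we step past the whole group
// rather than inserting the copy between PHIs.)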
8020 MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
8021 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
8022 InsertPt = OpDefBB.getFirstNonPHI();
8023 MIB.setInsertPt(*OpDef->getParent(), InsertPt);
8024 auto Copy = MIB.buildCopy(Ty, OpReg);
8025 MRI.setRegBank(Copy.getReg(0), *DstRB);
8026 MO.setReg(Copy.getReg(0));
8027 }
8028 }
8029 }
8031 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
8032 // We're looking for PHIs; build a list so we don't invalidate iterators.
8033 MachineRegisterInfo &MRI = MF.getRegInfo();
8034 SmallVector<MachineInstr *, 32> Phis;
8035 for (auto &BB : MF) {
8036 for (auto &MI : BB) {
8037 if (MI.getOpcode() == TargetOpcode::G_PHI)
8038 Phis.emplace_back(&MI);
8039 }
8040 }
8042 for (auto *MI : Phis) {
8043 // We need to do some work here if the operand types are < 16 bit and they
8044 // are split across fpr/gpr banks. Since all types <32b on gpr
8045 // end up being assigned gpr32 regclasses, we can end up with PHIs here
8046 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
8047 // be selecting heterogeneous regbanks for operands if possible, but we
8048 // still need to be able to deal with it here.
8050 // To fix this, if we have a gpr-bank operand < 32b in size and at least
8051 // one other operand is on the fpr bank, then we add cross-bank copies
8052 // to homogenize the operand banks. For simplicity the bank that we choose
8053 // to settle on is whatever bank the def operand has. For example:
8055 // %endbb:
8056 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
8057 // =>
8058 // %bb2:
8059 // ...
8060 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
8061 // ...
8062 // %endbb:
8063 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
8064 bool HasGPROp = false, HasFPROp = false;
8065 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
8066 if (!MO.isReg())
8067 continue;
8068 const LLT &Ty = MRI.getType(MO.getReg());
8069 if (!Ty.isValid() || !Ty.isScalar())
8070 break;
8071 if (Ty.getSizeInBits() >= 32)
8072 break;
8073 const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
8074 // If for some reason we don't have a regbank yet, don't try anything.
8075 if (!RB)
8076 break;
8078 if (RB->getID() == AArch64::GPRRegBankID)
8079 HasGPROp = true;
8080 else
8081 HasFPROp = true;
8082 }
8083 // We have heterogeneous regbanks, so fix them up.
8084 if (HasGPROp && HasFPROp)
8085 fixupPHIOpBanks(*MI, MRI, RBI);
8086 }
8087 }
8089 namespace llvm {
8090 InstructionSelector *
8091 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
8092 const AArch64Subtarget &Subtarget,
8093 const AArch64RegisterBankInfo &RBI) {
8094 return new AArch64InstructionSelector(TM, Subtarget, RBI);
8095 }
8096 } // namespace llvm