//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "arm-selectiondag-info"
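
// This is a regular command-line option; for example, an invocation such as
// "llc -mattr=+mve -arm-memtransfer-tploop=allow" (illustrative) lets the
// lowerings below emit MVE tail-predicated loops when deemed profitable.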
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));

bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
  return Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
         Opcode <= ARMISD::LAST_MEMORY_OPCODE;
}

// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();

  // Only use a specialized AEABI function if the default version of this
  // Libcall is an AEABI function.
  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
    return SDValue();

  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
  // able to translate memset to memclr and use the value to index the function
  // name array.
  enum {
    AEABI_MEMCPY = 0,
    AEABI_MEMMOVE,
    AEABI_MEMSET,
    AEABI_MEMCLR
  } AEABILibcall;
  switch (LC) {
  case RTLIB::MEMCPY:
    AEABILibcall = AEABI_MEMCPY;
    break;
  case RTLIB::MEMMOVE:
    AEABILibcall = AEABI_MEMMOVE;
    break;
  case RTLIB::MEMSET:
    AEABILibcall = AEABI_MEMSET;
    if (isNullConstant(Src))
      AEABILibcall = AEABI_MEMCLR;
    break;
  default:
    return SDValue();
  }

  // Choose the most-aligned libcall variant that we can.
  enum {
    ALIGN1 = 0,
    ALIGN4,
    ALIGN8
  } AlignVariant;
  if ((Align & 7) == 0)
    AlignVariant = ALIGN8;
  else if ((Align & 3) == 0)
    AlignVariant = ALIGN4;
  else
    AlignVariant = ALIGN1;
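  // For example, Align == 12 fails the 8-byte check ((12 & 7) == 4) but passes
  // the 4-byte one, so the "4" variant (e.g. __aeabi_memcpy4) is selected.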

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  Entry.Node = Dst;
  Args.push_back(Entry);
  if (AEABILibcall == AEABI_MEMCLR) {
    Entry.Node = Size;
    Args.push_back(Entry);
  } else if (AEABILibcall == AEABI_MEMSET) {
    // Adjust the argument order for memset: EABI uses (ptr, size, value),
    // whereas the GNU library uses (ptr, value, size).
    // See RTABI section 4.3.4.
    Entry.Node = Size;
    Args.push_back(Entry);

    // Extend or truncate the argument to be an i32 value for the call.
    if (Src.getValueType().bitsGT(MVT::i32))
      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    else if (Src.getValueType().bitsLT(MVT::i32))
      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

    Entry.Node = Src;
    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    Entry.IsSExt = false;
    Args.push_back(Entry);
  } else {
    Entry.Node = Src;
    Args.push_back(Entry);

    Entry.Node = Size;
    Args.push_back(Entry);
  }
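
  // Indexed as FunctionNames[AEABILibcall][AlignVariant] below.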
  char const *FunctionNames[4][3] = {
    { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
    { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
  };
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(
          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
                                TLI->getPointerTy(DAG.getDataLayout())),
          std::move(Args))
      .setDiscardResult();
  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);

  return CallResult.second;
}

static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                       const SelectionDAG &DAG,
                                       ConstantSDNode *ConstantSize,
                                       Align Alignment, bool IsMemcpy) {
  auto &F = DAG.getMachineFunction().getFunction();
  if (!EnableMemtransferTPLoop)
    return false;
  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    return true;
  // Do not generate an inline TP loop if optimization is disabled, or if
  // optimizing for size (-Os or -Oz).
  if (F.hasOptNone() || F.hasOptSize())
    return false;
  // Reaching here means the option is TPLoop::Allow. For memset, always
  // generate an inline TP loop; for memcpy, check the conditions below.
  if (!IsMemcpy)
    return true;
  if (!ConstantSize && Alignment >= Align(4))
    return true;
  if (ConstantSize &&
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
      ConstantSize->getZExtValue() <
          Subtarget.getMaxMemcpyTPInlineSizeThreshold())
    return true;
  return false;
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
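
  // With MVE available and the heuristics above satisfied, emit a single
  // ARMISD::MEMCPYLOOP node, which is later expanded into a tail-predicated
  // (WLSTP) loop instead of the unrolled load/store sequence below.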
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
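  // e.g. SizeVal == 11 gives NumMemOps == 2 word copies plus BytesLeft == 3
  // trailing bytes.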
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
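  // (Ceiling division: e.g. NumMemOps == 7 with MaxLoadsInLDM == 6 gives 2.)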

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
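    // e.g. NumMemOps == 7 over NumMEMCPYs == 2 yields NumRegs == 3, then 4.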

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };
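  // e.g. BytesLeft == 3 is handled as one i16 access followed by one i8 access.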

  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMMOVE);
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {

  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Generate a TP loop for llvm.memset.
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 false)) {
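    // Splat the i8 set value across a v16i8 vector; the TP loop then stores a
    // full 128-bit vector per iteration, with tail predication covering the
    // final partial vector.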
    Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
                                  DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
  }

  if (!AlwaysInline)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMSET);

  return SDValue();
}