llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp

   1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the ARMSelectionDAGInfo class.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "ARMTargetMachine.h"
  14 #include "ARMTargetTransformInfo.h"
  15 #include "llvm/CodeGen/SelectionDAG.h"
  16 #include "llvm/IR/DerivedTypes.h"
  17 #include "llvm/Support/CommandLine.h"
  18 using namespace llvm;
  19
  20 #define DEBUG_TYPE "arm-selectiondag-info"
  21
  22 cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
  23     "arm-memtransfer-tploop", cl::Hidden,
  24     cl::desc("Control conversion of memcpy to "
  25              "Tail predicated loops (WLSTP)"),
  26     cl::init(TPLoop::ForceDisabled),
  27     cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
  28                           "Don't convert memcpy to TP loop."),
  29                clEnumValN(TPLoop::ForceEnabled, "force-enabled",
  30                           "Always convert memcpy to TP loop."),
  31                clEnumValN(TPLoop::Allow, "allow",
  32                           "Allow (may be subject to certain conditions) "
  33                           "conversion of memcpy to TP loop.")));
  34
  35 // Emit, if possible, a specialized version of the given Libcall. Typically this
  36 // means selecting the appropriately aligned version, but we also convert memset
  37 // of 0 into memclr.
  38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
  39     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
  40     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  41   const ARMSubtarget &Subtarget =
  42       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  43   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
  44
  45   // Only use a specialized AEABI function if the default version of this
  46   // Libcall is an AEABI function.
  47   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
  48     return SDValue();
  49
  50   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
  51   // able to translate memset to memclr and use the value to index the function
  52   // name array.
  53   enum {
  54     AEABI_MEMCPY = 0,
  55     AEABI_MEMMOVE,
  56     AEABI_MEMSET,
  57     AEABI_MEMCLR
  58   } AEABILibcall;
  59   switch (LC) {
  60   case RTLIB::MEMCPY:
  61     AEABILibcall = AEABI_MEMCPY;
  62     break;
  63   case RTLIB::MEMMOVE:
  64     AEABILibcall = AEABI_MEMMOVE;
  65     break;
  66   case RTLIB::MEMSET:
  67     AEABILibcall = AEABI_MEMSET;
  68     if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
  69       if (ConstantSrc->getZExtValue() == 0)
  70         AEABILibcall = AEABI_MEMCLR;
  71     break;
  72   default:
  73     return SDValue();
  74   }
  75
  76   // Choose the most-aligned libcall variant that we can
  77   enum {
  78     ALIGN1 = 0,
  79     ALIGN4,
  80     ALIGN8
  81   } AlignVariant;
  82   if ((Align & 7) == 0)
  83     AlignVariant = ALIGN8;
  84   else if ((Align & 3) == 0)
  85     AlignVariant = ALIGN4;
  86   else
  87     AlignVariant = ALIGN1;
  88
  89   TargetLowering::ArgListTy Args;
  90   TargetLowering::ArgListEntry Entry;
  91   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  92   Entry.Node = Dst;
  93   Args.push_back(Entry);
  94   if (AEABILibcall == AEABI_MEMCLR) {
  95     Entry.Node = Size;
  96     Args.push_back(Entry);
  97   } else if (AEABILibcall == AEABI_MEMSET) {
  98     // Adjust parameters for memset, EABI uses format (ptr, size, value),
  99     // GNU library uses (ptr, value, size)
 100     // See RTABI section 4.3.4
 101     Entry.Node = Size;
 102     Args.push_back(Entry);
 103
 104     // Extend or truncate the argument to be an i32 value for the call.
 105     if (Src.getValueType().bitsGT(MVT::i32))
 106       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
 107     else if (Src.getValueType().bitsLT(MVT::i32))
 108       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
 109
 110     Entry.Node = Src;
 111     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
 112     Entry.IsSExt = false;
 113     Args.push_back(Entry);
 114   } else {
 115     Entry.Node = Src;
 116     Args.push_back(Entry);
 117
 118     Entry.Node = Size;
 119     Args.push_back(Entry);
 120   }
 121
 122   char const *FunctionNames[4][3] = {
 123     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
 124     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
 125     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
 126     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
 127   };
 128   TargetLowering::CallLoweringInfo CLI(DAG);
 129   CLI.setDebugLoc(dl)
 130       .setChain(Chain)
 131       .setLibCallee(
 132           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
 133           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
 134                                 TLI->getPointerTy(DAG.getDataLayout())),
 135           std::move(Args))
 136       .setDiscardResult();
 137   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
 138
 139   return CallResult.second;
 140 }
 141
 142 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
 143                                        const SelectionDAG &DAG,
 144                                        ConstantSDNode *ConstantSize,
 145                                        Align Alignment, bool IsMemcpy) {
 146   auto &F = DAG.getMachineFunction().getFunction();
 147   if (!EnableMemtransferTPLoop)
 148     return false;
 149   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
 150     return true;
 151   // Do not generate inline TP loop if optimizations is disabled,
 152   // or if optimization for size (-Os or -Oz) is on.
 153   if (F.hasOptNone() || F.hasOptSize())
 154     return false;
 155   // If cli option is unset, for memset always generate inline TP.
 156   // For memcpy, check some conditions
 157   if (!IsMemcpy)
 158     return true;
 159   if (!ConstantSize && Alignment >= Align(4))
 160     return true;
 161   if (ConstantSize &&
 162       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
 163       ConstantSize->getZExtValue() <
 164           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
 165     return true;
 166   return false;
 167 }
 168
 169 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
 170     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
 171     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
 172     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
 173   const ARMSubtarget &Subtarget =
 174       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
 175   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
 176
 177   if (Subtarget.hasMVEIntegerOps() &&
 178       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
 179     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
 180                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
 181
 182   // Do repeated 4-byte loads and stores. To be improved.
 183   // This requires 4-byte alignment.
 184   if (Alignment < Align(4))
 185     return SDValue();
 186   // This requires the copy size to be a constant, preferably
 187   // within a subtarget-specific limit.
 188   if (!ConstantSize)
 189     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
 190                                   Alignment.value(), RTLIB::MEMCPY);
 191   uint64_t SizeVal = ConstantSize->getZExtValue();
 192   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
 193     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
 194                                   Alignment.value(), RTLIB::MEMCPY);
 195
 196   unsigned BytesLeft = SizeVal & 3;
 197   unsigned NumMemOps = SizeVal >> 2;
 198   unsigned EmittedNumMemOps = 0;
 199   EVT VT = MVT::i32;
 200   unsigned VTSize = 4;
 201   unsigned i = 0;
 202   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
 203   const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
 204   SDValue TFOps[6];
 205   SDValue Loads[6];
 206   uint64_t SrcOff = 0, DstOff = 0;
 207
 208   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
 209   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
 210   // pressure on the general purpose registers. However this seems harder to map
 211   // onto the register allocator's view of the world.
 212
 213   // The number of MEMCPY pseudo-instructions to emit. We use up to
 214   // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
 215   // later on. This is a lower bound on the number of MEMCPY operations we must
 216   // emit.
 217   unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
 218
 219   // Code size optimisation: do not inline memcpy if expansion results in
 220   // more instructions than the libary call.
 221   if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
 222     return SDValue();
 223   }
 224
 225   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
 226
 227   for (unsigned I = 0; I != NumMEMCPYs; ++I) {
 228     // Evenly distribute registers among MEMCPY operations to reduce register
 229     // pressure.
 230     unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
 231     unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
 232
 233     Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
 234                       DAG.getConstant(NumRegs, dl, MVT::i32));
 235     Src = Dst.getValue(1);
 236     Chain = Dst.getValue(2);
 237
 238     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
 239     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
 240
 241     EmittedNumMemOps = NextEmittedNumMemOps;
 242   }
 243
 244   if (BytesLeft == 0)
 245     return Chain;
 246
 247   // Issue loads / stores for the trailing (1 - 3) bytes.
 248   auto getRemainingValueType = [](unsigned BytesLeft) {
 249     return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
 250   };
 251   auto getRemainingSize = [](unsigned BytesLeft) {
 252     return (BytesLeft >= 2) ? 2 : 1;
 253   };
 254
 255   unsigned BytesLeftSave = BytesLeft;
 256   i = 0;
 257   while (BytesLeft) {
 258     VT = getRemainingValueType(BytesLeft);
 259     VTSize = getRemainingSize(BytesLeft);
 260     Loads[i] = DAG.getLoad(VT, dl, Chain,
 261                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
 262                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
 263                            SrcPtrInfo.getWithOffset(SrcOff));
 264     TFOps[i] = Loads[i].getValue(1);
 265     ++i;
 266     SrcOff += VTSize;
 267     BytesLeft -= VTSize;
 268   }
 269   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
 270                       makeArrayRef(TFOps, i));
 271
 272   i = 0;
 273   BytesLeft = BytesLeftSave;
 274   while (BytesLeft) {
 275     VT = getRemainingValueType(BytesLeft);
 276     VTSize = getRemainingSize(BytesLeft);
 277     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
 278                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
 279                                         DAG.getConstant(DstOff, dl, MVT::i32)),
 280                             DstPtrInfo.getWithOffset(DstOff));
 281     ++i;
 282     DstOff += VTSize;
 283     BytesLeft -= VTSize;
 284   }
 285   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
 286                      makeArrayRef(TFOps, i));
 287 }
 288
 289 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
 290     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
 291     SDValue Size, Align Alignment, bool isVolatile,
 292     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
 293   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
 294                                 Alignment.value(), RTLIB::MEMMOVE);
 295 }
 296
 297 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
 298     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
 299     SDValue Size, Align Alignment, bool isVolatile,
 300     MachinePointerInfo DstPtrInfo) const {
 301
 302   const ARMSubtarget &Subtarget =
 303       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
 304
 305   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
 306
 307   // Generate TP loop for llvm.memset
 308   if (Subtarget.hasMVEIntegerOps() &&
 309       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
 310                                  false)) {
 311     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
 312                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
 313     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
 314                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
 315   }
 316
 317   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
 318                                 Alignment.value(), RTLIB::MEMSET);
 319 }