//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"

cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));

// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  const ARMTargetLowering *TLI = Subtarget.getTargetLowering();

  // Only use a specialized AEABI function if the default version of this
  // Libcall is an AEABI function.
  if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
    return SDValue();

  // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
  // able to translate memset to memclr and use the value to index the function
  // name array.
  enum {
    AEABI_MEMCPY = 0,
    AEABI_MEMMOVE,
    AEABI_MEMSET,
    AEABI_MEMCLR
  } AEABILibcall;
  switch (LC) {
  case RTLIB::MEMCPY:
    AEABILibcall = AEABI_MEMCPY;
    break;
  case RTLIB::MEMMOVE:
    AEABILibcall = AEABI_MEMMOVE;
    break;
  case RTLIB::MEMSET:
    AEABILibcall = AEABI_MEMSET;
    if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
      if (ConstantSrc->getZExtValue() == 0)
        AEABILibcall = AEABI_MEMCLR;
    break;
  default:
    return SDValue();
  }

  // Choose the most-aligned libcall variant that we can
  enum {
    ALIGN1 = 0,
    ALIGN4,
    ALIGN8
  } AlignVariant;
  if ((Align & 7) == 0)
    AlignVariant = ALIGN8;
  else if ((Align & 3) == 0)
    AlignVariant = ALIGN4;
  else
    AlignVariant = ALIGN1;

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
  Entry.Node = Dst;
  Args.push_back(Entry);
  if (AEABILibcall == AEABI_MEMCLR) {
    Entry.Node = Size;
    Args.push_back(Entry);
  } else if (AEABILibcall == AEABI_MEMSET) {
    // Adjust parameters for memset, EABI uses format (ptr, size, value),
    // GNU library uses (ptr, value, size)
    // See RTABI section 4.3.4
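    // For illustration (prototypes are paraphrased for clarity, not
    // declarations from this file):
    //   GNU C library: void *memset(void *dest, int c, size_t n);
    //   AEABI RTABI:   void __aeabi_memset(void *dest, size_t n, int c);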
    Entry.Node = Size;
    Args.push_back(Entry);

    // Extend or truncate the argument to be an i32 value for the call.
    if (Src.getValueType().bitsGT(MVT::i32))
      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    else if (Src.getValueType().bitsLT(MVT::i32))
      Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);

    Entry.Node = Src;
    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    Entry.IsSExt = false;
    Args.push_back(Entry);
  } else {
    Entry.Node = Src;
    Args.push_back(Entry);

    Entry.Node = Size;
    Args.push_back(Entry);
  }

  char const *FunctionNames[4][3] = {
    { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
    { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
    { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
  };
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(
          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
                                TLI->getPointerTy(DAG.getDataLayout())),
          std::move(Args))
      .setDiscardResult();
  std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);

  return CallResult.second;
}

static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
                                       const SelectionDAG &DAG,
                                       ConstantSDNode *ConstantSize,
                                       Align Alignment, bool IsMemcpy) {
  auto &F = DAG.getMachineFunction().getFunction();
  if (!EnableMemtransferTPLoop)
    return false;
  if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    return true;
  // Do not generate an inline TP loop if optimization is disabled,
  // or if optimization for size (-Os or -Oz) is on.
  if (F.hasOptNone() || F.hasOptSize())
    return false;
  // With the option set to "allow": always generate an inline TP loop for
  // memset; for memcpy, check some further conditions.
  if (!IsMemcpy)
    return true;
  if (!ConstantSize && Alignment >= Align(4))
    return true;
  if (ConstantSize &&
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
      ConstantSize->getZExtValue() <
          Subtarget.getMaxMemcpyTPInlineSizeThreshold())
    return true;
  return false;
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));

  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if (Alignment < Align(4))
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                  Alignment.value(), RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
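  // For example, a 30-byte copy on a non-Thumb1 target gives NumMemOps = 7 and
  // MaxLoadsInLDM = 6, so NumMEMCPYs = (7 + 6 - 1) / 6 = 2.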

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    return SDValue();
  }

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
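    // Continuing the example above (NumMemOps = 7, NumMEMCPYs = 2), the two
    // iterations use NumRegs = 3 and NumRegs = 4 rather than 6 and 1.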

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };
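  // E.g. with 3 trailing bytes this emits an i16 access followed by an i8
  // access; with 1 trailing byte, a single i8 access.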

  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     makeArrayRef(TFOps, i));
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMMOVE);
}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {

  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // Generate TP loop for llvm.memset
  if (Subtarget.hasMVEIntegerOps() &&
      shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
                                 false)) {
    // Splat the i8 fill value across a v16i8 vector for the MEMSETLOOP node.
    Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
                                  DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
    return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
  }

  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                Alignment.value(), RTLIB::MEMSET);
}