1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the ARMSelectionDAGInfo class.
11 //===----------------------------------------------------------------------===//
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/CommandLine.h"
#include <cstring>

using namespace llvm;
18 #define DEBUG_TYPE "arm-selectiondag-info"
20 cl::opt
<TPLoop::MemTransfer
> EnableMemtransferTPLoop(
21 "arm-memtransfer-tploop", cl::Hidden
,
22 cl::desc("Control conversion of memcpy to "
23 "Tail predicated loops (WLSTP)"),
24 cl::init(TPLoop::ForceDisabled
),
25 cl::values(clEnumValN(TPLoop::ForceDisabled
, "force-disabled",
26 "Don't convert memcpy to TP loop."),
27 clEnumValN(TPLoop::ForceEnabled
, "force-enabled",
28 "Always convert memcpy to TP loop."),
29 clEnumValN(TPLoop::Allow
, "allow",
30 "Allow (may be subject to certain conditions) "
31 "conversion of memcpy to TP loop.")));
33 bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode
) const {
34 return Opcode
>= ARMISD::FIRST_MEMORY_OPCODE
&&
35 Opcode
<= ARMISD::LAST_MEMORY_OPCODE
;
// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
41 SDValue
ARMSelectionDAGInfo::EmitSpecializedLibcall(
42 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Src
,
43 SDValue Size
, unsigned Align
, RTLIB::Libcall LC
) const {
44 const ARMSubtarget
&Subtarget
=
45 DAG
.getMachineFunction().getSubtarget
<ARMSubtarget
>();
46 const ARMTargetLowering
*TLI
= Subtarget
.getTargetLowering();
48 // Only use a specialized AEABI function if the default version of this
49 // Libcall is an AEABI function.
50 if (std::strncmp(TLI
->getLibcallName(LC
), "__aeabi", 7) != 0)
53 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
54 // able to translate memset to memclr and use the value to index the function
64 AEABILibcall
= AEABI_MEMCPY
;
67 AEABILibcall
= AEABI_MEMMOVE
;
70 AEABILibcall
= AEABI_MEMSET
;
71 if (isNullConstant(Src
))
72 AEABILibcall
= AEABI_MEMCLR
;
78 // Choose the most-aligned libcall variant that we can
85 AlignVariant
= ALIGN8
;
86 else if ((Align
& 3) == 0)
87 AlignVariant
= ALIGN4
;
89 AlignVariant
= ALIGN1
;
91 TargetLowering::ArgListTy Args
;
92 TargetLowering::ArgListEntry Entry
;
93 Entry
.Ty
= DAG
.getDataLayout().getIntPtrType(*DAG
.getContext());
95 Args
.push_back(Entry
);
96 if (AEABILibcall
== AEABI_MEMCLR
) {
98 Args
.push_back(Entry
);
99 } else if (AEABILibcall
== AEABI_MEMSET
) {
100 // Adjust parameters for memset, EABI uses format (ptr, size, value),
101 // GNU library uses (ptr, value, size)
102 // See RTABI section 4.3.4
104 Args
.push_back(Entry
);
106 // Extend or truncate the argument to be an i32 value for the call.
107 if (Src
.getValueType().bitsGT(MVT::i32
))
108 Src
= DAG
.getNode(ISD::TRUNCATE
, dl
, MVT::i32
, Src
);
109 else if (Src
.getValueType().bitsLT(MVT::i32
))
110 Src
= DAG
.getNode(ISD::ZERO_EXTEND
, dl
, MVT::i32
, Src
);
113 Entry
.Ty
= Type::getInt32Ty(*DAG
.getContext());
114 Entry
.IsSExt
= false;
115 Args
.push_back(Entry
);
118 Args
.push_back(Entry
);
121 Args
.push_back(Entry
);
124 char const *FunctionNames
[4][3] = {
125 { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
126 { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
127 { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
128 { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
130 TargetLowering::CallLoweringInfo
CLI(DAG
);
134 TLI
->getLibcallCallingConv(LC
), Type::getVoidTy(*DAG
.getContext()),
135 DAG
.getExternalSymbol(FunctionNames
[AEABILibcall
][AlignVariant
],
136 TLI
->getPointerTy(DAG
.getDataLayout())),
139 std::pair
<SDValue
,SDValue
> CallResult
= TLI
->LowerCallTo(CLI
);
141 return CallResult
.second
;
144 static bool shouldGenerateInlineTPLoop(const ARMSubtarget
&Subtarget
,
145 const SelectionDAG
&DAG
,
146 ConstantSDNode
*ConstantSize
,
147 Align Alignment
, bool IsMemcpy
) {
148 auto &F
= DAG
.getMachineFunction().getFunction();
149 if (!EnableMemtransferTPLoop
)
151 if (EnableMemtransferTPLoop
== TPLoop::ForceEnabled
)
153 // Do not generate inline TP loop if optimizations is disabled,
154 // or if optimization for size (-Os or -Oz) is on.
155 if (F
.hasOptNone() || F
.hasOptSize())
157 // If cli option is unset, for memset always generate inline TP.
158 // For memcpy, check some conditions
161 if (!ConstantSize
&& Alignment
>= Align(4))
164 ConstantSize
->getZExtValue() > Subtarget
.getMaxInlineSizeThreshold() &&
165 ConstantSize
->getZExtValue() <
166 Subtarget
.getMaxMemcpyTPInlineSizeThreshold())
171 SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
172 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Src
,
173 SDValue Size
, Align Alignment
, bool isVolatile
, bool AlwaysInline
,
174 MachinePointerInfo DstPtrInfo
, MachinePointerInfo SrcPtrInfo
) const {
175 const ARMSubtarget
&Subtarget
=
176 DAG
.getMachineFunction().getSubtarget
<ARMSubtarget
>();
177 ConstantSDNode
*ConstantSize
= dyn_cast
<ConstantSDNode
>(Size
);
179 if (Subtarget
.hasMVEIntegerOps() &&
180 shouldGenerateInlineTPLoop(Subtarget
, DAG
, ConstantSize
, Alignment
, true))
181 return DAG
.getNode(ARMISD::MEMCPYLOOP
, dl
, MVT::Other
, Chain
, Dst
, Src
,
182 DAG
.getZExtOrTrunc(Size
, dl
, MVT::i32
));
184 // Do repeated 4-byte loads and stores. To be improved.
185 // This requires 4-byte alignment.
186 if (Alignment
< Align(4))
188 // This requires the copy size to be a constant, preferably
189 // within a subtarget-specific limit.
191 return EmitSpecializedLibcall(DAG
, dl
, Chain
, Dst
, Src
, Size
,
192 Alignment
.value(), RTLIB::MEMCPY
);
193 uint64_t SizeVal
= ConstantSize
->getZExtValue();
194 if (!AlwaysInline
&& SizeVal
> Subtarget
.getMaxInlineSizeThreshold())
195 return EmitSpecializedLibcall(DAG
, dl
, Chain
, Dst
, Src
, Size
,
196 Alignment
.value(), RTLIB::MEMCPY
);
198 unsigned BytesLeft
= SizeVal
& 3;
199 unsigned NumMemOps
= SizeVal
>> 2;
200 unsigned EmittedNumMemOps
= 0;
204 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
205 const unsigned MaxLoadsInLDM
= Subtarget
.isThumb1Only() ? 4 : 6;
208 uint64_t SrcOff
= 0, DstOff
= 0;
210 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
211 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
212 // pressure on the general purpose registers. However this seems harder to map
213 // onto the register allocator's view of the world.
215 // The number of MEMCPY pseudo-instructions to emit. We use up to
216 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
217 // later on. This is a lower bound on the number of MEMCPY operations we must
219 unsigned NumMEMCPYs
= (NumMemOps
+ MaxLoadsInLDM
- 1) / MaxLoadsInLDM
;
221 // Code size optimisation: do not inline memcpy if expansion results in
222 // more instructions than the libary call.
223 if (NumMEMCPYs
> 1 && Subtarget
.hasMinSize()) {
227 SDVTList VTs
= DAG
.getVTList(MVT::i32
, MVT::i32
, MVT::Other
, MVT::Glue
);
229 for (unsigned I
= 0; I
!= NumMEMCPYs
; ++I
) {
230 // Evenly distribute registers among MEMCPY operations to reduce register
232 unsigned NextEmittedNumMemOps
= NumMemOps
* (I
+ 1) / NumMEMCPYs
;
233 unsigned NumRegs
= NextEmittedNumMemOps
- EmittedNumMemOps
;
235 Dst
= DAG
.getNode(ARMISD::MEMCPY
, dl
, VTs
, Chain
, Dst
, Src
,
236 DAG
.getConstant(NumRegs
, dl
, MVT::i32
));
237 Src
= Dst
.getValue(1);
238 Chain
= Dst
.getValue(2);
240 DstPtrInfo
= DstPtrInfo
.getWithOffset(NumRegs
* VTSize
);
241 SrcPtrInfo
= SrcPtrInfo
.getWithOffset(NumRegs
* VTSize
);
243 EmittedNumMemOps
= NextEmittedNumMemOps
;
249 // Issue loads / stores for the trailing (1 - 3) bytes.
250 auto getRemainingValueType
= [](unsigned BytesLeft
) {
251 return (BytesLeft
>= 2) ? MVT::i16
: MVT::i8
;
253 auto getRemainingSize
= [](unsigned BytesLeft
) {
254 return (BytesLeft
>= 2) ? 2 : 1;
257 unsigned BytesLeftSave
= BytesLeft
;
260 VT
= getRemainingValueType(BytesLeft
);
261 VTSize
= getRemainingSize(BytesLeft
);
262 Loads
[i
] = DAG
.getLoad(VT
, dl
, Chain
,
263 DAG
.getNode(ISD::ADD
, dl
, MVT::i32
, Src
,
264 DAG
.getConstant(SrcOff
, dl
, MVT::i32
)),
265 SrcPtrInfo
.getWithOffset(SrcOff
));
266 TFOps
[i
] = Loads
[i
].getValue(1);
271 Chain
= DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
, ArrayRef(TFOps
, i
));
274 BytesLeft
= BytesLeftSave
;
276 VT
= getRemainingValueType(BytesLeft
);
277 VTSize
= getRemainingSize(BytesLeft
);
278 TFOps
[i
] = DAG
.getStore(Chain
, dl
, Loads
[i
],
279 DAG
.getNode(ISD::ADD
, dl
, MVT::i32
, Dst
,
280 DAG
.getConstant(DstOff
, dl
, MVT::i32
)),
281 DstPtrInfo
.getWithOffset(DstOff
));
286 return DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
, ArrayRef(TFOps
, i
));
289 SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
290 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Src
,
291 SDValue Size
, Align Alignment
, bool isVolatile
,
292 MachinePointerInfo DstPtrInfo
, MachinePointerInfo SrcPtrInfo
) const {
293 return EmitSpecializedLibcall(DAG
, dl
, Chain
, Dst
, Src
, Size
,
294 Alignment
.value(), RTLIB::MEMMOVE
);
297 SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemset(
298 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Src
,
299 SDValue Size
, Align Alignment
, bool isVolatile
, bool AlwaysInline
,
300 MachinePointerInfo DstPtrInfo
) const {
302 const ARMSubtarget
&Subtarget
=
303 DAG
.getMachineFunction().getSubtarget
<ARMSubtarget
>();
305 ConstantSDNode
*ConstantSize
= dyn_cast
<ConstantSDNode
>(Size
);
307 // Generate TP loop for llvm.memset
308 if (Subtarget
.hasMVEIntegerOps() &&
309 shouldGenerateInlineTPLoop(Subtarget
, DAG
, ConstantSize
, Alignment
,
311 Src
= DAG
.getSplatBuildVector(MVT::v16i8
, dl
,
312 DAG
.getNode(ISD::TRUNCATE
, dl
, MVT::i8
, Src
));
313 return DAG
.getNode(ARMISD::MEMSETLOOP
, dl
, MVT::Other
, Chain
, Dst
, Src
,
314 DAG
.getZExtOrTrunc(Size
, dl
, MVT::i32
));
318 return EmitSpecializedLibcall(DAG
, dl
, Chain
, Dst
, Src
, Size
,
319 Alignment
.value(), RTLIB::MEMSET
);