[ARM] Better patterns for fp <> predicate vectors
[llvm-complete.git] / lib / Target / ARM / ARMSelectionDAGInfo.cpp
blobcade06e8c109bd737db729342984935fe9d5ce09
1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARMSelectionDAGInfo class.
11 //===----------------------------------------------------------------------===//
13 #include "ARMTargetMachine.h"
14 #include "llvm/CodeGen/SelectionDAG.h"
15 #include "llvm/IR/DerivedTypes.h"
16 using namespace llvm;
18 #define DEBUG_TYPE "arm-selectiondag-info"
20 // Emit, if possible, a specialized version of the given Libcall. Typically this
21 // means selecting the appropriately aligned version, but we also convert memset
22 // of 0 into memclr.
23 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
24 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
25 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
26 const ARMSubtarget &Subtarget =
27 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
28 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
30 // Only use a specialized AEABI function if the default version of this
31 // Libcall is an AEABI function.
32 if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
33 return SDValue();
35 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
36 // able to translate memset to memclr and use the value to index the function
37 // name array.
38 enum {
39 AEABI_MEMCPY = 0,
40 AEABI_MEMMOVE,
41 AEABI_MEMSET,
42 AEABI_MEMCLR
43 } AEABILibcall;
44 switch (LC) {
45 case RTLIB::MEMCPY:
46 AEABILibcall = AEABI_MEMCPY;
47 break;
48 case RTLIB::MEMMOVE:
49 AEABILibcall = AEABI_MEMMOVE;
50 break;
51 case RTLIB::MEMSET:
52 AEABILibcall = AEABI_MEMSET;
53 if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
54 if (ConstantSrc->getZExtValue() == 0)
55 AEABILibcall = AEABI_MEMCLR;
56 break;
57 default:
58 return SDValue();
61 // Choose the most-aligned libcall variant that we can
62 enum {
63 ALIGN1 = 0,
64 ALIGN4,
65 ALIGN8
66 } AlignVariant;
67 if ((Align & 7) == 0)
68 AlignVariant = ALIGN8;
69 else if ((Align & 3) == 0)
70 AlignVariant = ALIGN4;
71 else
72 AlignVariant = ALIGN1;
74 TargetLowering::ArgListTy Args;
75 TargetLowering::ArgListEntry Entry;
76 Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
77 Entry.Node = Dst;
78 Args.push_back(Entry);
79 if (AEABILibcall == AEABI_MEMCLR) {
80 Entry.Node = Size;
81 Args.push_back(Entry);
82 } else if (AEABILibcall == AEABI_MEMSET) {
83 // Adjust parameters for memset, EABI uses format (ptr, size, value),
84 // GNU library uses (ptr, value, size)
85 // See RTABI section 4.3.4
86 Entry.Node = Size;
87 Args.push_back(Entry);
89 // Extend or truncate the argument to be an i32 value for the call.
90 if (Src.getValueType().bitsGT(MVT::i32))
91 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
92 else if (Src.getValueType().bitsLT(MVT::i32))
93 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
95 Entry.Node = Src;
96 Entry.Ty = Type::getInt32Ty(*DAG.getContext());
97 Entry.IsSExt = false;
98 Args.push_back(Entry);
99 } else {
100 Entry.Node = Src;
101 Args.push_back(Entry);
103 Entry.Node = Size;
104 Args.push_back(Entry);
107 char const *FunctionNames[4][3] = {
108 { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
109 { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
110 { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
111 { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
113 TargetLowering::CallLoweringInfo CLI(DAG);
114 CLI.setDebugLoc(dl)
115 .setChain(Chain)
116 .setLibCallee(
117 TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
118 DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
119 TLI->getPointerTy(DAG.getDataLayout())),
120 std::move(Args))
121 .setDiscardResult();
122 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
124 return CallResult.second;
127 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
128 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
129 SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
130 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
131 const ARMSubtarget &Subtarget =
132 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
133 // Do repeated 4-byte loads and stores. To be improved.
134 // This requires 4-byte alignment.
135 if ((Align & 3) != 0)
136 return SDValue();
137 // This requires the copy size to be a constant, preferably
138 // within a subtarget-specific limit.
139 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
140 if (!ConstantSize)
141 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
142 RTLIB::MEMCPY);
143 uint64_t SizeVal = ConstantSize->getZExtValue();
144 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
145 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
146 RTLIB::MEMCPY);
148 unsigned BytesLeft = SizeVal & 3;
149 unsigned NumMemOps = SizeVal >> 2;
150 unsigned EmittedNumMemOps = 0;
151 EVT VT = MVT::i32;
152 unsigned VTSize = 4;
153 unsigned i = 0;
154 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
155 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
156 SDValue TFOps[6];
157 SDValue Loads[6];
158 uint64_t SrcOff = 0, DstOff = 0;
160 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
161 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
162 // pressure on the general purpose registers. However this seems harder to map
163 // onto the register allocator's view of the world.
165 // The number of MEMCPY pseudo-instructions to emit. We use up to
166 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
167 // later on. This is a lower bound on the number of MEMCPY operations we must
168 // emit.
169 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
171 // Code size optimisation: do not inline memcpy if expansion results in
172 // more instructions than the libary call.
173 if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
174 return SDValue();
177 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
179 for (unsigned I = 0; I != NumMEMCPYs; ++I) {
180 // Evenly distribute registers among MEMCPY operations to reduce register
181 // pressure.
182 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
183 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
185 Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
186 DAG.getConstant(NumRegs, dl, MVT::i32));
187 Src = Dst.getValue(1);
188 Chain = Dst.getValue(2);
190 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
191 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
193 EmittedNumMemOps = NextEmittedNumMemOps;
196 if (BytesLeft == 0)
197 return Chain;
199 // Issue loads / stores for the trailing (1 - 3) bytes.
200 auto getRemainingValueType = [](unsigned BytesLeft) {
201 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
203 auto getRemainingSize = [](unsigned BytesLeft) {
204 return (BytesLeft >= 2) ? 2 : 1;
207 unsigned BytesLeftSave = BytesLeft;
208 i = 0;
209 while (BytesLeft) {
210 VT = getRemainingValueType(BytesLeft);
211 VTSize = getRemainingSize(BytesLeft);
212 Loads[i] = DAG.getLoad(VT, dl, Chain,
213 DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
214 DAG.getConstant(SrcOff, dl, MVT::i32)),
215 SrcPtrInfo.getWithOffset(SrcOff));
216 TFOps[i] = Loads[i].getValue(1);
217 ++i;
218 SrcOff += VTSize;
219 BytesLeft -= VTSize;
221 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
222 makeArrayRef(TFOps, i));
224 i = 0;
225 BytesLeft = BytesLeftSave;
226 while (BytesLeft) {
227 VT = getRemainingValueType(BytesLeft);
228 VTSize = getRemainingSize(BytesLeft);
229 TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
230 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
231 DAG.getConstant(DstOff, dl, MVT::i32)),
232 DstPtrInfo.getWithOffset(DstOff));
233 ++i;
234 DstOff += VTSize;
235 BytesLeft -= VTSize;
237 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
238 makeArrayRef(TFOps, i));
241 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
242 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
243 SDValue Size, unsigned Align, bool isVolatile,
244 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
245 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
246 RTLIB::MEMMOVE);
249 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
250 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
251 SDValue Size, unsigned Align, bool isVolatile,
252 MachinePointerInfo DstPtrInfo) const {
253 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
254 RTLIB::MEMSET);