//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//
#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;
24 #define DEBUG_TYPE "x86-selectiondag-info"
27 UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden
, cl::init(false),
28 cl::desc("Use fast short rep mov in memcpy lowering"));
30 bool X86SelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode
) const {
31 return Opcode
>= X86ISD::FIRST_MEMORY_OPCODE
&&
32 Opcode
<= X86ISD::LAST_MEMORY_OPCODE
;
35 bool X86SelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode
) const {
36 return Opcode
>= X86ISD::FIRST_STRICTFP_OPCODE
&&
37 Opcode
<= X86ISD::LAST_STRICTFP_OPCODE
;
40 /// Returns the best type to use with repmovs/repstos depending on alignment.
41 static MVT
getOptimalRepType(const X86Subtarget
&Subtarget
, Align Alignment
) {
42 uint64_t Align
= Alignment
.value();
43 assert((Align
!= 0) && "Align is normalized");
44 assert(isPowerOf2_64(Align
) && "Align is a power of 2");
53 return Subtarget
.is64Bit() ? MVT::i64
: MVT::i32
;
57 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
58 SelectionDAG
&DAG
, ArrayRef
<MCPhysReg
> ClobberSet
) const {
59 // We cannot use TRI->hasBasePointer() until *after* we select all basic
60 // blocks. Legalization may introduce new stack temporaries with large
61 // alignment requirements. Fall back to generic code if there are any
62 // dynamic stack adjustments (hopefully rare) and the base pointer would
63 // conflict if we had to use it.
64 MachineFrameInfo
&MFI
= DAG
.getMachineFunction().getFrameInfo();
65 if (!MFI
.hasVarSizedObjects() && !MFI
.hasOpaqueSPAdjustment())
68 const X86RegisterInfo
*TRI
= static_cast<const X86RegisterInfo
*>(
69 DAG
.getSubtarget().getRegisterInfo());
70 return llvm::is_contained(ClobberSet
, TRI
->getBaseRegister());
73 /// Emit a single REP STOSB instruction for a particular constant size.
74 static SDValue
emitRepstos(const X86Subtarget
&Subtarget
, SelectionDAG
&DAG
,
75 const SDLoc
&dl
, SDValue Chain
, SDValue Dst
,
76 SDValue Val
, SDValue Size
, MVT AVT
) {
77 const bool Use64BitRegs
= Subtarget
.isTarget64BitLP64();
78 unsigned AX
= X86::AL
;
79 switch (AVT
.getSizeInBits()) {
94 const unsigned CX
= Use64BitRegs
? X86::RCX
: X86::ECX
;
95 const unsigned DI
= Use64BitRegs
? X86::RDI
: X86::EDI
;
98 Chain
= DAG
.getCopyToReg(Chain
, dl
, AX
, Val
, InGlue
);
99 InGlue
= Chain
.getValue(1);
100 Chain
= DAG
.getCopyToReg(Chain
, dl
, CX
, Size
, InGlue
);
101 InGlue
= Chain
.getValue(1);
102 Chain
= DAG
.getCopyToReg(Chain
, dl
, DI
, Dst
, InGlue
);
103 InGlue
= Chain
.getValue(1);
105 SDVTList Tys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
106 SDValue Ops
[] = {Chain
, DAG
.getValueType(AVT
), InGlue
};
107 return DAG
.getNode(X86ISD::REP_STOS
, dl
, Tys
, Ops
);
110 /// Emit a single REP STOSB instruction for a particular constant size.
111 static SDValue
emitRepstosB(const X86Subtarget
&Subtarget
, SelectionDAG
&DAG
,
112 const SDLoc
&dl
, SDValue Chain
, SDValue Dst
,
113 SDValue Val
, uint64_t Size
) {
114 return emitRepstos(Subtarget
, DAG
, dl
, Chain
, Dst
, Val
,
115 DAG
.getIntPtrConstant(Size
, dl
), MVT::i8
);
118 /// Returns a REP STOS instruction, possibly with a few load/stores to implement
119 /// a constant size memory set. In some cases where we know REP MOVS is
120 /// inefficient we return an empty SDValue so the calling code can either
121 /// generate a store sequence or call the runtime memset function.
122 static SDValue
emitConstantSizeRepstos(SelectionDAG
&DAG
,
123 const X86Subtarget
&Subtarget
,
124 const SDLoc
&dl
, SDValue Chain
,
125 SDValue Dst
, SDValue Val
, uint64_t Size
,
126 EVT SizeVT
, Align Alignment
,
127 bool isVolatile
, bool AlwaysInline
,
128 MachinePointerInfo DstPtrInfo
) {
129 /// In case we optimize for size, we use repstosb even if it's less efficient
130 /// so we can save the loads/stores of the leftover.
131 if (DAG
.getMachineFunction().getFunction().hasMinSize()) {
132 if (auto *ValC
= dyn_cast
<ConstantSDNode
>(Val
)) {
133 // Special case 0 because otherwise we get large literals,
134 // which causes larger encoding.
135 if ((Size
& 31) == 0 && (ValC
->getZExtValue() & 255) == 0) {
136 MVT BlockType
= MVT::i32
;
137 const uint64_t BlockBits
= BlockType
.getSizeInBits();
138 const uint64_t BlockBytes
= BlockBits
/ 8;
139 const uint64_t BlockCount
= Size
/ BlockBytes
;
141 Val
= DAG
.getConstant(0, dl
, BlockType
);
142 // repstosd is same size as repstosb
143 return emitRepstos(Subtarget
, DAG
, dl
, Chain
, Dst
, Val
,
144 DAG
.getIntPtrConstant(BlockCount
, dl
), BlockType
);
147 return emitRepstosB(Subtarget
, DAG
, dl
, Chain
, Dst
, Val
, Size
);
150 if (Size
> Subtarget
.getMaxInlineSizeThreshold())
153 // If not DWORD aligned or size is more than the threshold, call the library.
154 // The libc version is likely to be faster for these cases. It can use the
155 // address value and run time information about the CPU.
156 if (Alignment
< Align(4))
159 MVT BlockType
= MVT::i8
;
160 uint64_t BlockCount
= Size
;
161 uint64_t BytesLeft
= 0;
163 SDValue OriginalVal
= Val
;
164 if (auto *ValC
= dyn_cast
<ConstantSDNode
>(Val
)) {
165 BlockType
= getOptimalRepType(Subtarget
, Alignment
);
166 uint64_t Value
= ValC
->getZExtValue() & 255;
167 const uint64_t BlockBits
= BlockType
.getSizeInBits();
170 Value
= (Value
<< 8) | Value
;
173 Value
= (Value
<< 16) | Value
;
176 Value
= (Value
<< 32) | Value
;
178 const uint64_t BlockBytes
= BlockBits
/ 8;
179 BlockCount
= Size
/ BlockBytes
;
180 BytesLeft
= Size
% BlockBytes
;
181 Val
= DAG
.getConstant(Value
, dl
, BlockType
);
185 emitRepstos(Subtarget
, DAG
, dl
, Chain
, Dst
, Val
,
186 DAG
.getIntPtrConstant(BlockCount
, dl
), BlockType
);
187 /// RepStos can process the whole length.
191 // Handle the last 1 - 7 bytes.
192 SmallVector
<SDValue
, 4> Results
;
193 Results
.push_back(RepStos
);
194 unsigned Offset
= Size
- BytesLeft
;
195 EVT AddrVT
= Dst
.getValueType();
198 DAG
.getMemset(Chain
, dl
,
199 DAG
.getNode(ISD::ADD
, dl
, AddrVT
, Dst
,
200 DAG
.getConstant(Offset
, dl
, AddrVT
)),
201 OriginalVal
, DAG
.getConstant(BytesLeft
, dl
, SizeVT
),
202 Alignment
, isVolatile
, AlwaysInline
,
203 /* CI */ nullptr, DstPtrInfo
.getWithOffset(Offset
)));
205 return DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
, Results
);
208 SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemset(
209 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Val
,
210 SDValue Size
, Align Alignment
, bool isVolatile
, bool AlwaysInline
,
211 MachinePointerInfo DstPtrInfo
) const {
212 // If to a segment-relative address space, use the default lowering.
213 if (DstPtrInfo
.getAddrSpace() >= 256)
216 // If the base register might conflict with our physical registers, bail out.
217 const MCPhysReg ClobberSet
[] = {X86::RCX
, X86::RAX
, X86::RDI
,
218 X86::ECX
, X86::EAX
, X86::EDI
};
219 if (isBaseRegConflictPossible(DAG
, ClobberSet
))
222 ConstantSDNode
*ConstantSize
= dyn_cast
<ConstantSDNode
>(Size
);
226 const X86Subtarget
&Subtarget
=
227 DAG
.getMachineFunction().getSubtarget
<X86Subtarget
>();
228 return emitConstantSizeRepstos(
229 DAG
, Subtarget
, dl
, Chain
, Dst
, Val
, ConstantSize
->getZExtValue(),
230 Size
.getValueType(), Alignment
, isVolatile
, AlwaysInline
, DstPtrInfo
);
233 /// Emit a single REP MOVS{B,W,D,Q} instruction.
234 static SDValue
emitRepmovs(const X86Subtarget
&Subtarget
, SelectionDAG
&DAG
,
235 const SDLoc
&dl
, SDValue Chain
, SDValue Dst
,
236 SDValue Src
, SDValue Size
, MVT AVT
) {
237 const bool Use64BitRegs
= Subtarget
.isTarget64BitLP64();
238 const unsigned CX
= Use64BitRegs
? X86::RCX
: X86::ECX
;
239 const unsigned DI
= Use64BitRegs
? X86::RDI
: X86::EDI
;
240 const unsigned SI
= Use64BitRegs
? X86::RSI
: X86::ESI
;
243 Chain
= DAG
.getCopyToReg(Chain
, dl
, CX
, Size
, InGlue
);
244 InGlue
= Chain
.getValue(1);
245 Chain
= DAG
.getCopyToReg(Chain
, dl
, DI
, Dst
, InGlue
);
246 InGlue
= Chain
.getValue(1);
247 Chain
= DAG
.getCopyToReg(Chain
, dl
, SI
, Src
, InGlue
);
248 InGlue
= Chain
.getValue(1);
250 SDVTList Tys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
251 SDValue Ops
[] = {Chain
, DAG
.getValueType(AVT
), InGlue
};
252 return DAG
.getNode(X86ISD::REP_MOVS
, dl
, Tys
, Ops
);
255 /// Emit a single REP MOVSB instruction for a particular constant size.
256 static SDValue
emitRepmovsB(const X86Subtarget
&Subtarget
, SelectionDAG
&DAG
,
257 const SDLoc
&dl
, SDValue Chain
, SDValue Dst
,
258 SDValue Src
, uint64_t Size
) {
259 return emitRepmovs(Subtarget
, DAG
, dl
, Chain
, Dst
, Src
,
260 DAG
.getIntPtrConstant(Size
, dl
), MVT::i8
);
263 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
264 /// a constant size memory copy. In some cases where we know REP MOVS is
265 /// inefficient we return an empty SDValue so the calling code can either
266 /// generate a load/store sequence or call the runtime memcpy function.
267 static SDValue
emitConstantSizeRepmov(
268 SelectionDAG
&DAG
, const X86Subtarget
&Subtarget
, const SDLoc
&dl
,
269 SDValue Chain
, SDValue Dst
, SDValue Src
, uint64_t Size
, EVT SizeVT
,
270 Align Alignment
, bool isVolatile
, bool AlwaysInline
,
271 MachinePointerInfo DstPtrInfo
, MachinePointerInfo SrcPtrInfo
) {
272 /// In case we optimize for size, we use repmovsb even if it's less efficient
273 /// so we can save the loads/stores of the leftover.
274 if (DAG
.getMachineFunction().getFunction().hasMinSize())
275 return emitRepmovsB(Subtarget
, DAG
, dl
, Chain
, Dst
, Src
, Size
);
277 /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
279 if (!AlwaysInline
&& Size
> Subtarget
.getMaxInlineSizeThreshold())
282 /// If we have enhanced repmovs we use it.
283 if (Subtarget
.hasERMSB())
284 return emitRepmovsB(Subtarget
, DAG
, dl
, Chain
, Dst
, Src
, Size
);
286 assert(!Subtarget
.hasERMSB() && "No efficient RepMovs");
287 /// We assume runtime memcpy will do a better job for unaligned copies when
288 /// ERMS is not present.
289 if (!AlwaysInline
&& (Alignment
< Align(4)))
292 const MVT BlockType
= getOptimalRepType(Subtarget
, Alignment
);
293 const uint64_t BlockBytes
= BlockType
.getSizeInBits() / 8;
294 const uint64_t BlockCount
= Size
/ BlockBytes
;
295 const uint64_t BytesLeft
= Size
% BlockBytes
;
297 emitRepmovs(Subtarget
, DAG
, dl
, Chain
, Dst
, Src
,
298 DAG
.getIntPtrConstant(BlockCount
, dl
), BlockType
);
300 /// RepMov can process the whole length.
304 assert(BytesLeft
&& "We have leftover at this point");
306 // Handle the last 1 - 7 bytes.
307 SmallVector
<SDValue
, 4> Results
;
308 Results
.push_back(RepMovs
);
309 unsigned Offset
= Size
- BytesLeft
;
310 EVT DstVT
= Dst
.getValueType();
311 EVT SrcVT
= Src
.getValueType();
312 Results
.push_back(DAG
.getMemcpy(
314 DAG
.getNode(ISD::ADD
, dl
, DstVT
, Dst
, DAG
.getConstant(Offset
, dl
, DstVT
)),
315 DAG
.getNode(ISD::ADD
, dl
, SrcVT
, Src
, DAG
.getConstant(Offset
, dl
, SrcVT
)),
316 DAG
.getConstant(BytesLeft
, dl
, SizeVT
), Alignment
, isVolatile
,
317 /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt
,
318 DstPtrInfo
.getWithOffset(Offset
), SrcPtrInfo
.getWithOffset(Offset
)));
319 return DAG
.getNode(ISD::TokenFactor
, dl
, MVT::Other
, Results
);
322 SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
323 SelectionDAG
&DAG
, const SDLoc
&dl
, SDValue Chain
, SDValue Dst
, SDValue Src
,
324 SDValue Size
, Align Alignment
, bool isVolatile
, bool AlwaysInline
,
325 MachinePointerInfo DstPtrInfo
, MachinePointerInfo SrcPtrInfo
) const {
326 // If to a segment-relative address space, use the default lowering.
327 if (DstPtrInfo
.getAddrSpace() >= 256 || SrcPtrInfo
.getAddrSpace() >= 256)
330 // If the base registers conflict with our physical registers, use the default
332 const MCPhysReg ClobberSet
[] = {X86::RCX
, X86::RSI
, X86::RDI
,
333 X86::ECX
, X86::ESI
, X86::EDI
};
334 if (isBaseRegConflictPossible(DAG
, ClobberSet
))
337 const X86Subtarget
&Subtarget
=
338 DAG
.getMachineFunction().getSubtarget
<X86Subtarget
>();
340 // If enabled and available, use fast short rep mov.
341 if (UseFSRMForMemcpy
&& Subtarget
.hasFSRM())
342 return emitRepmovs(Subtarget
, DAG
, dl
, Chain
, Dst
, Src
, Size
, MVT::i8
);
344 /// Handle constant sizes
345 if (ConstantSDNode
*ConstantSize
= dyn_cast
<ConstantSDNode
>(Size
))
346 return emitConstantSizeRepmov(DAG
, Subtarget
, dl
, Chain
, Dst
, Src
,
347 ConstantSize
->getZExtValue(),
348 Size
.getValueType(), Alignment
, isVolatile
,
349 AlwaysInline
, DstPtrInfo
, SrcPtrInfo
);