//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the X86SelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "x86-selectiondag-info"
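
// Internal option, e.g. `llc -x86-use-fsrm-for-memcpy`; cl::Hidden keeps it
// out of the regular --help listing (it shows under --help-hidden).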
static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

bool X86SelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
  return Opcode >= X86ISD::FIRST_MEMORY_OPCODE &&
         Opcode <= X86ISD::LAST_MEMORY_OPCODE;
}

bool X86SelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const {
  return Opcode >= X86ISD::FIRST_STRICTFP_OPCODE &&
         Opcode <= X86ISD::LAST_STRICTFP_OPCODE;
}

/// Returns the best type to use with repmovs/repstos depending on alignment.
static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
  uint64_t Align = Alignment.value();
  assert((Align != 0) && "Align is normalized");
  assert(isPowerOf2_64(Align) && "Align is a power of 2");
  switch (Align) {
  case 1:
    return MVT::i8;
  case 2:
    return MVT::i16;
  case 4:
    return MVT::i32;
  default:
    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
  }
}

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
  // blocks. Legalization may introduce new stack temporaries with large
  // alignment requirements. Fall back to generic code if there are any
  // dynamic stack adjustments (hopefully rare) and the base pointer would
  // conflict if we had to use it.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  if (!MFI.hasVarSizedObjects() && !MFI.hasOpaqueSPAdjustment())
    return false;

  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}

/// Emit a single REP STOS{B,W,D,Q} instruction.
static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Val, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  unsigned AX = X86::AL;
  switch (AVT.getSizeInBits()) {
  case 8:
    AX = X86::AL;
    break;
  case 16:
    AX = X86::AX;
    break;
  case 32:
    AX = X86::EAX;
    break;
  default:
    AX = X86::RAX;
    break;
  }

  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
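
  // REP STOS takes its operands in fixed registers: the value in
  // AL/AX/EAX/RAX, the element count in (R|E)CX and the destination in
  // (R|E)DI. The glue threaded through the CopyToReg nodes below keeps them
  // attached to the REP_STOS node so the registers cannot be clobbered in
  // between.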
  SDValue InGlue;
  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
}

/// Emit a single REP STOSB instruction for a particular constant size.
static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Val, uint64_t Size) {
  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns a REP STOS instruction, possibly with a few load/stores to implement
/// a constant size memory set. In some cases where we know REP STOS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a store sequence or call the runtime memset function.
static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       const SDLoc &dl, SDValue Chain,
                                       SDValue Dst, SDValue Val, uint64_t Size,
                                       EVT SizeVT, Align Alignment,
                                       bool isVolatile, bool AlwaysInline,
                                       MachinePointerInfo DstPtrInfo) {
  /// In case we optimize for size, we use repstosb even if it's less efficient
  /// so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
      // Special case 0 because otherwise we get large literals,
      // which causes larger encoding.
      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
        MVT BlockType = MVT::i32;
        const uint64_t BlockBits = BlockType.getSizeInBits();
        const uint64_t BlockBytes = BlockBits / 8;
        const uint64_t BlockCount = Size / BlockBytes;
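
        // E.g. a 64-byte zero memset becomes REP STOSD with a count of 16;
        // zero is also the one value whose i32 splat is free to materialize.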
        Val = DAG.getConstant(0, dl, BlockType);
        // repstosd is same size as repstosb
        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
      }
    }
    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
  }

  if (Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if (Alignment < Align(4))
    return SDValue();

  MVT BlockType = MVT::i8;
  uint64_t BlockCount = Size;
  uint64_t BytesLeft = 0;

  SDValue OriginalVal = Val;
  if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
    BlockType = getOptimalRepType(Subtarget, Alignment);
    uint64_t Value = ValC->getZExtValue() & 255;
    const uint64_t BlockBits = BlockType.getSizeInBits();
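
    // Splat the byte across the rep element by repeated doubling, e.g.
    // 0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB.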
    if (BlockBits >= 16)
      Value = (Value << 8) | Value;

    if (BlockBits >= 32)
      Value = (Value << 16) | Value;

    if (BlockBits >= 64)
      Value = (Value << 32) | Value;

    const uint64_t BlockBytes = BlockBits / 8;
    BlockCount = Size / BlockBytes;
    BytesLeft = Size % BlockBytes;
    Val = DAG.getConstant(Value, dl, BlockType);
  }

  SDValue RepStos =
      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
  /// RepStos can process the whole length.
  if (BytesLeft == 0)
    return RepStos;

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepStos);
  unsigned Offset = Size - BytesLeft;
  EVT AddrVT = Dst.getValueType();
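
  // Note that the trailing memset is chained to the original Chain, not to
  // RepStos; the two stores touch disjoint bytes, and the TokenFactor below
  // joins both chains for the caller.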
  Results.push_back(
      DAG.getMemset(Chain, dl,
                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                DAG.getConstant(Offset, dl, AddrVT)),
                    OriginalVal, DAG.getConstant(BytesLeft, dl, SizeVT),
                    Alignment, isVolatile, AlwaysInline,
                    /* CI */ nullptr, DstPtrInfo.getWithOffset(Offset)));

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo) const {
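  // On x86, address spaces 256, 257 and 258 refer to the GS, FS and SS
  // segments; the rep-string lowering below assumes flat addressing.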
  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
  return emitConstantSizeRepstos(
      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
                           SDValue Src, SDValue Size, MVT AVT) {
  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
  const unsigned SI = Use64BitRegs ? X86::RSI : X86::ESI;

  SDValue InGlue;
  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, SI, Src, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
  return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
}

/// Emit a single REP MOVSB instruction for a particular constant size.
static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
                            SDValue Src, uint64_t Size) {
  return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
/// generate a load/store sequence or call the runtime memcpy function.
static SDValue emitConstantSizeRepmov(
    SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl,
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
  /// In case we optimize for size, we use repmovsb even if it's less efficient
  /// so we can save the loads/stores of the leftover.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  /// TODO: Revisit next line: big copies with ERMSB on march >= Haswell are
  /// very efficient.
  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();
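
  // ERMSB ("enhanced REP MOVSB/STOSB") makes REP MOVSB efficient for large
  // copies regardless of element size, so the whole copy can be a single
  // REP MOVSB with no leftover handling.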
  /// If we have enhanced repmovs we use it.
  if (Subtarget.hasERMSB())
    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  /// We assume runtime memcpy will do a better job for unaligned copies when
  /// ERMS is not present.
  if (!AlwaysInline && (Alignment < Align(4)))
    return SDValue();

  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
  SDValue RepMovs =
      emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);

  /// RepMov can process the whole length.
  if (BytesLeft == 0)
    return RepMovs;

  assert(BytesLeft && "We have leftover at this point");

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  unsigned Offset = Size - BytesLeft;
  EVT DstVT = Dst.getValueType();
  EVT SrcVT = Src.getValueType();
  Results.push_back(DAG.getMemcpy(
      Chain, dl,
      DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
      DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
      DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, isVolatile,
      /*AlwaysInline*/ true, /*CI=*/nullptr, std::nullopt,
      DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base registers conflict with our physical registers, use the
  // default lowering.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
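
  // FSRM ("fast short REP MOV") indicates that REP MOVSB is fast even for
  // short copies, so it can be used unconditionally here, including for
  // sizes that are unknown at compile time.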
  // If enabled and available, use fast short rep mov.
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

  /// Handle constant sizes.
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                  ConstantSize->getZExtValue(),
                                  Size.getValueType(), Alignment, isVolatile,
                                  AlwaysInline, DstPtrInfo, SrcPtrInfo);

  return SDValue();
}