//===-- lib/CodeGen/GlobalISel/GICombinerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "gi-combiner"

using namespace llvm;

// Option to allow testing of the combiner while no targets know about indexed
// addressing.
static cl::opt<bool>
    ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
                       cl::desc("Force all indexed operations to be "
                                "legal for the GlobalISel combiner"));
CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
                               MachineIRBuilder &B, GISelKnownBits *KB,
                               MachineDominatorTree *MDT)
    : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
      KB(KB), MDT(MDT) {
  (void)this->KB;
}
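
// Merge FromReg into ToReg when their register attributes are compatible;
// otherwise ToReg is defined as an explicit COPY of FromReg so that
// incompatible register classes/banks are still handled correctly.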
void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
                                    Register ToReg) const {
  Observer.changingAllUsesOfReg(MRI, FromReg);

  if (MRI.constrainRegAttrs(ToReg, FromReg))
    MRI.replaceRegWith(FromReg, ToReg);
  else
    Builder.buildCopy(ToReg, FromReg);

  Observer.finishedChangingAllUsesOfReg();
}
void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
                                      MachineOperand &FromRegOp,
                                      Register ToReg) const {
  assert(FromRegOp.getParent() && "Expected an operand in an MI");
  Observer.changingInstr(*FromRegOp.getParent());

  FromRegOp.setReg(ToReg);

  Observer.changedInstr(*FromRegOp.getParent());
}
bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
  if (matchCombineCopy(MI)) {
    applyCombineCopy(MI);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::COPY)
    return false;
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  // Simple Copy Propagation.
  // a(sx) = COPY b(sx) -> Replace all uses of a with b.
  if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy)
    return true;
  return false;
}
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, SrcReg);
}
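
// Illustrative example (not from the original source) of the concat_vectors
// flattening performed below:
//   %a:_(<2 x s32>) = G_BUILD_VECTOR %x(s32), %y(s32)
//   %b:_(<2 x s32>) = G_IMPLICIT_DEF
//   %c:_(<4 x s32>) = G_CONCAT_VECTORS %a(<2 x s32>), %b(<2 x s32>)
// is rewritten into:
//   %u:_(s32) = G_IMPLICIT_DEF
//   %c:_(<4 x s32>) = G_BUILD_VECTOR %x(s32), %y(s32), %u(s32), %u(s32)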
bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
  bool IsUndef = false;
  SmallVector<Register, 4> Ops;
  if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
    applyCombineConcatVectors(MI, IsUndef, Ops);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
                                               SmallVectorImpl<Register> &Ops) {
  assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
         "Invalid instruction");
  IsUndef = true;
  MachineInstr *Undef = nullptr;

  // Walk over all the operands of concat vectors and check if they are
  // build_vector themselves or undef.
  // Then collect their operands in Ops.
  for (const MachineOperand &MO : MI.operands()) {
    // Skip the instruction definition.
    if (MO.isDef())
      continue;
    Register Reg = MO.getReg();
    MachineInstr *Def = MRI.getVRegDef(Reg);
    assert(Def && "Operand not defined");
    switch (Def->getOpcode()) {
    case TargetOpcode::G_BUILD_VECTOR:
      IsUndef = false;
      // Remember the operands of the build_vector to fold
      // them into the yet-to-build flattened concat vectors.
      for (const MachineOperand &BuildVecMO : Def->operands()) {
        // Skip the definition.
        if (BuildVecMO.isDef())
          continue;
        Ops.push_back(BuildVecMO.getReg());
      }
      break;
    case TargetOpcode::G_IMPLICIT_DEF: {
      LLT OpType = MRI.getType(Reg);
      // Keep one undef value for all the undef operands.
      if (!Undef) {
        Builder.setInsertPt(*MI.getParent(), MI);
        Undef = Builder.buildUndef(OpType.getScalarType());
      }
      assert(MRI.getType(Undef->getOperand(0).getReg()) ==
                 OpType.getScalarType() &&
             "All undefs should have the same type");
      // Break the undef vector into as many scalar elements as needed
      // for the flattening.
      for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
           EltIdx != EltEnd; ++EltIdx)
        Ops.push_back(Undef->getOperand(0).getReg());
      break;
    }
    default:
      return false;
    }
  }
  return true;
}
void CombinerHelper::applyCombineConcatVectors(
    MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
  // We determined that the concat_vectors can be flattened.
  // Generate the flattened build_vector.
  Register DstReg = MI.getOperand(0).getReg();
  Builder.setInsertPt(*MI.getParent(), MI);
  Register NewDstReg = MRI.cloneVirtualRegister(DstReg);

  // Note: IsUndef is sort of redundant. We could have determined it by
  // checking that all of Ops are undef. Alternatively, we could have
  // generated a build_vector of undefs and relied on another combine to
  // clean that up. For now, given that we already gather this information
  // in tryCombineConcatVectors, just save compile time and issue the
  // right thing.
  if (IsUndef)
    Builder.buildUndef(NewDstReg);
  else
    Builder.buildBuildVector(NewDstReg, Ops);
  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, NewDstReg);
}
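
// Illustrative example (not from the original source) of the shuffle_vector
// combine performed below:
//   %z:_(<4 x s32>) = G_SHUFFLE_VECTOR %a(<2 x s32>), %b(<2 x s32>),
//                                      shufflemask(2, 3, 0, 1)
// is rewritten into:
//   %z:_(<4 x s32>) = G_CONCAT_VECTORS %b(<2 x s32>), %a(<2 x s32>)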
bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
  SmallVector<Register, 4> Ops;
  if (matchCombineShuffleVector(MI, Ops)) {
    applyCombineShuffleVector(MI, Ops);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
                                               SmallVectorImpl<Register> &Ops) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Invalid instruction kind");
  LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  Register Src1 = MI.getOperand(1).getReg();
  LLT SrcType = MRI.getType(Src1);
  unsigned DstNumElts = DstType.getNumElements();
  unsigned SrcNumElts = SrcType.getNumElements();

  // If the resulting vector is smaller than the size of the source
  // vectors being concatenated, we won't be able to replace the
  // shuffle vector with a concat_vectors.
  //
  // Note: We may still be able to produce a concat_vectors fed by
  //       extract_vector_elt and so on. It is less clear that would
  //       be better though, so don't bother for now.
  if (DstNumElts < 2 * SrcNumElts)
    return false;

  // Check that the shuffle mask can be broken evenly between the
  // different sources.
  if (DstNumElts % SrcNumElts != 0)
    return false;

  // Mask length is a multiple of the source vector length.
  // Check if the shuffle is some kind of concatenation of the input
  // vectors.
  unsigned NumConcat = DstNumElts / SrcNumElts;
  SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
  SmallVector<int, 8> Mask;
  ShuffleVectorInst::getShuffleMask(MI.getOperand(3).getShuffleMask(), Mask);
  for (unsigned i = 0; i != DstNumElts; ++i) {
    int Idx = Mask[i];
    // Undef value.
    if (Idx < 0)
      continue;
    // Ensure the indices in each SrcType sized piece are sequential and that
    // the same source is used for the whole piece.
    if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
        (ConcatSrcs[i / SrcNumElts] >= 0 &&
         ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
      return false;
    // Remember which source this index came from.
    ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
  }

  // The shuffle is concatenating multiple vectors together.
  // Collect the different operands for that.
  Register UndefReg;
  Register Src2 = MI.getOperand(2).getReg();
  for (auto Src : ConcatSrcs) {
    if (Src < 0) {
      if (!UndefReg) {
        Builder.setInsertPt(*MI.getParent(), MI);
        UndefReg = Builder.buildUndef(SrcType).getReg(0);
      }
      Ops.push_back(UndefReg);
    } else if (Src == 0)
      Ops.push_back(Src1);
    else
      Ops.push_back(Src2);
  }
  return true;
}
void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
                                               const ArrayRef<Register> Ops) {
  Register DstReg = MI.getOperand(0).getReg();
  Builder.setInsertPt(*MI.getParent(), MI);
  Register NewDstReg = MRI.cloneVirtualRegister(DstReg);

  Builder.buildConcatVectors(NewDstReg, Ops);

  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, NewDstReg);
}
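
// The extending-loads combine below inspects every extend of a loaded value,
// picks a single "preferred" extend (type and opcode), folds it into the load
// itself, and then patches the remaining uses up with truncates or extends of
// the new, wider load result.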
namespace {

/// Select a preference between two uses. CurrentUse is the current preference
/// while the *ForCandidate values describe the candidate under consideration.
PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
                                  const LLT &TyForCandidate,
                                  unsigned OpcodeForCandidate,
                                  MachineInstr *MIForCandidate) {
  if (!CurrentUse.Ty.isValid()) {
    if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
        CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
    return CurrentUse;
  }

  // We permit the extend to hoist through basic blocks but this is only
  // sensible if the target has extending loads. If you end up lowering back
  // into a load and extend during the legalizer then the end result is
  // hoisting the extend up to the load.

  // Prefer defined extensions to undefined extensions as these are more
  // likely to reduce the number of instructions.
  if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
      CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
    return CurrentUse;
  else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
           OpcodeForCandidate != TargetOpcode::G_ANYEXT)
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};

  // Prefer sign extensions to zero extensions as sign-extensions tend to be
  // more expensive.
  if (CurrentUse.Ty == TyForCandidate) {
    if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
        OpcodeForCandidate == TargetOpcode::G_ZEXT)
      return CurrentUse;
    else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
             OpcodeForCandidate == TargetOpcode::G_SEXT)
      return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }

  // This is potentially target specific. We've chosen the largest type
  // because G_TRUNC is usually free. One potential catch with this is that
  // some targets have a reduced number of larger registers than smaller
  // registers and this choice potentially increases the live-range for the
  // larger value.
  if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
    return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
  }
  return CurrentUse;
}
/// Find a suitable place to insert some instructions and insert them. This
/// function accounts for special cases like inserting before a PHI node.
/// The current strategy for inserting before PHIs is to duplicate the
/// instructions for each predecessor. However, while that's ok for G_TRUNC
/// on most targets since it generally requires no code, other targets/cases may
/// want to try harder to find a dominating block.
static void InsertInsnsWithoutSideEffectsBeforeUse(
    MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
    std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
                       MachineOperand &UseMO)>
        Inserter) {
  MachineInstr &UseMI = *UseMO.getParent();

  MachineBasicBlock *InsertBB = UseMI.getParent();

  // If the use is a PHI then we want the predecessor block instead.
  if (UseMI.isPHI()) {
    MachineOperand *PredBB = std::next(&UseMO);
    InsertBB = PredBB->getMBB();
  }

  // If the block is the same block as the def then we want to insert just after
  // the def instead of at the start of the block.
  if (InsertBB == DefMI.getParent()) {
    MachineBasicBlock::iterator InsertPt = &DefMI;
    Inserter(InsertBB, std::next(InsertPt), UseMO);
    return;
  }

  // Otherwise we want the start of the BB.
  Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
}
} // end anonymous namespace
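
// Example of the extending-loads combine in its simplest form:
//   %v:_(s8) = G_LOAD %ptr
//   %e:_(s32) = G_SEXT %v(s8)
// becomes
//   %e:_(s32) = G_SEXTLOAD %ptr
// with any remaining uses of %v rewritten to use a G_TRUNC of %e.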
bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
  PreferredTuple Preferred;
  if (matchCombineExtendingLoads(MI, Preferred)) {
    applyCombineExtendingLoads(MI, Preferred);
    return true;
  }
  return false;
}
bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // We match the loads and follow the uses to the extend instead of matching
  // the extends and following the def to the load. This is because the load
  // must remain in the same position for correctness (unless we also add code
  // to find a safe place to sink it) whereas the extend is freely movable.
  // It also prevents us from duplicating the load for the volatile case or just
  // for performance.

  if (MI.getOpcode() != TargetOpcode::G_LOAD &&
      MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
      MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
    return false;

  auto &LoadValue = MI.getOperand(0);
  assert(LoadValue.isReg() && "Result wasn't a register?");

  LLT LoadValueTy = MRI.getType(LoadValue.getReg());
  if (!LoadValueTy.isScalar())
    return false;

  // Most architectures are going to legalize <s8 loads into at least a 1 byte
  // load, and the MMOs can only describe memory accesses in multiples of bytes.
  // If we try to perform extload combining on those, we can end up with
  // %a(s8) = extload %ptr (load 1 byte from %ptr)
  // ... which is an illegal extload instruction.
  if (LoadValueTy.getSizeInBits() < 8)
    return false;

  // Non-power-of-2 types will very likely be legalized into multiple loads.
  // Don't bother trying to match them into extending loads.
  if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
    return false;

  // Find the preferred type aside from the any-extends (unless it's the only
  // one) and non-extending ops. We'll emit an extending load to that type and
  // emit a variant of (extend (trunc X)) for the others according to the
  // relative type sizes. At the same time, pick an extend to use based on the
  // extend involved in the chosen type.
  unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
                                 ? TargetOpcode::G_ANYEXT
                                 : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
                                       ? TargetOpcode::G_SEXT
                                       : TargetOpcode::G_ZEXT;
  Preferred = {LLT(), PreferredOpcode, nullptr};
  for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
    if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
        UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
      Preferred = ChoosePreferredUse(Preferred,
                                     MRI.getType(UseMI.getOperand(0).getReg()),
                                     UseMI.getOpcode(), &UseMI);
    }
  }

  // There were no extends.
  if (!Preferred.MI)
    return false;
  // It should be impossible to choose an extend without selecting a different
  // type since by definition the result of an extend is larger.
  assert(Preferred.Ty != LoadValueTy && "Extending to same type?");

  LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
  return true;
}
void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
                                                PreferredTuple &Preferred) {
  // Rewrite the load to the chosen extending load.
  Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();

  // Inserter to insert a truncate back to the original type at a given point
  // with some basic CSE to limit truncate duplication to one per BB.
  DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
  auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
                           MachineBasicBlock::iterator InsertBefore,
                           MachineOperand &UseMO) {
    MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
    if (PreviouslyEmitted) {
      Observer.changingInstr(*UseMO.getParent());
      UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
      Observer.changedInstr(*UseMO.getParent());
      return;
    }

    Builder.setInsertPt(*InsertIntoBB, InsertBefore);
    Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
    MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
    EmittedInsns[InsertIntoBB] = NewMI;
    replaceRegOpWith(MRI, UseMO, NewDstReg);
  };

  Observer.changingInstr(MI);
  MI.setDesc(
      Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
                               ? TargetOpcode::G_SEXTLOAD
                               : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
                                     ? TargetOpcode::G_ZEXTLOAD
                                     : TargetOpcode::G_LOAD));

  // Rewrite all the uses to fix up the types.
  auto &LoadValue = MI.getOperand(0);
  SmallVector<MachineOperand *, 4> Uses;
  for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
    Uses.push_back(&UseMO);

  for (auto *UseMO : Uses) {
    MachineInstr *UseMI = UseMO->getParent();

    // If the extend is compatible with the preferred extend then we should fix
    // up the type and extend so that it uses the preferred use.
    if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
        UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
      Register UseDstReg = UseMI->getOperand(0).getReg();
      MachineOperand &UseSrcMO = UseMI->getOperand(1);
      const LLT &UseDstTy = MRI.getType(UseDstReg);
      if (UseDstReg != ChosenDstReg) {
        if (Preferred.Ty == UseDstTy) {
          // If the use has the same type as the preferred use, then merge
          // the vregs and erase the extend. For example:
          //    %1:_(s8) = G_LOAD ...
          //    %2:_(s32) = G_SEXT %1(s8)
          //    %3:_(s32) = G_ANYEXT %1(s8)
          // rewrites to:
          //    %2:_(s32) = G_SEXTLOAD ...
          replaceRegWith(MRI, UseDstReg, ChosenDstReg);
          Observer.erasingInstr(*UseMO->getParent());
          UseMO->getParent()->eraseFromParent();
        } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
          // If the preferred size is smaller, then keep the extend but extend
          // from the result of the extending load. For example:
          //    %1:_(s8) = G_LOAD ...
          //    %2:_(s32) = G_SEXT %1(s8)
          //    %3:_(s64) = G_ANYEXT %1(s8)
          // rewrites to:
          //    %2:_(s32) = G_SEXTLOAD ...
          //    %3:_(s64) = G_ANYEXT %2:_(s32)
          replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
        } else {
          // If the preferred size is larger, then insert a truncate. For
          // example:
          //    %1:_(s8) = G_LOAD ...
          //    %2:_(s64) = G_SEXT %1(s8)
          //    %3:_(s32) = G_ZEXT %1(s8)
          // rewrites to:
          //    %2:_(s64) = G_SEXTLOAD ...
          //    %4:_(s8) = G_TRUNC %2:_(s64)
          //    %3:_(s32) = G_ZEXT %4:_(s8)
          InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
                                                 InsertTruncAt);
        }
        continue;
      }
      // The use is (one of) the uses of the preferred use we chose earlier.
      // We're going to update the load to def this value later so just erase
      // the old extend.
      Observer.erasingInstr(*UseMO->getParent());
      UseMO->getParent()->eraseFromParent();
      continue;
    }

    // The use isn't an extend. Truncate back to the type we originally loaded.
    // This is free on many targets.
    InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
  }

  MI.getOperand(0).setReg(ChosenDstReg);
  Observer.changedInstr(MI);
}
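
// Returns true if DefMI comes before UseMI within their common basic block.
// Both instructions must belong to the same block.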
bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
  assert(DefMI.getParent() == UseMI.getParent());
  if (&DefMI == &UseMI)
    return false;

  // Loop through the basic block until we find one of the instructions.
  MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
  for (auto E = DefMI.getParent()->end(); I != E; ++I)
    if (&*I == &DefMI || &*I == &UseMI)
      return &*I == &DefMI;

  llvm_unreachable("Block must contain instructions");
}
bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
  if (MDT)
    return MDT->dominates(&DefMI, &UseMI);
  else if (DefMI.getParent() != UseMI.getParent())
    return false;

  return isPredecessor(DefMI, UseMI);
}
bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
                                            Register &Base, Register &Offset) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();

#ifndef NDEBUG
  unsigned Opcode = MI.getOpcode();
  assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
         Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
#endif

  Base = MI.getOperand(1).getReg();
  MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
  if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
    return false;

  LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);

  for (auto &Use : MRI.use_instructions(Base)) {
    if (Use.getOpcode() != TargetOpcode::G_GEP)
      continue;

    Offset = Use.getOperand(2).getReg();
    if (!ForceLegalIndexing &&
        !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
      LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
                        << Use);
      continue;
    }

    // Make sure the offset calculation is before the potentially indexed op.
    // FIXME: we really care about dependency here. The offset calculation might
    // be movable.
    MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
    if (!OffsetDef || !dominates(*OffsetDef, MI)) {
      LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
                        << Use);
      continue;
    }

    // FIXME: check whether all uses of Base are load/store with foldable
    // addressing modes. If so, using the normal addr-modes is better than
    // forming an indexed one.

    bool MemOpDominatesAddrUses = true;
    for (auto &GEPUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
      if (!dominates(MI, GEPUse)) {
        MemOpDominatesAddrUses = false;
        break;
      }
    }

    if (!MemOpDominatesAddrUses) {
      LLVM_DEBUG(
          dbgs() << "    Ignoring candidate as memop does not dominate uses: "
                 << Use);
      continue;
    }

    LLVM_DEBUG(dbgs() << "    Found match: " << Use);
    Addr = Use.getOperand(0).getReg();
    return true;
  }

  return false;
}
bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
                                           Register &Base, Register &Offset) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();

#ifndef NDEBUG
  unsigned Opcode = MI.getOpcode();
  assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
         Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
#endif

  Addr = MI.getOperand(1).getReg();
  MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_GEP, Addr, MRI);
  if (!AddrDef || MRI.hasOneUse(Addr))
    return false;

  Base = AddrDef->getOperand(1).getReg();
  Offset = AddrDef->getOperand(2).getReg();

  LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);

  if (!ForceLegalIndexing &&
      !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
    LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
    return false;
  }

  MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
  if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
    return false;
  }

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    // Would require a copy.
    if (Base == MI.getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
      return false;
    }

    // We're expecting one use of Addr in MI, but it could also be the
    // value stored, which isn't actually dominated by the instruction.
    if (MI.getOperand(0).getReg() == Addr) {
      LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
      return false;
    }
  }

  // FIXME: check whether all uses of the base pointer are constant GEPs. That
  // might allow us to end base's liveness here by adjusting the constant.

  for (auto &UseMI : MRI.use_instructions(Addr)) {
    if (!dominates(MI, UseMI)) {
      LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
      return false;
    }
  }

  return true;
}
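
// Illustrative example (not from the original source) of the post-indexed
// rewrite performed below:
//   %val:_(s32) = G_LOAD %base(p0)
//   %next:_(p0) = G_GEP %base, %offset(s64)
// becomes, roughly:
//   %val:_(s32), %next:_(p0) = G_INDEXED_LOAD %base, %offset, 0
// where the trailing immediate selects pre- (1) or post- (0) indexing.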
bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
      Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
    return false;

  bool IsStore = Opcode == TargetOpcode::G_STORE;
  Register Addr, Base, Offset;
  bool IsPre = findPreIndexCandidate(MI, Addr, Base, Offset);
  if (!IsPre && !findPostIndexCandidate(MI, Addr, Base, Offset))
    return false;

  unsigned NewOpcode;
  switch (Opcode) {
  case TargetOpcode::G_LOAD:
    NewOpcode = TargetOpcode::G_INDEXED_LOAD;
    break;
  case TargetOpcode::G_SEXTLOAD:
    NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
    break;
  case TargetOpcode::G_ZEXTLOAD:
    NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
    break;
  case TargetOpcode::G_STORE:
    NewOpcode = TargetOpcode::G_INDEXED_STORE;
    break;
  default:
    llvm_unreachable("Unknown load/store opcode");
  }

  MachineInstr &AddrDef = *MRI.getUniqueVRegDef(Addr);
  MachineIRBuilder MIRBuilder(MI);
  auto MIB = MIRBuilder.buildInstr(NewOpcode);
  if (IsStore) {
    MIB.addDef(Addr);
    MIB.addUse(MI.getOperand(0).getReg());
  } else {
    MIB.addDef(MI.getOperand(0).getReg());
    MIB.addDef(Addr);
  }

  MIB.addUse(Base);
  MIB.addUse(Offset);
  MIB.addImm(IsPre);
  MI.eraseFromParent();
  AddrDef.eraseFromParent();

  LLVM_DEBUG(dbgs() << "    Combined to indexed operation");
  return true;
}
bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_BR)
    return false;

  // Try to match the following:
  // bb1:
  //   %c(s32) = G_ICMP pred, %a, %b
  //   %c1(s1) = G_TRUNC %c(s32)
  //   G_BRCOND %c1, %bb2
  //   G_BR %bb3
  // bb2:
  // ...
  // bb3:

  // The above pattern does not have a fall through to the successor bb2, always
  // resulting in a branch no matter which path is taken. Here we try to find
  // and replace that pattern with a conditional branch to bb3, falling through
  // to bb2 otherwise.

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator BrIt(MI);
  if (BrIt == MBB->begin())
    return false;
  assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");

  MachineInstr *BrCond = &*std::prev(BrIt);
  if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Check that the next block is the conditional branch target.
  if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
    return false;

  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
  if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
      !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
    return false;
  return true;
}

bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
  if (!matchElideBrByInvertingCond(MI))
    return false;
  applyElideBrByInvertingCond(MI);
  return true;
}

void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
  MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
  MachineBasicBlock::iterator BrIt(MI);
  MachineInstr *BrCond = &*std::prev(BrIt);
  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());

  CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
      (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());

  // Invert the G_ICMP condition.
  Observer.changingInstr(*CmpMI);
  CmpMI->getOperand(1).setPredicate(InversePred);
  Observer.changedInstr(*CmpMI);

  // Change the conditional branch target.
  Observer.changingInstr(*BrCond);
  BrCond->getOperand(1).setMBB(BrTarget);
  Observer.changedInstr(*BrCond);
  MI.eraseFromParent();
}
static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}
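
// Worked example (illustrative): for a 7 byte copy where the target reports
// s64 as its optimal type and allows fast misaligned accesses, the routine
// below typically picks MemOps = { s32, s32 }, with the second access
// overlapping the first by one byte. Without overlap it picks { s32, s16, s8 }.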
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(
    std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
    unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
    bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
    const AttributeList &FuncAttributes, const TargetLowering &TLI) {
  // If 'SrcAlign' is zero, that means the memory operation does not need to
  // load the value, i.e. memset or memcpy from constant string. Otherwise,
  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
  // is the specified alignment of the memory operation. If it is zero, that
  // means it's possible to change the alignment of the destination.
  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
  // not need to be loaded.
  if (SrcAlign != 0 && SrcAlign < DstAlign)
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
                                  ZeroMemset, MemcpyStrSrc, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
           !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
      Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  while (Size != 0) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector loads/stores for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      bool Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && AllowOverlap && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                           Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}
// Get a vectorized representation of the memset value operand, GISel edition.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    unsigned KnownVal = ValVRegAndVal->Value;
    APInt Scalar = APInt(8, KnownVal);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }
  // FIXME: for vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    return Register();

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  assert(ExtType == Ty && "Vector memset value type not supported yet");
  return Val;
}
bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
                                    Register Val, unsigned KnownLen,
                                    unsigned Align, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
          /*IsMemset=*/true,
          /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
          MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
    if (NewAlign > Align) {
      Align = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < Align)
        MFI.setObjectAlignment(FI, Align);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return false;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return false;
    }

    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return true;
}
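
// optimizeMemcpy below emits one load/store pair per type chosen by
// findGISelOptimalMemOpLowering. The loads and stores may be interleaved
// because memcpy operands are guaranteed not to overlap; memmove (further
// below) must instead emit all loads before any store.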
bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
                                    Register Src, unsigned KnownLen,
                                    unsigned DstAlign, unsigned SrcAlign,
                                    bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Alignment = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
          SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Alignment &&
             DL.exceedsNaturalStackAlignment(Align(NewAlign)))
        NewAlign /= 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores of
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr =
        CurrOffset == 0 ? Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return true;
}
bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
                                     Register Src, unsigned KnownLen,
                                     unsigned DstAlign, unsigned SrcAlign,
                                     bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  unsigned Alignment = MinAlign(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
          SrcAlign,
          /*IsMemset=*/false,
          /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
          /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
    return false;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->needsStackRealignment(MF))
      while (NewAlign > Alignment &&
             DL.exceedsNaturalStackAlignment(Align(NewAlign)))
        NewAlign /= 2;

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlignment(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the stores.
  // Apart from that, this loop is pretty much doing the same thing as the
  // memcpy codegen function.
  unsigned CurrOffset = 0;
  LLT PtrTy = MRI.getType(Src);
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return true;
}
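
// The driver below dispatches on the intrinsic ID. Copies whose length is not
// a known constant are left for the legalizer to lower to a libcall, and a
// known length of zero simply deletes the intrinsic.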
bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
  Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
  assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
          ID == Intrinsic::memset) &&
         "Expected a memcpy like intrinsic");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();
  // Don't try to optimize volatile.
  if (IsVolatile)
    return false;

  unsigned DstAlign = MemOp->getBaseAlignment();
  unsigned SrcAlign = 0;
  Register Dst = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  Register Len = MI.getOperand(3).getReg();

  if (ID != Intrinsic::memset) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlignment();
  }

  // See if this is a constant length copy.
  auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return false; // Leave it to the legalizer to lower it to a libcall.
  unsigned KnownLen = LenVRegAndVal->Value;

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return true;
  }

  if (MaxLen && KnownLen > MaxLen)
    return false;

  if (ID == Intrinsic::memcpy)
    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (ID == Intrinsic::memmove)
    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (ID == Intrinsic::memset)
    return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return false;
}
bool CombinerHelper::tryCombine(MachineInstr &MI) {
  if (tryCombineCopy(MI))
    return true;
  if (tryCombineExtendingLoads(MI))
    return true;
  if (tryCombineIndexedLoadStore(MI))
    return true;
  return false;
}
))