//===- LoadStoreOpt.cpp ----------- Generic memory optimizations -*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the LoadStoreOpt optimization pass.
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "loadstore-opt"

using namespace llvm;
using namespace ore;
using namespace MIPatternMatch;

STATISTIC(NumStoresMerged, "Number of stores merged");

const unsigned MaxStoreSizeToForm = 128;

char LoadStoreOpt::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
                      false, false)
INITIALIZE_PASS_END(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
                    false, false)
LoadStoreOpt::LoadStoreOpt(std::function<bool(const MachineFunction &)> F)
    : MachineFunctionPass(ID), DoNotRunPass(F) {}

LoadStoreOpt::LoadStoreOpt()
    : LoadStoreOpt([](const MachineFunction &) { return false; }) {}
void LoadStoreOpt::init(MachineFunction &MF) {
  this->MF = &MF;
  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  TLI = MF.getSubtarget().getTargetLowering();
  LI = MF.getSubtarget().getLegalizerInfo();
  Builder.setMF(MF);
  IsPreLegalizer = !MF.getProperties().hasProperty(
      MachineFunctionProperties::Property::Legalized);
  InstsToErase.clear();
}
void LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.setPreservesAll();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}
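
/// Decompose the address \p Ptr into a base register, index register and
/// constant offset, recognizing only a plain G_PTR_ADD of a base and an
/// offset for now.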
BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr,
                                                MachineRegisterInfo &MRI) {
  BaseIndexOffset Info;
  Register PtrAddRHS;
  Register BaseReg;
  if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(BaseReg), m_Reg(PtrAddRHS)))) {
    Info.setBase(Ptr);
    Info.setOffset(0);
    return Info;
  }
  Info.setBase(BaseReg);
  auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI);
  if (RHSCst)
    Info.setOffset(RHSCst->Value.getSExtValue());

  // Just recognize a simple case for now. In future we'll need to match
  // indexing patterns for base + index + constant.
  Info.setIndex(PtrAddRHS);
  return Info;
}
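
/// Try to determine, from the pointer arithmetic of the two memory operations
/// alone, whether \p MI1 and \p MI2 alias. Returns true if a definite answer
/// was found and stored in \p IsAlias, false if nothing could be proven.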
bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
                                               const MachineInstr &MI2,
                                               bool &IsAlias,
                                               MachineRegisterInfo &MRI) {
  auto *LdSt1 = dyn_cast<GLoadStore>(&MI1);
  auto *LdSt2 = dyn_cast<GLoadStore>(&MI2);
  if (!LdSt1 || !LdSt2)
    return false;

  BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI);
  BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI);

  if (!BasePtr0.getBase().isValid() || !BasePtr1.getBase().isValid())
    return false;

  LocationSize Size1 = LdSt1->getMemSize();
  LocationSize Size2 = LdSt2->getMemSize();

  int64_t PtrDiff;
  if (BasePtr0.getBase() == BasePtr1.getBase() && BasePtr0.hasValidOffset() &&
      BasePtr1.hasValidOffset()) {
    PtrDiff = BasePtr1.getOffset() - BasePtr0.getOffset();
    // If the size of memory access is unknown, do not use it to do analysis.
    // One example of unknown size memory access is to load/store scalable
    // vector objects on the stack.
    // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
    // following situations arise:
    if (PtrDiff >= 0 && Size1.hasValue() && !Size1.isScalable()) {
      // [----BasePtr0----]
      //                         [---BasePtr1--]
      // ========PtrDiff========>
      IsAlias = !((int64_t)Size1.getValue() <= PtrDiff);
      return true;
    }
    if (PtrDiff < 0 && Size2.hasValue() && !Size2.isScalable()) {
      //                     [----BasePtr0----]
      // [---BasePtr1--]
      // =====(-PtrDiff)====>
      IsAlias = !((PtrDiff + (int64_t)Size2.getValue()) <= 0);
      return true;
    }
    return false;
  }

  // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
  // able to calculate their relative offset if at least one arises
  // from an alloca. However, these allocas cannot overlap and we
  // can infer there is no alias.
  auto *Base0Def = getDefIgnoringCopies(BasePtr0.getBase(), MRI);
  auto *Base1Def = getDefIgnoringCopies(BasePtr1.getBase(), MRI);
  if (!Base0Def || !Base1Def)
    return false; // Couldn't tell anything.

  if (Base0Def->getOpcode() != Base1Def->getOpcode())
    return false;

  if (Base0Def->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    MachineFrameInfo &MFI = Base0Def->getMF()->getFrameInfo();
    // If the bases have the same frame index but we couldn't find a
    // constant offset, (indices are different) be conservative.
    if (Base0Def != Base1Def &&
        (!MFI.isFixedObjectIndex(Base0Def->getOperand(1).getIndex()) ||
         !MFI.isFixedObjectIndex(Base1Def->getOperand(1).getIndex()))) {
      IsAlias = false;
      return true;
    }
  }

  // This implementation is a lot more primitive than the SDAG one for now.
  // FIXME: what about constant pools?
  if (Base0Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
    auto GV0 = Base0Def->getOperand(1).getGlobal();
    auto GV1 = Base1Def->getOperand(1).getGlobal();
    if (GV0 != GV1) {
      IsAlias = false;
      return true;
    }
  }

  // Can't tell anything about aliasing.
  return false;
}
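
/// Conservatively returns true if \p MI and \p Other may access overlapping
/// memory and therefore cannot be freely reordered.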
bool GISelAddressing::instMayAlias(const MachineInstr &MI,
                                   const MachineInstr &Other,
                                   MachineRegisterInfo &MRI,
                                   AliasAnalysis *AA) {
  struct MemUseCharacteristics {
    bool IsVolatile;
    bool IsAtomic;
    Register BasePtr;
    int64_t Offset;
    LocationSize NumBytes;
    MachineMemOperand *MMO;
  };

  auto getCharacteristics =
      [&](const MachineInstr *MI) -> MemUseCharacteristics {
    if (const auto *LS = dyn_cast<GLoadStore>(MI)) {
      Register BaseReg;
      int64_t Offset = 0;
      // No pre/post-inc addressing modes are considered here, unlike in SDAG.
      if (!mi_match(LS->getPointerReg(), MRI,
                    m_GPtrAdd(m_Reg(BaseReg), m_ICst(Offset)))) {
        BaseReg = LS->getPointerReg();
        Offset = 0;
      }

      LocationSize Size = LS->getMMO().getSize();
      return {LS->isVolatile(), LS->isAtomic(), BaseReg,
              Offset /*base offset*/, Size, &LS->getMMO()};
    }
    // FIXME: support recognizing lifetime instructions.
    // Default.
    return {false /*isvolatile*/,
            /*isAtomic*/ false,
            Register(),
            (int64_t)0 /*offset*/,
            LocationSize::beforeOrAfterPointer() /*size*/,
            (MachineMemOperand *)nullptr};
  };

  MemUseCharacteristics MUC0 = getCharacteristics(&MI),
                        MUC1 = getCharacteristics(&Other);

  // If they are to the same address, then they must be aliases.
  if (MUC0.BasePtr.isValid() && MUC0.BasePtr == MUC1.BasePtr &&
      MUC0.Offset == MUC1.Offset)
    return true;

  // If they are both volatile then they cannot be reordered.
  if (MUC0.IsVolatile && MUC1.IsVolatile)
    return true;

  // Be conservative about atomics for the moment
  // TODO: This is way overconservative for unordered atomics (see D66309)
  if (MUC0.IsAtomic && MUC1.IsAtomic)
    return true;

  // If one operation reads from invariant memory, and the other may store, they
  // cannot interfere.
  if (MUC0.MMO && MUC1.MMO) {
    if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
        (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
      return false;
  }

  // If NumBytes is scalable and offset is not 0, conservatively return may
  // alias.
  if ((MUC0.NumBytes.isScalable() && MUC0.Offset != 0) ||
      (MUC1.NumBytes.isScalable() && MUC1.Offset != 0))
    return true;

  const bool BothNotScalable =
      !MUC0.NumBytes.isScalable() && !MUC1.NumBytes.isScalable();

  // Try to prove that there is aliasing, or that there is no aliasing. Either
  // way, we can return now. If nothing can be proved, proceed with more tests.
  bool IsAlias;
  if (BothNotScalable &&
      GISelAddressing::aliasIsKnownForLoadStore(MI, Other, IsAlias, MRI))
    return IsAlias;

  // The following all rely on MMO0 and MMO1 being valid.
  if (!MUC0.MMO || !MUC1.MMO)
    return true;

  // FIXME: port the alignment based alias analysis from SDAG's isAlias().
  int64_t SrcValOffset0 = MUC0.MMO->getOffset();
  int64_t SrcValOffset1 = MUC1.MMO->getOffset();
  LocationSize Size0 = MUC0.NumBytes;
  LocationSize Size1 = MUC1.NumBytes;
  if (AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0.hasValue() &&
      Size1.hasValue()) {
    // Use alias analysis information.
    int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
    int64_t Overlap0 =
        Size0.getValue().getKnownMinValue() + SrcValOffset0 - MinOffset;
    int64_t Overlap1 =
        Size1.getValue().getKnownMinValue() + SrcValOffset1 - MinOffset;
    LocationSize Loc0 =
        Size0.isScalable() ? Size0 : LocationSize::precise(Overlap0);
    LocationSize Loc1 =
        Size1.isScalable() ? Size1 : LocationSize::precise(Overlap1);

    if (AA->isNoAlias(
            MemoryLocation(MUC0.MMO->getValue(), Loc0, MUC0.MMO->getAAInfo()),
            MemoryLocation(MUC1.MMO->getValue(), Loc1, MUC1.MMO->getAAInfo())))
      return false;
  }

  // Otherwise we have to assume they alias.
  return true;
}
/// Returns true if the instruction creates an unavoidable hazard that
/// forces a boundary between store merge candidates.
static bool isInstHardMergeHazard(MachineInstr &MI) {
  return MI.hasUnmodeledSideEffects() || MI.hasOrderedMemoryRef();
}
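
/// Merge the consecutive stores in \p StoresToMerge into the widest store type
/// the target can legally handle, splitting the group into smaller segments as
/// necessary. Returns true if any merge was performed.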
bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) {
  // Try to merge all the stores in the vector, splitting into separate segments
  // as necessary.
  assert(StoresToMerge.size() > 1 && "Expected multiple stores to merge");
  LLT OrigTy = MRI->getType(StoresToMerge[0]->getValueReg());
  LLT PtrTy = MRI->getType(StoresToMerge[0]->getPointerReg());
  unsigned AS = PtrTy.getAddressSpace();
  // Ensure the legal store info is computed for this address space.
  initializeStoreMergeTargetInfo(AS);
  const auto &LegalSizes = LegalStoreSizes[AS];

#ifndef NDEBUG
  for (auto *StoreMI : StoresToMerge)
    assert(MRI->getType(StoreMI->getValueReg()) == OrigTy);
#endif

  const auto &DL = MF->getFunction().getDataLayout();
  bool AnyMerged = false;
  do {
    unsigned NumPow2 = llvm::bit_floor(StoresToMerge.size());
    unsigned MaxSizeBits = NumPow2 * OrigTy.getSizeInBits().getFixedValue();
    // Compute the biggest store we can generate to handle the number of stores.
    unsigned MergeSizeBits;
    for (MergeSizeBits = MaxSizeBits; MergeSizeBits > 1; MergeSizeBits /= 2) {
      LLT StoreTy = LLT::scalar(MergeSizeBits);
      EVT StoreEVT =
          getApproximateEVTForLLT(StoreTy, DL, MF->getFunction().getContext());
      if (LegalSizes.size() > MergeSizeBits && LegalSizes[MergeSizeBits] &&
          TLI->canMergeStoresTo(AS, StoreEVT, *MF) &&
          (TLI->isTypeLegal(StoreEVT)))
        break; // We can generate a MergeSize bits store.
    }

    if (MergeSizeBits <= OrigTy.getSizeInBits())
      return AnyMerged; // No greater merge.

    unsigned NumStoresToMerge = MergeSizeBits / OrigTy.getSizeInBits();
    // Perform the actual merging.
    SmallVector<GStore *, 8> SingleMergeStores(
        StoresToMerge.begin(), StoresToMerge.begin() + NumStoresToMerge);
    AnyMerged |= doSingleStoreMerge(SingleMergeStores);
    StoresToMerge.erase(StoresToMerge.begin(),
                        StoresToMerge.begin() + NumStoresToMerge);
  } while (StoresToMerge.size() > 1);
  return AnyMerged;
}
bool LoadStoreOpt::isLegalOrBeforeLegalizer(const LegalityQuery &Query,
                                            MachineFunction &MF) const {
  auto Action = LI->getAction(Query).Action;
  // If the instruction is unsupported, it can't be legalized at all.
  if (Action == LegalizeActions::Unsupported)
    return false;
  return IsPreLegalizer || Action == LegalizeAction::Legal;
}
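
/// Merge \p Stores, a run of adjacent same-sized stores, into one wider
/// G_CONSTANT + G_STORE built at the position of the final store in the run.
/// Only all-constant values are handled for now; otherwise nothing is done and
/// false is returned. As an illustrative sketch (assuming a little-endian
/// target), s8 stores of 0x11 to %p and 0x22 to %p + 1 become a single s16
/// store of 0x2211 to %p.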
bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
  assert(Stores.size() > 1);
  // We know that all the stores are consecutive and there are no aliasing
  // operations in the range. However, the values that are being stored may be
  // generated anywhere before each store. To ensure we have the values
  // available, we materialize the wide value and new store at the place of the
  // final store in the merge sequence.
  GStore *FirstStore = Stores[0];
  const unsigned NumStores = Stores.size();
  LLT SmallTy = MRI->getType(FirstStore->getValueReg());
  LLT WideValueTy =
      LLT::scalar(NumStores * SmallTy.getSizeInBits().getFixedValue());

  // For each store, compute pairwise merged debug locs.
  DebugLoc MergedLoc = Stores.front()->getDebugLoc();
  for (auto *Store : drop_begin(Stores))
    MergedLoc = DILocation::getMergedLocation(MergedLoc, Store->getDebugLoc());

  Builder.setInstr(*Stores.back());
  Builder.setDebugLoc(MergedLoc);

  // If all of the store values are constants, then create a wide constant
  // directly. Otherwise, we need to generate some instructions to merge the
  // existing values together into a wider type.
  SmallVector<APInt, 8> ConstantVals;
  for (auto *Store : Stores) {
    auto MaybeCst =
        getIConstantVRegValWithLookThrough(Store->getValueReg(), *MRI);
    if (!MaybeCst) {
      ConstantVals.clear();
      break;
    }
    ConstantVals.emplace_back(MaybeCst->Value);
  }

  Register WideReg;
  auto *WideMMO =
      MF->getMachineMemOperand(&FirstStore->getMMO(), 0, WideValueTy);
  if (ConstantVals.empty()) {
    // Mimic the SDAG behaviour here and don't try to do anything for unknown
    // values. In future, we should also support the cases of loads and
    // extracted vector elements.
    return false;
  }

  assert(ConstantVals.size() == NumStores);
  // Check if our wide constant is legal.
  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {WideValueTy}}, *MF))
    return false;
  APInt WideConst(WideValueTy.getSizeInBits(), 0);
  for (unsigned Idx = 0; Idx < ConstantVals.size(); ++Idx) {
    // Insert the smaller constant into the corresponding position in the
    // wider one.
    WideConst.insertBits(ConstantVals[Idx], Idx * SmallTy.getSizeInBits());
  }
  WideReg = Builder.buildConstant(WideValueTy, WideConst).getReg(0);
  auto NewStore =
      Builder.buildStore(WideReg, FirstStore->getPointerReg(), *WideMMO);
  (void) NewStore;
  LLVM_DEBUG(dbgs() << "Merged " << Stores.size()
                    << " stores into merged store: " << *NewStore);
  LLVM_DEBUG(for (auto *MI : Stores) dbgs() << "  " << *MI;);
  NumStoresMerged += Stores.size();

  MachineOptimizationRemarkEmitter MORE(*MF, nullptr);
  MORE.emit([&]() {
    MachineOptimizationRemark R(DEBUG_TYPE, "MergedStore",
                                FirstStore->getDebugLoc(),
                                FirstStore->getParent());
    R << "Merged " << NV("NumMerged", Stores.size()) << " stores of "
      << NV("OrigWidth", SmallTy.getSizeInBytes())
      << " bytes into a single store of "
      << NV("NewWidth", WideValueTy.getSizeInBytes()) << " bytes";
    return R;
  });

  for (auto *MI : Stores)
    InstsToErase.insert(MI);
  return true;
}
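
/// Run the alias checks that were deferred while candidate \p C was being
/// built, then hand any group of two or more surviving stores to
/// mergeStores(). Returns true if a merge changed the code.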
bool LoadStoreOpt::processMergeCandidate(StoreMergeCandidate &C) {
  if (C.Stores.size() < 2) {
    C.reset();
    return false;
  }

  LLVM_DEBUG(dbgs() << "Checking store merge candidate with " << C.Stores.size()
                    << " stores, starting with " << *C.Stores[0]);
  // We know that the stores in the candidate are adjacent.
  // Now we need to check if any potential aliasing instructions recorded
  // during the search alias with load/stores added to the candidate after.
  // For example, if we have the candidate:
  //   C.Stores = [ST1, ST2, ST3, ST4]
  // and after seeing ST2 we saw a load LD1, which did not alias with ST1 or
  // ST2, then we would have recorded it into the PotentialAliases structure
  // with the associated index value of "1". Then we see ST3 and ST4 and add
  // them to the candidate group. We know that LD1 does not alias with ST1 or
  // ST2, since we already did that check. However we don't yet know if it
  // may alias ST3 and ST4, so we perform those checks now.
  SmallVector<GStore *> StoresToMerge;

  auto DoesStoreAliasWithPotential = [&](unsigned Idx, GStore &CheckStore) {
    for (auto AliasInfo : reverse(C.PotentialAliases)) {
      MachineInstr *PotentialAliasOp = AliasInfo.first;
      unsigned PreCheckedIdx = AliasInfo.second;
      if (static_cast<unsigned>(Idx) < PreCheckedIdx) {
        // Once our store index is lower than the index associated with the
        // potential alias, we know that we've already checked for this alias
        // and all of the earlier potential aliases too.
        return false;
      }
      // Need to check this alias.
      if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI,
                                        AA)) {
        LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp
                          << " detected\n");
        return true;
      }
    }
    return false;
  };
  // Start from the last store in the group, and check if it aliases with any
  // of the potential aliasing operations in the list.
  for (int StoreIdx = C.Stores.size() - 1; StoreIdx >= 0; --StoreIdx) {
    auto *CheckStore = C.Stores[StoreIdx];
    if (DoesStoreAliasWithPotential(StoreIdx, *CheckStore))
      break;
    StoresToMerge.emplace_back(CheckStore);
  }

  LLVM_DEBUG(dbgs() << StoresToMerge.size()
                    << " stores remaining after alias checks. Merging...\n");

  // Now we've checked for aliasing hazards, merge any stores left.
  C.reset();
  if (StoresToMerge.size() < 2)
    return false;
  return mergeStores(StoresToMerge);
}
bool LoadStoreOpt::operationAliasesWithCandidate(MachineInstr &MI,
                                                 StoreMergeCandidate &C) {
  if (C.Stores.empty())
    return false;
  return llvm::any_of(C.Stores, [&](MachineInstr *OtherMI) {
    return instMayAlias(MI, *OtherMI, *MRI, AA);
  });
}

void LoadStoreOpt::StoreMergeCandidate::addPotentialAlias(MachineInstr &MI) {
  PotentialAliases.emplace_back(std::make_pair(&MI, Stores.size() - 1));
}
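
/// Returns true if \p StoreMI was added to candidate \p C, i.e. it either
/// starts a new candidate or is a same-sized store to the next lowest adjacent
/// address with the same base pointer.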
bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
                                       StoreMergeCandidate &C) {
  // Check if the given store writes to an adjacent address, and other
  // requirements.
  LLT ValueTy = MRI->getType(StoreMI.getValueReg());
  LLT PtrTy = MRI->getType(StoreMI.getPointerReg());

  // Only handle scalars.
  if (!ValueTy.isScalar())
    return false;

  // Don't allow truncating stores for now.
  if (StoreMI.getMemSizeInBits() != ValueTy.getSizeInBits())
    return false;

  // Avoid adding volatile or ordered stores to the candidate. We already have a
  // check for this in instMayAlias() but that only gets called later between
  // potential aliasing hazards.
  if (!StoreMI.isSimple())
    return false;

  Register StoreAddr = StoreMI.getPointerReg();
  auto BIO = getPointerInfo(StoreAddr, *MRI);
  Register StoreBase = BIO.getBase();
  if (C.Stores.empty()) {
    C.BasePtr = StoreBase;
    if (!BIO.hasValidOffset()) {
      C.CurrentLowestOffset = 0;
    } else {
      C.CurrentLowestOffset = BIO.getOffset();
    }
    // This is the first store of the candidate.
    // If the offset can't possibly allow for a lower addressed store with the
    // same base, don't bother adding it.
    if (BIO.hasValidOffset() &&
        BIO.getOffset() < static_cast<int64_t>(ValueTy.getSizeInBytes()))
      return false;
    C.Stores.emplace_back(&StoreMI);
    LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: "
                      << StoreMI);
    return true;
  }

  // Check the store is the same size as the existing ones in the candidate.
  if (MRI->getType(C.Stores[0]->getValueReg()).getSizeInBits() !=
      ValueTy.getSizeInBits())
    return false;

  if (MRI->getType(C.Stores[0]->getPointerReg()).getAddressSpace() !=
      PtrTy.getAddressSpace())
    return false;

  // There are other stores in the candidate. Check that the store address
  // writes to the next lowest adjacent address.
  if (C.BasePtr != StoreBase)
    return false;
  // If we don't have a valid offset, we can't guarantee to be an adjacent
  // offset.
  if (!BIO.hasValidOffset())
    return false;
  if ((C.CurrentLowestOffset -
       static_cast<int64_t>(ValueTy.getSizeInBytes())) != BIO.getOffset())
    return false;

  // This writes to an adjacent address. Allow it.
  C.Stores.emplace_back(&StoreMI);
  C.CurrentLowestOffset = C.CurrentLowestOffset - ValueTy.getSizeInBytes();
  LLVM_DEBUG(dbgs() << "Candidate added store: " << StoreMI);
  return true;
}
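
/// Walk \p MBB bottom-up, growing candidates of adjacently-addressed stores
/// and flushing them through processMergeCandidate() whenever a potential
/// aliasing hazard or the end of the walk is reached.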
bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) {
  bool Changed = false;
  // Walk through the block bottom-up, looking for merging candidates.
  StoreMergeCandidate Candidate;
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    if (InstsToErase.contains(&MI))
      continue;

    if (auto *StoreMI = dyn_cast<GStore>(&MI)) {
      // We have a G_STORE. Add it to the candidate if it writes to an adjacent
      // address.
      if (!addStoreToCandidate(*StoreMI, Candidate)) {
        // Store wasn't eligible to be added. May need to record it as a
        // potential alias.
        if (operationAliasesWithCandidate(*StoreMI, Candidate)) {
          Changed |= processMergeCandidate(Candidate);
          continue;
        }
        Candidate.addPotentialAlias(*StoreMI);
      }
      continue;
    }

    // If we don't have any stores yet, this instruction can't pose a problem.
    if (Candidate.Stores.empty())
      continue;

    // We're dealing with some other kind of instruction.
    if (isInstHardMergeHazard(MI)) {
      Changed |= processMergeCandidate(Candidate);
      Candidate.Stores.clear();
      continue;
    }

    if (!MI.mayLoadOrStore())
      continue;

    if (operationAliasesWithCandidate(MI, Candidate)) {
      // We have a potential alias, so process the current candidate if we can
      // and then continue looking for a new candidate.
      Changed |= processMergeCandidate(Candidate);
      continue;
    }

    // Record this instruction as a potential alias for future stores that are
    // added to the candidate.
    Candidate.addPotentialAlias(MI);
  }

  // Process any candidate left after finishing searching the entire block.
  Changed |= processMergeCandidate(Candidate);

  // Erase instructions now that we're no longer iterating over the block.
  for (auto *MI : InstsToErase)
    MI->eraseFromParent();
  InstsToErase.clear();
  return Changed;
}
/// Check if the store \p Store is a truncstore that can be merged. That is,
/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
/// Register then it does not need to match and SrcVal is set to the source
/// value found.
/// On match, returns the start byte offset of the \p SrcVal that is being
/// stored.
static std::optional<int64_t>
getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
                        MachineRegisterInfo &MRI) {
  Register TruncVal;
  if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
    return std::nullopt;

  // The shift amount must be a constant multiple of the narrow type.
  // It is translated to the offset address in the wide source value "y".
  //
  // x = G_LSHR y, ShiftAmtC
  // s8 z = G_TRUNC x
  // store z, ...
  Register FoundSrcVal;
  int64_t ShiftAmt;
  if (!mi_match(TruncVal, MRI,
                m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
                         m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
    if (!SrcVal.isValid() || TruncVal == SrcVal) {
      if (!SrcVal.isValid())
        SrcVal = TruncVal;
      return 0; // If it's the lowest index store.
    }
    return std::nullopt;
  }

  unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
  if (ShiftAmt % NarrowBits != 0)
    return std::nullopt;
  const unsigned Offset = ShiftAmt / NarrowBits;

  if (SrcVal.isValid() && FoundSrcVal != SrcVal)
    return std::nullopt;

  if (!SrcVal.isValid())
    SrcVal = FoundSrcVal;
  else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
    return std::nullopt;
  return Offset;
}
/// Match a pattern where a wide type scalar value is stored by several narrow
/// stores. Fold it into a single store or a BSWAP and a store if the target
/// supports it.
///
/// Assuming little endian target:
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 0) & 0xFF;
///  p[1] = (val >> 8) & 0xFF;
///  p[2] = (val >> 16) & 0xFF;
///  p[3] = (val >> 24) & 0xFF;
/// =>
///  *((i32)p) = val;
///
///  i8 *p = ...
///  i32 val = ...
///  p[0] = (val >> 24) & 0xFF;
///  p[1] = (val >> 16) & 0xFF;
///  p[2] = (val >> 8) & 0xFF;
///  p[3] = (val >> 0) & 0xFF;
/// =>
///  *((i32)p) = BSWAP(val);
bool LoadStoreOpt::mergeTruncStore(GStore &StoreMI,
                                   SmallPtrSetImpl<GStore *> &DeletedStores) {
  LLT MemTy = StoreMI.getMMO().getMemoryType();

  // We only handle merging simple stores of 1-4 bytes.
  if (!MemTy.isScalar())
    return false;
  switch (MemTy.getSizeInBits()) {
  case 8:
  case 16:
  case 32:
    break;
  default:
    return false;
  }
  if (!StoreMI.isSimple())
    return false;

  // We do a simple search for mergeable stores prior to this one.
  // Any potential alias hazard along the way terminates the search.
  SmallVector<GStore *> FoundStores;

  // We're looking for:
  // 1) a (store(trunc(...)))
  // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
  //    the partial value stored.
  // 3) where the offsets form either a little or big-endian sequence.

  auto &LastStore = StoreMI;

  // The single base pointer that all stores must use.
  Register BaseReg;
  int64_t LastOffset;
  if (!mi_match(LastStore.getPointerReg(), *MRI,
                m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
    BaseReg = LastStore.getPointerReg();
    LastOffset = 0;
  }

  GStore *LowestIdxStore = &LastStore;
  int64_t LowestIdxOffset = LastOffset;

  Register WideSrcVal;
  auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, *MRI);
  if (!LowestShiftAmt)
    return false; // Didn't match a trunc.
  assert(WideSrcVal.isValid());

  LLT WideStoreTy = MRI->getType(WideSrcVal);
  // The wide type might not be a multiple of the memory type, e.g. s48 and s32.
  if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0)
    return false;
  const unsigned NumStoresRequired =
      WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();

  SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
  OffsetMap[*LowestShiftAmt] = LastOffset;
  FoundStores.emplace_back(&LastStore);

  const int MaxInstsToCheck = 10;
  int NumInstsChecked = 0;
  for (auto II = ++LastStore.getReverseIterator();
       II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
       ++II) {
    NumInstsChecked++;
    GStore *NewStore;
    if ((NewStore = dyn_cast<GStore>(&*II))) {
      if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
        break;
    } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
      break;
    } else {
      continue; // This is a safe instruction we can look past.
    }

    Register NewBaseReg;
    int64_t MemOffset;
    // Check we're storing to the same base + some offset.
    if (!mi_match(NewStore->getPointerReg(), *MRI,
                  m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
      NewBaseReg = NewStore->getPointerReg();
      MemOffset = 0;
    }
    if (BaseReg != NewBaseReg)
      break;

    auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, *MRI);
    if (!ShiftByteOffset)
      break;
    if (MemOffset < LowestIdxOffset) {
      LowestIdxOffset = MemOffset;
      LowestIdxStore = NewStore;
    }

    // Map the offset in the store and the offset in the combined value, and
    // early return if it has been set before.
    if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
        OffsetMap[*ShiftByteOffset] != INT64_MAX)
      break;
    OffsetMap[*ShiftByteOffset] = MemOffset;

    FoundStores.emplace_back(NewStore);
    // Reset counter since we've found a matching inst.
    NumInstsChecked = 0;
    if (FoundStores.size() == NumStoresRequired)
      break;
  }

  if (FoundStores.size() != NumStoresRequired) {
    if (FoundStores.size() == 1)
      return false;
    // We didn't find enough stores to merge into the size of the original
    // source value, but we may be able to generate a smaller store if we
    // truncate the source value.
    WideStoreTy = LLT::scalar(FoundStores.size() * MemTy.getScalarSizeInBits());
  }

  unsigned NumStoresFound = FoundStores.size();

  const auto &DL = LastStore.getMF()->getDataLayout();
  auto &C = LastStore.getMF()->getFunction().getContext();
  // Check that a store of the wide type is both allowed and fast on the target
  unsigned Fast = 0;
  bool Allowed = TLI->allowsMemoryAccess(
      C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
  if (!Allowed || !Fast)
    return false;

  // Check if the pieces of the value are going to the expected places in memory
  // to merge the stores.
  unsigned NarrowBits = MemTy.getScalarSizeInBits();
  auto checkOffsets = [&](bool MatchLittleEndian) {
    if (MatchLittleEndian) {
      for (unsigned i = 0; i != NumStoresFound; ++i)
        if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    } else { // MatchBigEndian by reversing loop counter.
      for (unsigned i = 0, j = NumStoresFound - 1; i != NumStoresFound;
           ++i, --j)
        if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
          return false;
    }
    return true;
  };

  // Check if the offsets line up for the native data layout of this target.
  bool NeedBswap = false;
  bool NeedRotate = false;
  if (!checkOffsets(DL.isLittleEndian())) {
    // Special-case: check if byte offsets line up for the opposite endian.
    if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
      NeedBswap = true;
    else if (NumStoresFound == 2 && checkOffsets(DL.isBigEndian()))
      NeedRotate = true;
    else
      return false;
  }

  if (NeedBswap &&
      !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}, *MF))
    return false;
  if (NeedRotate &&
      !isLegalOrBeforeLegalizer(
          {TargetOpcode::G_ROTR, {WideStoreTy, WideStoreTy}}, *MF))
    return false;

  Builder.setInstrAndDebugLoc(StoreMI);

  if (WideStoreTy != MRI->getType(WideSrcVal))
    WideSrcVal = Builder.buildTrunc(WideStoreTy, WideSrcVal).getReg(0);

  if (NeedBswap) {
    WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
  } else if (NeedRotate) {
    assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
           "Unexpected type for rotate");
    auto RotAmt =
        Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
    WideSrcVal =
        Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
  }

  Builder.buildStore(WideSrcVal, LowestIdxStore->getPointerReg(),
                     LowestIdxStore->getMMO().getPointerInfo(),
                     LowestIdxStore->getMMO().getAlign());

  // Erase the old stores.
  for (auto *ST : FoundStores) {
    ST->eraseFromParent();
    DeletedStores.insert(ST);
  }
  return true;
}
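
/// Try mergeTruncStore() on every store in \p BB, skipping stores that an
/// earlier successful merge has already deleted.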
bool LoadStoreOpt::mergeTruncStoresBlock(MachineBasicBlock &BB) {
  bool Changed = false;
  SmallVector<GStore *, 16> Stores;
  SmallPtrSet<GStore *, 8> DeletedStores;
  // Walk up the block so we can see the most eligible stores.
  for (MachineInstr &MI : llvm::reverse(BB))
    if (auto *StoreMI = dyn_cast<GStore>(&MI))
      Stores.emplace_back(StoreMI);

  for (auto *StoreMI : Stores) {
    if (DeletedStores.count(StoreMI))
      continue;
    if (mergeTruncStore(*StoreMI, DeletedStores))
      Changed = true;
  }
  return Changed;
}
bool LoadStoreOpt::mergeFunctionStores(MachineFunction &MF) {
  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= mergeBlockStores(BB);
    Changed |= mergeTruncStoresBlock(BB);
  }

  // Erase all dead instructions left over by the merging.
  if (Changed) {
    for (auto &BB : MF) {
      for (auto &I : make_early_inc_range(make_range(BB.rbegin(), BB.rend()))) {
        if (isTriviallyDead(I, *MRI))
          I.eraseFromParent();
      }
    }
  }

  return Changed;
}
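
/// Populate LegalStoreSizes for \p AddrSpace with the scalar store sizes the
/// target can legalize, so that mergeStores() does not form stores that would
/// immediately be split again.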
void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) {
  // Query the legalizer info to record what store types are legal.
  // We record this because we don't want to bother trying to merge stores into
  // illegal ones, which would just result in being split again.

  if (LegalStoreSizes.count(AddrSpace)) {
    assert(LegalStoreSizes[AddrSpace].any());
    return; // Already cached sizes for this address space.
  }

  // Need to reserve at least MaxStoreSizeToForm + 1 bits.
  BitVector LegalSizes(MaxStoreSizeToForm * 2);
  const auto &LI = *MF->getSubtarget().getLegalizerInfo();
  const auto &DL = MF->getFunction().getDataLayout();
  Type *IRPtrTy = PointerType::get(MF->getFunction().getContext(), AddrSpace);
  LLT PtrTy = getLLTForType(*IRPtrTy, DL);
  // We assume that we're not going to be generating any stores wider than
  // MaxStoreSizeToForm bits for now.
  for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) {
    LLT Ty = LLT::scalar(Size);
    SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
        {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}});
    SmallVector<LLT> StoreTys({Ty, PtrTy});
    LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs);
    LegalizeActionStep ActionStep = LI.getAction(Q);
    if (ActionStep.Action == LegalizeActions::Legal)
      LegalSizes.set(Size);
  }
  assert(LegalSizes.any() && "Expected some store sizes to be legal!");
  LegalStoreSizes[AddrSpace] = LegalSizes;
}
bool LoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
  // If the ISel pipeline failed, do not bother running that pass.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;

  LLVM_DEBUG(dbgs() << "Begin memory optimizations for: " << MF.getName()
                    << '\n');

  init(MF);
  bool Changed = false;
  Changed |= mergeFunctionStores(MF);

  LegalStoreSizes.clear();
  return Changed;
}