//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
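///
/// For example (illustrative IR only, not taken from a test): an aggregate
/// alloca such as
///   %pair = alloca { i32, i32 }
/// whose two fields are only ever stored to and reloaded individually can be
/// split into two scalar values and promoted, removing the memory traffic
/// entirely.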
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#define DEBUG_TYPE "sroa"

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumLoadsPredicated,
          "Number of loads rewritten into predicated loads to allow promotion");
115 "Number of stores rewritten into predicated loads to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
                                        cl::Hidden);

/// Disable running mem2reg during SROA in order to test or debug SROA.
static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
                                     cl::Hidden);
namespace {

class AllocaSliceRewriter;
class AllocaSlices;
class Partition;

class SelectHandSpeculativity {
  unsigned char Storage = 0; // None are speculatable by default.
  using TrueVal = Bitfield::Element<bool, 0, 1>;  // Low 0'th bit.
  using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.

public:
  SelectHandSpeculativity() = default;
  SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
  bool isSpeculatable(bool isTrueVal) const;
  bool areAllSpeculatable() const;
  bool areAnySpeculatable() const;
  bool areNoneSpeculatable() const;

  // For interop as int half of PointerIntPair.
  explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
  explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
};
static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));

using PossiblySpeculatableLoad =
    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
using UnspeculatableStore = StoreInst *;
using RewriteableMemOp =
    std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
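///
/// For instance (illustrative only): a 4-byte memset over an i32 slice can be
/// rewritten as a scalar `store i32 0`, and two adjacent float slices that are
/// only accessed element-wise can be rebuilt as insertelement/extractelement
/// operations on a <2 x float> value, which can then be promoted.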
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
  /// already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  std::vector<AllocaInst *> PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the result,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If an intervening bitcast with a single use of the load is found, the
  /// promotion is still allowed.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};

} // end anonymous namespace
/// Calculate the fragment of a variable to use when slicing a store
/// based on the slice dimensions, existing fragment, and base storage
/// fragment.
/// Results:
/// UseFrag - Use Target as the new fragment.
/// UseNoFrag - The new slice already covers the whole variable.
/// Skip - The new alloca slice doesn't include this variable.
/// FIXME: Can we use calculateFragmentIntersect instead?
enum FragCalcResult { UseFrag, UseNoFrag, Skip };
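// For example (illustrative only): for a 64-bit variable with no pre-existing
// fragment, a slice covering bits [0, 64) yields UseNoFrag because the slice
// covers the whole variable, while a slice covering bits [32, 64) yields
// UseFrag with Target = {SizeInBits: 32, OffsetInBits: 32}.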
static FragCalcResult
calculateFragment(DILocalVariable *Variable,
                  uint64_t NewStorageSliceOffsetInBits,
                  uint64_t NewStorageSliceSizeInBits,
                  std::optional<DIExpression::FragmentInfo> StorageFragment,
                  std::optional<DIExpression::FragmentInfo> CurrentFragment,
                  DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable apply the offset and
  // the size constraint.
  if (StorageFragment) {
    Target.SizeInBits =
        std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
    Target.OffsetInBits =
        NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
  } else {
    Target.SizeInBits = NewStorageSliceSizeInBits;
    Target.OffsetInBits = NewStorageSliceOffsetInBits;
  }

  // If this slice extracts the entirety of an independent variable from a
  // larger alloca, do not produce a fragment expression, as the variable is
  // not fragmented.
  if (!CurrentFragment) {
    if (auto Size = Variable->getSizeInBits()) {
      // Treat the current fragment as covering the whole variable.
      CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
      if (Target == CurrentFragment)
        return UseNoFrag;
    }
  }

  // No additional work to do if there isn't a fragment already, or there is
  // but it already exactly describes the new assignment.
  if (!CurrentFragment || *CurrentFragment == Target)
    return UseFrag;

  // Reject the target fragment if it doesn't fit wholly within the current
  // fragment. TODO: We could instead chop up the target to fit in the case of
  // a partial overlap.
  if (Target.startInBits() < CurrentFragment->startInBits() ||
      Target.endInBits() > CurrentFragment->endInBits())
    return Skip;

  // Target fits within the current fragment, return it.
  return UseFrag;
}
static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
  return DebugVariable(DVI->getVariable(), std::nullopt,
                       DVI->getDebugLoc().getInlinedAt());
}
static DebugVariable getAggregateVariable(DPValue *DPV) {
  return DebugVariable(DPV->getVariable(), std::nullopt,
                       DPV->getDebugLoc().getInlinedAt());
}
static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB,
                                   Instruction *LinkedInstr, Value *NewValue,
                                   DILocalVariable *Variable,
                                   DIExpression *Expression, Value *Address,
                                   DIExpression *AddressExpression,
                                   const DILocation *DI) {
  (void)DIB;
  return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable,
                                        Expression, Address, AddressExpression,
                                        DI);
}
static DbgAssignIntrinsic *createLinkedAssign(
    DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr,
    Value *NewValue, DILocalVariable *Variable, DIExpression *Expression,
    Value *Address, DIExpression *AddressExpression, const DILocation *DI) {
  return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression,
                             Address, AddressExpression, DI);
}
/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca             Alloca for the variable before splitting.
/// \param IsSplit               True if the store (not necessarily alloca)
///                              is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits       New number of bits being written to.
/// \param OldInst               Instruction that is being split.
/// \param Inst                  New instruction performing this part of the
///                              original store.
/// \param Dest                  Store destination.
/// \param Value                 Stored value.
/// \param DL                    Datalayout.
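///
/// For example (illustrative only): when a 64-bit store that carries a
/// dbg.assign for the whole variable is split into two 32-bit stores, each new
/// store receives its own dbg.assign whose expression carries a
/// DW_OP_LLVM_fragment for the 32-bit half it writes, linked to the store via
/// fresh DIAssignID metadata.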
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
365 auto MarkerRange
= at::getAssignmentMarkers(OldInst
);
366 auto DPVAssignMarkerRange
= at::getDPVAssignmentMarkers(OldInst
);
367 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
368 if (MarkerRange
.empty() && DPVAssignMarkerRange
.empty())
371 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
372 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca
<< "\n");
373 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit
<< "\n");
374 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
376 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits
<< "\n");
377 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst
<< "\n");
378 LLVM_DEBUG(dbgs() << " Inst: " << *Inst
<< "\n");
379 LLVM_DEBUG(dbgs() << " Dest: " << *Dest
<< "\n");
381 LLVM_DEBUG(dbgs() << " Value: " << *Value
<< "\n");
383 /// Map of aggregate variables to their fragment associated with OldAlloca.
384 DenseMap
<DebugVariable
, std::optional
<DIExpression::FragmentInfo
>>
386 for (auto *DAI
: at::getAssignmentMarkers(OldAlloca
))
387 BaseFragments
[getAggregateVariable(DAI
)] =
388 DAI
->getExpression()->getFragmentInfo();
389 for (auto *DPV
: at::getDPVAssignmentMarkers(OldAlloca
))
390 BaseFragments
[getAggregateVariable(DPV
)] =
391 DPV
->getExpression()->getFragmentInfo();
393 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
394 // one). It shouldn't already have one: assert this assumption.
395 assert(!Inst
->getMetadata(LLVMContext::MD_DIAssignID
));
396 DIAssignID
*NewID
= nullptr;
397 auto &Ctx
= Inst
->getContext();
398 DIBuilder
DIB(*OldInst
->getModule(), /*AllowUnresolved*/ false);
399 assert(OldAlloca
->isStaticAlloca());
401 auto MigrateDbgAssign
= [&](auto DbgAssign
) {
402 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
404 auto *Expr
= DbgAssign
->getExpression();
405 bool SetKillLocation
= false;
408 std::optional
<DIExpression::FragmentInfo
> BaseFragment
;
410 auto R
= BaseFragments
.find(getAggregateVariable(DbgAssign
));
411 if (R
== BaseFragments
.end())
413 BaseFragment
= R
->second
;
415 std::optional
<DIExpression::FragmentInfo
> CurrentFragment
=
416 Expr
->getFragmentInfo();
417 DIExpression::FragmentInfo NewFragment
;
418 FragCalcResult Result
= calculateFragment(
419 DbgAssign
->getVariable(), OldAllocaOffsetInBits
, SliceSizeInBits
,
420 BaseFragment
, CurrentFragment
, NewFragment
);
424 if (Result
== UseFrag
&& !(NewFragment
== CurrentFragment
)) {
425 if (CurrentFragment
) {
426 // Rewrite NewFragment to be relative to the existing one (this is
427 // what createFragmentExpression wants). CalculateFragment has
428 // already resolved the size for us. FIXME: Should it return the
429 // relative fragment too?
430 NewFragment
.OffsetInBits
-= CurrentFragment
->OffsetInBits
;
432 // Add the new fragment info to the existing expression if possible.
433 if (auto E
= DIExpression::createFragmentExpression(
434 Expr
, NewFragment
.OffsetInBits
, NewFragment
.SizeInBits
)) {
437 // Otherwise, add the new fragment info to an empty expression and
438 // discard the value component of this dbg.assign as the value cannot
439 // be computed with the new fragment.
440 Expr
= *DIExpression::createFragmentExpression(
441 DIExpression::get(Expr
->getContext(), std::nullopt
),
442 NewFragment
.OffsetInBits
, NewFragment
.SizeInBits
);
443 SetKillLocation
= true;
448 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
450 NewID
= DIAssignID::getDistinct(Ctx
);
451 Inst
->setMetadata(LLVMContext::MD_DIAssignID
, NewID
);
454 ::Value
*NewValue
= Value
? Value
: DbgAssign
->getValue();
455 auto *NewAssign
= createLinkedAssign(
456 DbgAssign
, DIB
, Inst
, NewValue
, DbgAssign
->getVariable(), Expr
, Dest
,
457 DIExpression::get(Expr
->getContext(), std::nullopt
),
458 DbgAssign
->getDebugLoc());
460 // If we've updated the value but the original dbg.assign has an arglist
461 // then kill it now - we can't use the requested new value.
462 // We can't replace the DIArgList with the new value as it'd leave
463 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
464 // an arglist). And we can't keep the DIArgList in case the linked store
465 // is being split - in which case the DIArgList + expression may no longer
466 // be computing the correct value.
467 // This should be a very rare situation as it requires the value being
468 // stored to differ from the dbg.assign (i.e., the value has been
469 // represented differently in the debug intrinsic for some reason).
471 Value
&& (DbgAssign
->hasArgList() ||
472 !DbgAssign
->getExpression()->isSingleLocationExpression());
474 NewAssign
->setKillLocation();
    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // This (current behaviour) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
489 NewAssign
->moveBefore(DbgAssign
);
491 NewAssign
->setDebugLoc(DbgAssign
->getDebugLoc());
492 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign
<< "\n");
495 for_each(MarkerRange
, MigrateDbgAssign
);
496 for_each(DPVAssignMarkerRange
, MigrateDbgAssign
);
501 /// A custom IRBuilder inserter which prefixes all names, but only in
503 class IRBuilderPrefixedInserter final
: public IRBuilderDefaultInserter
{
506 Twine
getNameWithPrefix(const Twine
&Name
) const {
507 return Name
.isTriviallyEmpty() ? Name
: Prefix
+ Name
;
511 void SetNamePrefix(const Twine
&P
) { Prefix
= P
.str(); }
513 void InsertHelper(Instruction
*I
, const Twine
&Name
, BasicBlock
*BB
,
514 BasicBlock::iterator InsertPt
) const override
{
515 IRBuilderDefaultInserter::InsertHelper(I
, getNameWithPrefix(Name
), BB
,
520 /// Provide a type for IRBuilder that drops names in release builds.
521 using IRBuilderTy
= IRBuilder
<ConstantFolder
, IRBuilderPrefixedInserter
>;
523 /// A used slice of an alloca.
525 /// This structure represents a slice of an alloca used by some instruction. It
526 /// stores both the begin and end offsets of this use, a pointer to the use
527 /// itself, and a flag indicating whether we can classify the use as splittable
528 /// or not when forming partitions of the alloca.
530 /// The beginning offset of the range.
531 uint64_t BeginOffset
= 0;
533 /// The ending offset, not included in the range.
534 uint64_t EndOffset
= 0;
536 /// Storage for both the use of this slice and whether it can be
538 PointerIntPair
<Use
*, 1, bool> UseAndIsSplittable
;
543 Slice(uint64_t BeginOffset
, uint64_t EndOffset
, Use
*U
, bool IsSplittable
)
544 : BeginOffset(BeginOffset
), EndOffset(EndOffset
),
545 UseAndIsSplittable(U
, IsSplittable
) {}
547 uint64_t beginOffset() const { return BeginOffset
; }
548 uint64_t endOffset() const { return EndOffset
; }
550 bool isSplittable() const { return UseAndIsSplittable
.getInt(); }
551 void makeUnsplittable() { UseAndIsSplittable
.setInt(false); }
553 Use
*getUse() const { return UseAndIsSplittable
.getPointer(); }
555 bool isDead() const { return getUse() == nullptr; }
556 void kill() { UseAndIsSplittable
.setPointer(nullptr); }
558 /// Support for ordering ranges.
560 /// This provides an ordering over ranges such that start offsets are
561 /// always increasing, and within equal start offsets, the end offsets are
562 /// decreasing. Thus the spanning range comes first in a cluster with the
563 /// same start position.
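  ///
  /// For example (illustrative only, assuming equal splittability): the slices
  /// [0, 16), [0, 8), and [4, 12) sort as [0, 16), [0, 8), [4, 12) - begin
  /// offsets ascend, and for the shared begin offset the wider slice comes
  /// first.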
564 bool operator<(const Slice
&RHS
) const {
565 if (beginOffset() < RHS
.beginOffset())
567 if (beginOffset() > RHS
.beginOffset())
569 if (isSplittable() != RHS
.isSplittable())
570 return !isSplittable();
571 if (endOffset() > RHS
.endOffset())
576 /// Support comparison with a single offset to allow binary searches.
577 friend LLVM_ATTRIBUTE_UNUSED
bool operator<(const Slice
&LHS
,
578 uint64_t RHSOffset
) {
579 return LHS
.beginOffset() < RHSOffset
;
581 friend LLVM_ATTRIBUTE_UNUSED
bool operator<(uint64_t LHSOffset
,
583 return LHSOffset
< RHS
.beginOffset();
586 bool operator==(const Slice
&RHS
) const {
587 return isSplittable() == RHS
.isSplittable() &&
588 beginOffset() == RHS
.beginOffset() && endOffset() == RHS
.endOffset();
590 bool operator!=(const Slice
&RHS
) const { return !operator==(RHS
); }
593 /// Representation of the alloca slices.
595 /// This class represents the slices of an alloca which are formed by its
596 /// various uses. If a pointer escapes, we can't fully build a representation
597 /// for the slices used and we reflect that in this structure. The uses are
598 /// stored, sorted by increasing beginning offset and with unsplittable slices
599 /// starting at a particular offset before splittable slices.
602 /// Construct the slices of a particular alloca.
603 AllocaSlices(const DataLayout
&DL
, AllocaInst
&AI
);
605 /// Test whether a pointer to the allocation escapes our analysis.
607 /// If this is true, the slices are never fully built and should be
609 bool isEscaped() const { return PointerEscapingInstr
; }
611 /// Support for iterating over the slices.
613 using iterator
= SmallVectorImpl
<Slice
>::iterator
;
614 using range
= iterator_range
<iterator
>;
616 iterator
begin() { return Slices
.begin(); }
617 iterator
end() { return Slices
.end(); }
619 using const_iterator
= SmallVectorImpl
<Slice
>::const_iterator
;
620 using const_range
= iterator_range
<const_iterator
>;
622 const_iterator
begin() const { return Slices
.begin(); }
623 const_iterator
end() const { return Slices
.end(); }
626 /// Erase a range of slices.
627 void erase(iterator Start
, iterator Stop
) { Slices
.erase(Start
, Stop
); }
629 /// Insert new slices for this alloca.
631 /// This moves the slices into the alloca's slices collection, and re-sorts
632 /// everything so that the usual ordering properties of the alloca's slices
634 void insert(ArrayRef
<Slice
> NewSlices
) {
635 int OldSize
= Slices
.size();
636 Slices
.append(NewSlices
.begin(), NewSlices
.end());
637 auto SliceI
= Slices
.begin() + OldSize
;
638 llvm::sort(SliceI
, Slices
.end());
639 std::inplace_merge(Slices
.begin(), SliceI
, Slices
.end());
642 // Forward declare the iterator and range accessor for walking the
644 class partition_iterator
;
645 iterator_range
<partition_iterator
> partitions();
647 /// Access the dead users for this alloca.
648 ArrayRef
<Instruction
*> getDeadUsers() const { return DeadUsers
; }
650 /// Access Uses that should be dropped if the alloca is promotable.
651 ArrayRef
<Use
*> getDeadUsesIfPromotable() const {
652 return DeadUseIfPromotable
;
655 /// Access the dead operands referring to this alloca.
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
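  ///
  /// For example (illustrative IR only): in
  ///   %p = phi ptr [ %oob.gep, %bb1 ], [ %other, %bb2 ]
  /// an incoming value that is a GEP past the end of the alloca is such a dead
  /// operand; only that incoming use is replaced, not the whole PHI.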
661 ArrayRef
<Use
*> getDeadOperands() const { return DeadOperands
; }
663 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
664 void print(raw_ostream
&OS
, const_iterator I
, StringRef Indent
= " ") const;
665 void printSlice(raw_ostream
&OS
, const_iterator I
,
666 StringRef Indent
= " ") const;
667 void printUse(raw_ostream
&OS
, const_iterator I
,
668 StringRef Indent
= " ") const;
669 void print(raw_ostream
&OS
) const;
670 void dump(const_iterator I
) const;
675 template <typename DerivedT
, typename RetT
= void> class BuilderBase
;
678 friend class AllocaSlices::SliceBuilder
;
680 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
681 /// Handle to alloca instruction to simplify method interfaces.
685 /// The instruction responsible for this alloca not having a known set
688 /// When an instruction (potentially) escapes the pointer to the alloca, we
689 /// store a pointer to that here and abort trying to form slices of the
690 /// alloca. This will be null if the alloca slices are analyzed successfully.
691 Instruction
*PointerEscapingInstr
;
693 /// The slices of the alloca.
695 /// We store a vector of the slices formed by uses of the alloca here. This
696 /// vector is sorted by increasing begin offset, and then the unsplittable
697 /// slices before the splittable ones. See the Slice inner class for more
699 SmallVector
<Slice
, 8> Slices
;
701 /// Instructions which will become dead if we rewrite the alloca.
703 /// Note that these are not separated by slice. This is because we expect an
704 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
705 /// all these instructions can simply be removed and replaced with poison as
706 /// they come from outside of the allocated space.
707 SmallVector
<Instruction
*, 8> DeadUsers
;
709 /// Uses which will become dead if can promote the alloca.
710 SmallVector
<Use
*, 8> DeadUseIfPromotable
;
712 /// Operands which will become dead if we rewrite the alloca.
  /// These are operands that in their particular use can be replaced with
  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
  /// want to swap this particular input for poison to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;
723 /// A partition of the slices.
725 /// An ephemeral representation for a range of slices which can be viewed as
726 /// a partition of the alloca. This range represents a span of the alloca's
727 /// memory which cannot be split, and provides access to all of the slices
728 /// overlapping some part of the partition.
730 /// Objects of this type are produced by traversing the alloca's slices, but
731 /// are only ephemeral and not persistent.
734 friend class AllocaSlices
;
735 friend class AllocaSlices::partition_iterator
;
737 using iterator
= AllocaSlices::iterator
;
739 /// The beginning and ending offsets of the alloca for this
741 uint64_t BeginOffset
= 0, EndOffset
= 0;
743 /// The start and end iterators of this partition.
746 /// A collection of split slice tails overlapping the partition.
747 SmallVector
<Slice
*, 4> SplitTails
;
749 /// Raw constructor builds an empty partition starting and ending at
750 /// the given iterator.
751 Partition(iterator SI
) : SI(SI
), SJ(SI
) {}
754 /// The start offset of this partition.
756 /// All of the contained slices start at or after this offset.
757 uint64_t beginOffset() const { return BeginOffset
; }
759 /// The end offset of this partition.
761 /// All of the contained slices end at or before this offset.
762 uint64_t endOffset() const { return EndOffset
; }
764 /// The size of the partition.
766 /// Note that this can never be zero.
767 uint64_t size() const {
768 assert(BeginOffset
< EndOffset
&& "Partitions must span some bytes!");
769 return EndOffset
- BeginOffset
;
772 /// Test whether this partition contains no slices, and merely spans
773 /// a region occupied by split slices.
774 bool empty() const { return SI
== SJ
; }
776 /// \name Iterate slices that start within the partition.
777 /// These may be splittable or unsplittable. They have a begin offset >= the
778 /// partition begin offset.
780 // FIXME: We should probably define a "concat_iterator" helper and use that
781 // to stitch together pointee_iterators over the split tails and the
782 // contiguous iterators of the partition. That would give a much nicer
783 // interface here. We could then additionally expose filtered iterators for
784 // split, unsplit, and unsplittable splices based on the usage patterns.
785 iterator
begin() const { return SI
; }
786 iterator
end() const { return SJ
; }
789 /// Get the sequence of split slice tails.
791 /// These tails are of slices which start before this partition but are
792 /// split and overlap into the partition. We accumulate these while forming
794 ArrayRef
<Slice
*> splitSliceTails() const { return SplitTails
; }
797 } // end anonymous namespace
799 /// An iterator over partitions of the alloca's slices.
801 /// This iterator implements the core algorithm for partitioning the alloca's
802 /// slices. It is a forward iterator as we don't support backtracking for
803 /// efficiency reasons, and re-use a single storage area to maintain the
804 /// current set of split slices.
806 /// It is templated on the slice iterator type to use so that it can operate
807 /// with either const or non-const slice iterators.
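///
/// For example (illustrative only): given a splittable slice [0, 8) and an
/// unsplittable slice [4, 12), the iterator produces the partition [0, 4)
/// containing the splittable slice, followed by the partition [4, 12) forced
/// by the unsplittable slice, with the tail of the splittable slice recorded
/// as a split slice tail of the second partition.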
808 class AllocaSlices::partition_iterator
809 : public iterator_facade_base
<partition_iterator
, std::forward_iterator_tag
,
811 friend class AllocaSlices
;
813 /// Most of the state for walking the partitions is held in a class
814 /// with a nice interface for examining them.
817 /// We need to keep the end of the slices to know when to stop.
818 AllocaSlices::iterator SE
;
820 /// We also need to keep track of the maximum split end offset seen.
821 /// FIXME: Do we really?
822 uint64_t MaxSplitSliceEndOffset
= 0;
824 /// Sets the partition to be empty at given iterator, and sets the
826 partition_iterator(AllocaSlices::iterator SI
, AllocaSlices::iterator SE
)
828 // If not already at the end, advance our state to form the initial
834 /// Advance the iterator to the next partition.
836 /// Requires that the iterator not be at the end of the slices.
838 assert((P
.SI
!= SE
|| !P
.SplitTails
.empty()) &&
839 "Cannot advance past the end of the slices!");
841 // Clear out any split uses which have ended.
842 if (!P
.SplitTails
.empty()) {
843 if (P
.EndOffset
>= MaxSplitSliceEndOffset
) {
844 // If we've finished all splits, this is easy.
845 P
.SplitTails
.clear();
846 MaxSplitSliceEndOffset
= 0;
848 // Remove the uses which have ended in the prior partition. This
849 // cannot change the max split slice end because we just checked that
850 // the prior partition ended prior to that max.
851 llvm::erase_if(P
.SplitTails
,
852 [&](Slice
*S
) { return S
->endOffset() <= P
.EndOffset
; });
853 assert(llvm::any_of(P
.SplitTails
,
855 return S
->endOffset() == MaxSplitSliceEndOffset
;
857 "Could not find the current max split slice offset!");
858 assert(llvm::all_of(P
.SplitTails
,
860 return S
->endOffset() <= MaxSplitSliceEndOffset
;
862 "Max split slice end offset is not actually the max!");
866 // If P.SI is already at the end, then we've cleared the split tail and
867 // now have an end iterator.
869 assert(P
.SplitTails
.empty() && "Failed to clear the split slices!");
873 // If we had a non-empty partition previously, set up the state for
874 // subsequent partitions.
876 // Accumulate all the splittable slices which started in the old
877 // partition into the split list.
879 if (S
.isSplittable() && S
.endOffset() > P
.EndOffset
) {
880 P
.SplitTails
.push_back(&S
);
881 MaxSplitSliceEndOffset
=
882 std::max(S
.endOffset(), MaxSplitSliceEndOffset
);
885 // Start from the end of the previous partition.
888 // If P.SI is now at the end, we at most have a tail of split slices.
890 P
.BeginOffset
= P
.EndOffset
;
891 P
.EndOffset
= MaxSplitSliceEndOffset
;
      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
898 if (!P
.SplitTails
.empty() && P
.SI
->beginOffset() != P
.EndOffset
&&
899 !P
.SI
->isSplittable()) {
900 P
.BeginOffset
= P
.EndOffset
;
901 P
.EndOffset
= P
.SI
->beginOffset();
906 // OK, we need to consume new slices. Set the end offset based on the
907 // current slice, and step SJ past it. The beginning offset of the
908 // partition is the beginning offset of the next slice unless we have
909 // pre-existing split slices that are continuing, in which case we begin
910 // at the prior end offset.
911 P
.BeginOffset
= P
.SplitTails
.empty() ? P
.SI
->beginOffset() : P
.EndOffset
;
912 P
.EndOffset
= P
.SI
->endOffset();
915 // There are two strategies to form a partition based on whether the
916 // partition starts with an unsplittable slice or a splittable slice.
917 if (!P
.SI
->isSplittable()) {
918 // When we're forming an unsplittable region, it must always start at
919 // the first slice and will extend through its end.
920 assert(P
.BeginOffset
== P
.SI
->beginOffset());
922 // Form a partition including all of the overlapping slices with this
923 // unsplittable slice.
924 while (P
.SJ
!= SE
&& P
.SJ
->beginOffset() < P
.EndOffset
) {
925 if (!P
.SJ
->isSplittable())
926 P
.EndOffset
= std::max(P
.EndOffset
, P
.SJ
->endOffset());
930 // We have a partition across a set of overlapping unsplittable
935 // If we're starting with a splittable slice, then we need to form
936 // a synthetic partition spanning it and any other overlapping splittable
938 assert(P
.SI
->isSplittable() && "Forming a splittable partition!");
940 // Collect all of the overlapping splittable slices.
941 while (P
.SJ
!= SE
&& P
.SJ
->beginOffset() < P
.EndOffset
&&
942 P
.SJ
->isSplittable()) {
943 P
.EndOffset
= std::max(P
.EndOffset
, P
.SJ
->endOffset());
      // Back up P.EndOffset if we ended the span early when encountering an
      // unsplittable slice. This synthesizes the early end offset of
      // a partition spanning only splittable slices.
950 if (P
.SJ
!= SE
&& P
.SJ
->beginOffset() < P
.EndOffset
) {
951 assert(!P
.SJ
->isSplittable());
952 P
.EndOffset
= P
.SJ
->beginOffset();
957 bool operator==(const partition_iterator
&RHS
) const {
958 assert(SE
== RHS
.SE
&&
959 "End iterators don't match between compared partition iterators!");
    // The observed positions of partitions are marked by the P.SI iterator and
    // the emptiness of the split slices. The latter is only relevant when
    // P.SI == SE, as the end iterator will additionally have an empty split
    // slices list, but the prior may have the same P.SI and a tail of split
    // slices.
966 if (P
.SI
== RHS
.P
.SI
&& P
.SplitTails
.empty() == RHS
.P
.SplitTails
.empty()) {
967 assert(P
.SJ
== RHS
.P
.SJ
&&
968 "Same set of slices formed two different sized partitions!");
969 assert(P
.SplitTails
.size() == RHS
.P
.SplitTails
.size() &&
970 "Same slice position with differently sized non-empty split "
977 partition_iterator
&operator++() {
982 Partition
&operator*() { return P
; }
985 /// A forward range over the partitions of the alloca's slices.
987 /// This accesses an iterator range over the partitions of the alloca's
988 /// slices. It computes these partitions on the fly based on the overlapping
989 /// offsets of the slices and the ability to split them. It will visit "empty"
990 /// partitions to cover regions of the alloca only accessed via split
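///
/// A typical use is simply (sketch, not a verbatim excerpt):
///   for (Partition &P : AS.partitions())
///     ... inspect P.beginOffset(), P.endOffset(), and the slices in P ...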
992 iterator_range
<AllocaSlices::partition_iterator
> AllocaSlices::partitions() {
993 return make_range(partition_iterator(begin(), end()),
994 partition_iterator(end(), end()));
997 static Value
*foldSelectInst(SelectInst
&SI
) {
998 // If the condition being selected on is a constant or the same value is
999 // being selected between, fold the select. Yes this does (rarely) happen
1001 if (ConstantInt
*CI
= dyn_cast
<ConstantInt
>(SI
.getCondition()))
1002 return SI
.getOperand(1 + CI
->isZero());
1003 if (SI
.getOperand(1) == SI
.getOperand(2))
1004 return SI
.getOperand(1);
1009 /// A helper that folds a PHI node or a select.
1010 static Value
*foldPHINodeOrSelectInst(Instruction
&I
) {
1011 if (PHINode
*PN
= dyn_cast
<PHINode
>(&I
)) {
1012 // If PN merges together the same value, return that value.
1013 return PN
->hasConstantValue();
1015 return foldSelectInst(cast
<SelectInst
>(I
));
1018 /// Builder for the alloca slices.
1020 /// This class builds a set of alloca slices by recursively visiting the uses
1021 /// of an alloca and making a slice for each load and store at each offset.
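///
/// For example (illustrative IR only): for
///   %a = alloca i64
///   store i32 0, ptr %a
///   %gep = getelementptr i8, ptr %a, i64 4
///   %v = load i16, ptr %gep
/// the builder records a slice [0, 4) for the store and a slice [4, 6) for the
/// load.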
1022 class AllocaSlices::SliceBuilder
: public PtrUseVisitor
<SliceBuilder
> {
1023 friend class PtrUseVisitor
<SliceBuilder
>;
1024 friend class InstVisitor
<SliceBuilder
>;
1026 using Base
= PtrUseVisitor
<SliceBuilder
>;
1028 const uint64_t AllocSize
;
1031 SmallDenseMap
<Instruction
*, unsigned> MemTransferSliceMap
;
1032 SmallDenseMap
<Instruction
*, uint64_t> PHIOrSelectSizes
;
1034 /// Set to de-duplicate dead instructions found in the use walk.
1035 SmallPtrSet
<Instruction
*, 4> VisitedDeadInsts
;
1038 SliceBuilder(const DataLayout
&DL
, AllocaInst
&AI
, AllocaSlices
&AS
)
1039 : PtrUseVisitor
<SliceBuilder
>(DL
),
1040 AllocSize(DL
.getTypeAllocSize(AI
.getAllocatedType()).getFixedValue()),
1044 void markAsDead(Instruction
&I
) {
1045 if (VisitedDeadInsts
.insert(&I
).second
)
1046 AS
.DeadUsers
.push_back(&I
);
1049 void insertUse(Instruction
&I
, const APInt
&Offset
, uint64_t Size
,
1050 bool IsSplittable
= false) {
1051 // Completely skip uses which have a zero size or start either before or
1052 // past the end of the allocation.
1053 if (Size
== 0 || Offset
.uge(AllocSize
)) {
1054 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size
<< " byte use @"
1056 << " which has zero size or starts outside of the "
1057 << AllocSize
<< " byte alloca:\n"
1058 << " alloca: " << AS
.AI
<< "\n"
1059 << " use: " << I
<< "\n");
1060 return markAsDead(I
);
1063 uint64_t BeginOffset
= Offset
.getZExtValue();
1064 uint64_t EndOffset
= BeginOffset
+ Size
;
1066 // Clamp the end offset to the end of the allocation. Note that this is
1067 // formulated to handle even the case where "BeginOffset + Size" overflows.
1068 // This may appear superficially to be something we could ignore entirely,
1069 // but that is not so! There may be widened loads or PHI-node uses where
1070 // some instructions are dead but not others. We can't completely ignore
1071 // them, and so have to record at least the information here.
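    // For example (illustrative only): with an 8-byte alloca, a 16-byte use
    // starting at offset 4 is recorded as the clamped slice [4, 8) rather than
    // being dropped.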
1072 assert(AllocSize
>= BeginOffset
); // Established above.
1073 if (Size
> AllocSize
- BeginOffset
) {
1074 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size
<< " byte use @"
1075 << Offset
<< " to remain within the " << AllocSize
1076 << " byte alloca:\n"
1077 << " alloca: " << AS
.AI
<< "\n"
1078 << " use: " << I
<< "\n");
1079 EndOffset
= AllocSize
;
1082 AS
.Slices
.push_back(Slice(BeginOffset
, EndOffset
, U
, IsSplittable
));
1085 void visitBitCastInst(BitCastInst
&BC
) {
1087 return markAsDead(BC
);
1089 return Base::visitBitCastInst(BC
);
1092 void visitAddrSpaceCastInst(AddrSpaceCastInst
&ASC
) {
1093 if (ASC
.use_empty())
1094 return markAsDead(ASC
);
1096 return Base::visitAddrSpaceCastInst(ASC
);
1099 void visitGetElementPtrInst(GetElementPtrInst
&GEPI
) {
1100 if (GEPI
.use_empty())
1101 return markAsDead(GEPI
);
1103 if (SROAStrictInbounds
&& GEPI
.isInBounds()) {
1104 // FIXME: This is a manually un-factored variant of the basic code inside
1105 // of GEPs with checking of the inbounds invariant specified in the
1106 // langref in a very strict sense. If we ever want to enable
1107 // SROAStrictInbounds, this code should be factored cleanly into
1108 // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
1109 // by writing out the code here where we have the underlying allocation
1110 // size readily available.
1111 APInt GEPOffset
= Offset
;
1112 const DataLayout
&DL
= GEPI
.getModule()->getDataLayout();
1113 for (gep_type_iterator GTI
= gep_type_begin(GEPI
),
1114 GTE
= gep_type_end(GEPI
);
1115 GTI
!= GTE
; ++GTI
) {
1116 ConstantInt
*OpC
= dyn_cast
<ConstantInt
>(GTI
.getOperand());
1120 // Handle a struct index, which adds its field offset to the pointer.
1121 if (StructType
*STy
= GTI
.getStructTypeOrNull()) {
1122 unsigned ElementIdx
= OpC
->getZExtValue();
1123 const StructLayout
*SL
= DL
.getStructLayout(STy
);
1125 APInt(Offset
.getBitWidth(), SL
->getElementOffset(ElementIdx
));
1127 // For array or vector indices, scale the index by the size of the
1129 APInt Index
= OpC
->getValue().sextOrTrunc(Offset
.getBitWidth());
1130 GEPOffset
+= Index
* APInt(Offset
.getBitWidth(),
1131 GTI
.getSequentialElementStride(DL
));
1134 // If this index has computed an intermediate pointer which is not
1135 // inbounds, then the result of the GEP is a poison value and we can
1136 // delete it and all uses.
1137 if (GEPOffset
.ugt(AllocSize
))
1138 return markAsDead(GEPI
);
1142 return Base::visitGetElementPtrInst(GEPI
);
1145 void handleLoadOrStore(Type
*Ty
, Instruction
&I
, const APInt
&Offset
,
1146 uint64_t Size
, bool IsVolatile
) {
1147 // We allow splitting of non-volatile loads and stores where the type is an
1148 // integer type. These may be used to implement 'memcpy' or other "transfer
1149 // of bits" patterns.
1151 Ty
->isIntegerTy() && !IsVolatile
&& DL
.typeSizeEqualsStoreSize(Ty
);
1153 insertUse(I
, Offset
, Size
, IsSplittable
);
1156 void visitLoadInst(LoadInst
&LI
) {
1157 assert((!LI
.isSimple() || LI
.getType()->isSingleValueType()) &&
1158 "All simple FCA loads should have been pre-split");
1161 return PI
.setAborted(&LI
);
1163 TypeSize Size
= DL
.getTypeStoreSize(LI
.getType());
1164 if (Size
.isScalable())
1165 return PI
.setAborted(&LI
);
1167 return handleLoadOrStore(LI
.getType(), LI
, Offset
, Size
.getFixedValue(),
1171 void visitStoreInst(StoreInst
&SI
) {
1172 Value
*ValOp
= SI
.getValueOperand();
1174 return PI
.setEscapedAndAborted(&SI
);
1176 return PI
.setAborted(&SI
);
1178 TypeSize StoreSize
= DL
.getTypeStoreSize(ValOp
->getType());
1179 if (StoreSize
.isScalable())
1180 return PI
.setAborted(&SI
);
1182 uint64_t Size
= StoreSize
.getFixedValue();
    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
1191 if (Size
> AllocSize
|| Offset
.ugt(AllocSize
- Size
)) {
1192 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size
<< " byte store @"
1193 << Offset
<< " which extends past the end of the "
1194 << AllocSize
<< " byte alloca:\n"
1195 << " alloca: " << AS
.AI
<< "\n"
1196 << " use: " << SI
<< "\n");
1197 return markAsDead(SI
);
1200 assert((!SI
.isSimple() || ValOp
->getType()->isSingleValueType()) &&
1201 "All simple FCA stores should have been pre-split");
1202 handleLoadOrStore(ValOp
->getType(), SI
, Offset
, Size
, SI
.isVolatile());
1205 void visitMemSetInst(MemSetInst
&II
) {
1206 assert(II
.getRawDest() == *U
&& "Pointer use is not the destination?");
1207 ConstantInt
*Length
= dyn_cast
<ConstantInt
>(II
.getLength());
1208 if ((Length
&& Length
->getValue() == 0) ||
1209 (IsOffsetKnown
&& Offset
.uge(AllocSize
)))
1210 // Zero-length mem transfer intrinsics can be ignored entirely.
1211 return markAsDead(II
);
1214 return PI
.setAborted(&II
);
1216 insertUse(II
, Offset
, Length
? Length
->getLimitedValue()
1217 : AllocSize
- Offset
.getLimitedValue(),
1221 void visitMemTransferInst(MemTransferInst
&II
) {
1222 ConstantInt
*Length
= dyn_cast
<ConstantInt
>(II
.getLength());
1223 if (Length
&& Length
->getValue() == 0)
1224 // Zero-length mem transfer intrinsics can be ignored entirely.
1225 return markAsDead(II
);
1227 // Because we can visit these intrinsics twice, also check to see if the
1228 // first time marked this instruction as dead. If so, skip it.
1229 if (VisitedDeadInsts
.count(&II
))
1233 return PI
.setAborted(&II
);
1235 // This side of the transfer is completely out-of-bounds, and so we can
1236 // nuke the entire transfer. However, we also need to nuke the other side
1237 // if already added to our partitions.
1238 // FIXME: Yet another place we really should bypass this when
1239 // instrumenting for ASan.
1240 if (Offset
.uge(AllocSize
)) {
1241 SmallDenseMap
<Instruction
*, unsigned>::iterator MTPI
=
1242 MemTransferSliceMap
.find(&II
);
1243 if (MTPI
!= MemTransferSliceMap
.end())
1244 AS
.Slices
[MTPI
->second
].kill();
1245 return markAsDead(II
);
1248 uint64_t RawOffset
= Offset
.getLimitedValue();
1249 uint64_t Size
= Length
? Length
->getLimitedValue() : AllocSize
- RawOffset
;
1251 // Check for the special case where the same exact value is used for both
1253 if (*U
== II
.getRawDest() && *U
== II
.getRawSource()) {
1254 // For non-volatile transfers this is a no-op.
1255 if (!II
.isVolatile())
1256 return markAsDead(II
);
1258 return insertUse(II
, Offset
, Size
, /*IsSplittable=*/false);
1261 // If we have seen both source and destination for a mem transfer, then
1262 // they both point to the same alloca.
1264 SmallDenseMap
<Instruction
*, unsigned>::iterator MTPI
;
1265 std::tie(MTPI
, Inserted
) =
1266 MemTransferSliceMap
.insert(std::make_pair(&II
, AS
.Slices
.size()));
1267 unsigned PrevIdx
= MTPI
->second
;
1269 Slice
&PrevP
= AS
.Slices
[PrevIdx
];
1271 // Check if the begin offsets match and this is a non-volatile transfer.
1272 // In that case, we can completely elide the transfer.
1273 if (!II
.isVolatile() && PrevP
.beginOffset() == RawOffset
) {
1275 return markAsDead(II
);
1278 // Otherwise we have an offset transfer within the same alloca. We can't
1280 PrevP
.makeUnsplittable();
1283 // Insert the use now that we've fixed up the splittable nature.
1284 insertUse(II
, Offset
, Size
, /*IsSplittable=*/Inserted
&& Length
);
1286 // Check that we ended up with a valid index in the map.
1287 assert(AS
.Slices
[PrevIdx
].getUse()->getUser() == &II
&&
1288 "Map index doesn't point back to a slice with this user.");
1291 // Disable SRoA for any intrinsics except for lifetime invariants and
1293 // FIXME: What about debug intrinsics? This matches old behavior, but
1294 // doesn't make sense.
1295 void visitIntrinsicInst(IntrinsicInst
&II
) {
1296 if (II
.isDroppable()) {
1297 AS
.DeadUseIfPromotable
.push_back(U
);
1302 return PI
.setAborted(&II
);
1304 if (II
.isLifetimeStartOrEnd()) {
1305 ConstantInt
*Length
= cast
<ConstantInt
>(II
.getArgOperand(0));
1306 uint64_t Size
= std::min(AllocSize
- Offset
.getLimitedValue(),
1307 Length
->getLimitedValue());
1308 insertUse(II
, Offset
, Size
, true);
1312 if (II
.isLaunderOrStripInvariantGroup()) {
1313 insertUse(II
, Offset
, AllocSize
, true);
1318 Base::visitIntrinsicInst(II
);
1321 Instruction
*hasUnsafePHIOrSelectUse(Instruction
*Root
, uint64_t &Size
) {
1322 // We consider any PHI or select that results in a direct load or store of
1323 // the same offset to be a viable use for slicing purposes. These uses
1324 // are considered unsplittable and the size is the maximum loaded or stored
1326 SmallPtrSet
<Instruction
*, 4> Visited
;
1327 SmallVector
<std::pair
<Instruction
*, Instruction
*>, 4> Uses
;
1328 Visited
.insert(Root
);
1329 Uses
.push_back(std::make_pair(cast
<Instruction
>(*U
), Root
));
1330 const DataLayout
&DL
= Root
->getModule()->getDataLayout();
1331 // If there are no loads or stores, the access is dead. We mark that as
1332 // a size zero access.
1335 Instruction
*I
, *UsedI
;
1336 std::tie(UsedI
, I
) = Uses
.pop_back_val();
1338 if (LoadInst
*LI
= dyn_cast
<LoadInst
>(I
)) {
1339 TypeSize LoadSize
= DL
.getTypeStoreSize(LI
->getType());
1340 if (LoadSize
.isScalable()) {
1344 Size
= std::max(Size
, LoadSize
.getFixedValue());
1347 if (StoreInst
*SI
= dyn_cast
<StoreInst
>(I
)) {
1348 Value
*Op
= SI
->getOperand(0);
1351 TypeSize StoreSize
= DL
.getTypeStoreSize(Op
->getType());
1352 if (StoreSize
.isScalable()) {
1356 Size
= std::max(Size
, StoreSize
.getFixedValue());
1360 if (GetElementPtrInst
*GEP
= dyn_cast
<GetElementPtrInst
>(I
)) {
1361 if (!GEP
->hasAllZeroIndices())
1363 } else if (!isa
<BitCastInst
>(I
) && !isa
<PHINode
>(I
) &&
1364 !isa
<SelectInst
>(I
) && !isa
<AddrSpaceCastInst
>(I
)) {
1368 for (User
*U
: I
->users())
1369 if (Visited
.insert(cast
<Instruction
>(U
)).second
)
1370 Uses
.push_back(std::make_pair(I
, cast
<Instruction
>(U
)));
1371 } while (!Uses
.empty());
1376 void visitPHINodeOrSelectInst(Instruction
&I
) {
1377 assert(isa
<PHINode
>(I
) || isa
<SelectInst
>(I
));
1379 return markAsDead(I
);
1381 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382 // instructions in this BB, which may be required during rewriting. Bail out
1384 if (isa
<PHINode
>(I
) &&
1385 I
.getParent()->getFirstInsertionPt() == I
.getParent()->end())
1386 return PI
.setAborted(&I
);
1388 // TODO: We could use simplifyInstruction here to fold PHINodes and
1389 // SelectInsts. However, doing so requires to change the current
1390 // dead-operand-tracking mechanism. For instance, suppose neither loading
1391 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1392 // trap either. However, if we simply replace %U with undef using the
1393 // current dead-operand-tracking mechanism, "load (select undef, undef,
1394 // %other)" may trap because the select may return the first operand
1396 if (Value
*Result
= foldPHINodeOrSelectInst(I
)) {
1398 // If the result of the constant fold will be the pointer, recurse
1399 // through the PHI/select as if we had RAUW'ed it.
1402 // Otherwise the operand to the PHI/select is dead, and we can replace
1404 AS
.DeadOperands
.push_back(U
);
1410 return PI
.setAborted(&I
);
1412 // See if we already have computed info on this node.
1413 uint64_t &Size
= PHIOrSelectSizes
[&I
];
1415 // This is a new PHI/Select, check for an unsafe use of it.
1416 if (Instruction
*UnsafeI
= hasUnsafePHIOrSelectUse(&I
, Size
))
1417 return PI
.setAborted(UnsafeI
);
1420 // For PHI and select operands outside the alloca, we can't nuke the entire
1421 // phi or select -- the other side might still be relevant, so we special
1422 // case them here and use a separate structure to track the operands
1423 // themselves which should be replaced with poison.
1424 // FIXME: This should instead be escaped in the event we're instrumenting
1425 // for address sanitization.
1426 if (Offset
.uge(AllocSize
)) {
1427 AS
.DeadOperands
.push_back(U
);
1431 insertUse(I
, Offset
, Size
);
1434 void visitPHINode(PHINode
&PN
) { visitPHINodeOrSelectInst(PN
); }
1436 void visitSelectInst(SelectInst
&SI
) { visitPHINodeOrSelectInst(SI
); }
1438 /// Disable SROA entirely if there are unhandled users of the alloca.
1439 void visitInstruction(Instruction
&I
) { PI
.setAborted(&I
); }
1442 AllocaSlices::AllocaSlices(const DataLayout
&DL
, AllocaInst
&AI
)
1444 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1447 PointerEscapingInstr(nullptr) {
1448 SliceBuilder
PB(DL
, AI
, *this);
1449 SliceBuilder::PtrInfo PtrI
= PB
.visitPtr(AI
);
1450 if (PtrI
.isEscaped() || PtrI
.isAborted()) {
1451 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1452 // possibly by just storing the PtrInfo in the AllocaSlices.
1453 PointerEscapingInstr
= PtrI
.getEscapingInst() ? PtrI
.getEscapingInst()
1454 : PtrI
.getAbortingInst();
1455 assert(PointerEscapingInstr
&& "Did not track a bad instruction");
1459 llvm::erase_if(Slices
, [](const Slice
&S
) { return S
.isDead(); });
1461 // Sort the uses. This arranges for the offsets to be in ascending order,
1462 // and the sizes to be in descending order.
1463 llvm::stable_sort(Slices
);
1466 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1468 void AllocaSlices::print(raw_ostream
&OS
, const_iterator I
,
1469 StringRef Indent
) const {
1470 printSlice(OS
, I
, Indent
);
1472 printUse(OS
, I
, Indent
);
1475 void AllocaSlices::printSlice(raw_ostream
&OS
, const_iterator I
,
1476 StringRef Indent
) const {
1477 OS
<< Indent
<< "[" << I
->beginOffset() << "," << I
->endOffset() << ")"
1478 << " slice #" << (I
- begin())
1479 << (I
->isSplittable() ? " (splittable)" : "");
1482 void AllocaSlices::printUse(raw_ostream
&OS
, const_iterator I
,
1483 StringRef Indent
) const {
1484 OS
<< Indent
<< " used by: " << *I
->getUse()->getUser() << "\n";
1487 void AllocaSlices::print(raw_ostream
&OS
) const {
1488 if (PointerEscapingInstr
) {
1489 OS
<< "Can't analyze slices for alloca: " << AI
<< "\n"
1490 << " A pointer to this alloca escaped by:\n"
1491 << " " << *PointerEscapingInstr
<< "\n";
1495 OS
<< "Slices of alloca: " << AI
<< "\n";
1496 for (const_iterator I
= begin(), E
= end(); I
!= E
; ++I
)
1500 LLVM_DUMP_METHOD
void AllocaSlices::dump(const_iterator I
) const {
1503 LLVM_DUMP_METHOD
void AllocaSlices::dump() const { print(dbgs()); }
1505 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1507 /// Walk the range of a partitioning looking for a common type to cover this
1508 /// sequence of slices.
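///
/// For example (illustrative only): if every slice spanning the partition is
/// loaded and stored as i32, the common type is i32. If the uses mix float and
/// i32, no common type is returned, but i32 is still reported as the largest
/// integer type seen, which the rewriter can fall back on.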
1509 static std::pair
<Type
*, IntegerType
*>
1510 findCommonType(AllocaSlices::const_iterator B
, AllocaSlices::const_iterator E
,
1511 uint64_t EndOffset
) {
1513 bool TyIsCommon
= true;
1514 IntegerType
*ITy
= nullptr;
1516 // Note that we need to look at *every* alloca slice's Use to ensure we
1517 // always get consistent results regardless of the order of slices.
1518 for (AllocaSlices::const_iterator I
= B
; I
!= E
; ++I
) {
1519 Use
*U
= I
->getUse();
1520 if (isa
<IntrinsicInst
>(*U
->getUser()))
1522 if (I
->beginOffset() != B
->beginOffset() || I
->endOffset() != EndOffset
)
1525 Type
*UserTy
= nullptr;
1526 if (LoadInst
*LI
= dyn_cast
<LoadInst
>(U
->getUser())) {
1527 UserTy
= LI
->getType();
1528 } else if (StoreInst
*SI
= dyn_cast
<StoreInst
>(U
->getUser())) {
1529 UserTy
= SI
->getValueOperand()->getType();
1532 if (IntegerType
*UserITy
= dyn_cast_or_null
<IntegerType
>(UserTy
)) {
1533 // If the type is larger than the partition, skip it. We only encounter
1534 // this for split integer operations where we want to use the type of the
1535 // entity causing the split. Also skip if the type is not a byte width
1537 if (UserITy
->getBitWidth() % 8 != 0 ||
1538 UserITy
->getBitWidth() / 8 > (EndOffset
- B
->beginOffset()))
1541 // Track the largest bitwidth integer type used in this way in case there
1542 // is no common type.
1543 if (!ITy
|| ITy
->getBitWidth() < UserITy
->getBitWidth())
1547 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1548 // depend on types skipped above.
1549 if (!UserTy
|| (Ty
&& Ty
!= UserTy
))
1550 TyIsCommon
= false; // Give up on anything but an iN type.
1555 return {TyIsCommon
? Ty
: nullptr, ITy
};
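
// Illustrative (hypothetical) example of the routine above: for a partition
// covering bytes [0, 4) reached by one `load i32` and one `store float`, the
// user types disagree, so the common type is null while ITy records i32; if
// both accesses used i32, the pair {i32, i32} would be returned instead.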

/// PHI instructions that use an alloca and are subsequently loaded can be
/// rewritten to load both input pointers in the pred blocks and then PHI the
/// results, allowing the load of the alloca to be promoted.
/// From this:
///   %P2 = phi [i32* %Alloca, i32* %Other]
///   %V = load i32* %P2
/// to:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   ...
///   %V2 = load i32* %Other
///   ...
///   %V = phi [i32 %V1, i32 %V2]
///
/// We can do this to a select if its only uses are loads and if the operands
/// to the select can be loaded unconditionally.
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
static bool isSafePHIToSpeculate(PHINode &PN) {
  const DataLayout &DL = PN.getModule()->getDataLayout();

  // For now, we can only do this promotion if the load is in the same block
  // as the PHI, and if there are no stores between the phi and load.
  // TODO: Allow recursive phi users.
  // TODO: Allow stores.
  BasicBlock *BB = PN.getParent();
  Align MaxAlign;
  uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
  Type *LoadType = nullptr;
  for (User *U : PN.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(U);
    if (!LI || !LI->isSimple())
      return false;

    // For now we only allow loads in the same block as the PHI. This is
    // a common case that happens when instcombine merges two loads through
    // a PHI.
    if (LI->getParent() != BB)
      return false;

    if (LoadType) {
      if (LoadType != LI->getType())
        return false;
    } else {
      LoadType = LI->getType();
    }

    // Ensure that there are no instructions between the PHI and the load that
    // could store.
    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
      if (BBI->mayWriteToMemory())
        return false;

    MaxAlign = std::max(MaxAlign, LI->getAlign());
  }

  if (!LoadType)
    return false;

  APInt LoadSize =
      APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());

  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if it is a critical edge.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
    Value *InVal = PN.getIncomingValue(Idx);

    // If the value is produced by the terminator of the predecessor (an
    // invoke) or it has side-effects, there is no valid place to put a load
    // in the predecessor.
    if (TI == InVal || TI->mayHaveSideEffects())
      return false;

    // If the predecessor has a single successor, then the edge isn't
    // critical.
    if (TI->getNumSuccessors() == 1)
      continue;

    // If this pointer is always safe to load, or if we can prove that there
    // is already a load in the block, then we can move the load to the pred
    // block.
    if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
      continue;

    return false;
  }

  return true;
}

static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
  LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");

  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
  Type *LoadTy = SomeLoad->getType();
  IRB.SetInsertPoint(&PN);
  PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                 PN.getName() + ".sroa.speculated");

  // Get the AA tags and alignment to use from one of the loads. It does not
  // matter which one we get and if any differ.
  AAMDNodes AATags = SomeLoad->getAAMetadata();
  Align Alignment = SomeLoad->getAlign();

  // Rewrite all loads of the PN to use the new PHI.
  while (!PN.use_empty()) {
    LoadInst *LI = cast<LoadInst>(PN.user_back());
    LI->replaceAllUsesWith(NewPN);
    LI->eraseFromParent();
  }

  // Inject loads into all of the pred blocks.
  DenseMap<BasicBlock *, Value *> InjectedLoads;
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    BasicBlock *Pred = PN.getIncomingBlock(Idx);
    Value *InVal = PN.getIncomingValue(Idx);

    // A PHI node is allowed to have multiple (duplicated) entries for the same
    // basic block, as long as the value is the same. So if we already injected
    // a load in the predecessor, then we should reuse the same load for all
    // duplicated entries.
    if (Value *V = InjectedLoads.lookup(Pred)) {
      NewPN->addIncoming(V, Pred);
      continue;
    }

    Instruction *TI = Pred->getTerminator();
    IRB.SetInsertPoint(TI);

    LoadInst *Load = IRB.CreateAlignedLoad(
        LoadTy, InVal, Alignment,
        (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
    ++NumLoadsSpeculated;
    if (AATags)
      Load->setAAMetadata(AATags);
    NewPN->addIncoming(Load, Pred);
    InjectedLoads[Pred] = Load;
  }

  LLVM_DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
  PN.eraseFromParent();
}

SelectHandSpeculativity &
SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
  if (isTrueVal)
    Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
  else
    Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
  return *this;
}

bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
  return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
                   : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
}

bool SelectHandSpeculativity::areAllSpeculatable() const {
  return isSpeculatable(/*isTrueVal=*/true) &&
         isSpeculatable(/*isTrueVal=*/false);
}

bool SelectHandSpeculativity::areAnySpeculatable() const {
  return isSpeculatable(/*isTrueVal=*/true) ||
         isSpeculatable(/*isTrueVal=*/false);
}

bool SelectHandSpeculativity::areNoneSpeculatable() const {
  return !areAnySpeculatable();
}

static SelectHandSpeculativity
isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
  assert(LI.isSimple() && "Only for simple loads");
  SelectHandSpeculativity Spec;

  const DataLayout &DL = SI.getModule()->getDataLayout();
  for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
    if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
                                    &LI))
      Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
    else if (PreserveCFG)
      return Spec;

  return Spec;
}

std::optional<RewriteableMemOps>
SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
  RewriteableMemOps Ops;

  for (User *U : SI.users()) {
    if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
      U = *BC->user_begin();

    if (auto *Store = dyn_cast<StoreInst>(U)) {
      // Note that atomic stores can be transformed; atomic semantics do not
      // have any meaning for a local alloca. Stores are not speculatable,
      // however, so if we can't turn it into a predicated store, we are done.
      if (Store->isVolatile() || PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Store);
      continue;
    }

    auto *LI = dyn_cast<LoadInst>(U);

    // Note that atomic loads can be transformed;
    // atomic semantics do not have any meaning for a local alloca.
    if (!LI || LI->isVolatile())
      return {}; // Give up on this `select`.

    PossiblySpeculatableLoad Load(LI);
    if (!LI->isSimple()) {
      // If the `load` is not simple, we can't speculatively execute it,
      // but we could handle this via a CFG modification. But can we?
      if (PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Load);
      continue;
    }

    SelectHandSpeculativity Spec =
        isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
    if (PreserveCFG && !Spec.areAllSpeculatable())
      return {}; // Give up on this `select`.

    Load.setInt(Spec);
    Ops.emplace_back(Load);
  }

  return Ops;
}

static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
                                     IRBuilderTy &IRB) {
  LLVM_DEBUG(dbgs() << "    original load: " << SI << "\n");

  Value *TV = SI.getTrueValue();
  Value *FV = SI.getFalseValue();
  // Replace the given load of the select with a select of two loads.

  assert(LI.isSimple() && "We only speculate simple loads");

  IRB.SetInsertPoint(&LI);

  LoadInst *TL =
      IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
                            LI.getName() + ".sroa.speculate.load.true");
  LoadInst *FL =
      IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
                            LI.getName() + ".sroa.speculate.load.false");
  NumLoadsSpeculated += 2;

  // Transfer alignment and AA info if present.
  TL->setAlignment(LI.getAlign());
  FL->setAlignment(LI.getAlign());

  AAMDNodes Tags = LI.getAAMetadata();
  if (Tags) {
    TL->setAAMetadata(Tags);
    FL->setAAMetadata(Tags);
  }

  Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
                              LI.getName() + ".sroa.speculated");

  LLVM_DEBUG(dbgs() << "          speculated to: " << *V << "\n");
  LI.replaceAllUsesWith(V);
}
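
// Illustrative (hypothetical) IR sketch of the rewrite performed above:
//   %V = load i32, ptr %Sel          ; %Sel = select i1 %C, ptr %A, ptr %B
// becomes
//   %V.t = load i32, ptr %A
//   %V.f = load i32, ptr %B
//   %V   = select i1 %C, i32 %V.t, i32 %V.f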

template <typename T>
static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
                                 SelectHandSpeculativity Spec,
                                 DomTreeUpdater &DTU) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
  LLVM_DEBUG(dbgs() << "    original mem op: " << I << "\n");
  BasicBlock *Head = I.getParent();
  Instruction *ThenTerm = nullptr;
  Instruction *ElseTerm = nullptr;
  if (Spec.areNoneSpeculatable())
    SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
                                  SI.getMetadata(LLVMContext::MD_prof), &DTU);
  else {
    SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
                              SI.getMetadata(LLVMContext::MD_prof), &DTU,
                              /*LI=*/nullptr, /*ThenBlock=*/nullptr);
    if (Spec.isSpeculatable(/*isTrueVal=*/true))
      cast<BranchInst>(Head->getTerminator())->swapSuccessors();
  }
  auto *HeadBI = cast<BranchInst>(Head->getTerminator());
  Spec = {}; // Do not use `Spec` beyond this point.
  BasicBlock *Tail = I.getParent();
  Tail->setName(Head->getName() + ".cont");
  PHINode *PN;
  if (isa<LoadInst>(I))
    PN = PHINode::Create(I.getType(), 2, "", &I);
  for (BasicBlock *SuccBB : successors(Head)) {
    bool IsThen = SuccBB == HeadBI->getSuccessor(0);
    int SuccIdx = IsThen ? 0 : 1;
    auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
    auto &CondMemOp = cast<T>(*I.clone());
    if (NewMemOpBB != Head) {
      NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
      if (isa<LoadInst>(I))
        ++NumLoadsPredicated;
      else
        ++NumStoresPredicated;
    } else {
      CondMemOp.dropUBImplyingAttrsAndMetadata();
      ++NumLoadsSpeculated;
    }
    CondMemOp.insertBefore(NewMemOpBB->getTerminator());
    Value *Ptr = SI.getOperand(1 + SuccIdx);
    CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
    if (isa<LoadInst>(I)) {
      CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
      PN->addIncoming(&CondMemOp, NewMemOpBB);
    } else
      LLVM_DEBUG(dbgs() << "                 to: " << CondMemOp << "\n");
  }
  if (isa<LoadInst>(I)) {
    PN->takeName(&I);
    LLVM_DEBUG(dbgs() << "          to: " << *PN << "\n");
    I.replaceAllUsesWith(PN);
  }
}
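
// Illustrative (hypothetical) CFG shape produced above for a load of a select
// where neither hand is speculatable (block names assume the head block is
// called "head"):
//   head:       br i1 %cond, label %head.then, label %head.else
//   head.then:  %v.then.val = load ..., ptr %true.ptr
//   head.else:  %v.else.val = load ..., ptr %false.ptr
//   head.cont:  %v = phi [ %v.then.val, %head.then ],
//                        [ %v.else.val, %head.else ]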

static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
                                 SelectHandSpeculativity Spec,
                                 DomTreeUpdater &DTU) {
  if (auto *LI = dyn_cast<LoadInst>(&I))
    rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
  else if (auto *SI = dyn_cast<StoreInst>(&I))
    rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
  else
    llvm_unreachable_internal("Only for load and store.");
}

static bool rewriteSelectInstMemOps(SelectInst &SI,
                                    const RewriteableMemOps &Ops,
                                    IRBuilderTy &IRB, DomTreeUpdater *DTU) {
  bool CFGChanged = false;
  LLVM_DEBUG(dbgs() << "    original select: " << SI << "\n");

  for (const RewriteableMemOp &Op : Ops) {
    SelectHandSpeculativity Spec;
    Instruction *I;
    if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
      I = *US;
    } else {
      auto PSL = std::get<PossiblySpeculatableLoad>(Op);
      I = PSL.getPointer();
      Spec = PSL.getInt();
    }
    if (Spec.areAllSpeculatable()) {
      speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
    } else {
      assert(DTU && "Should not get here when not allowed to modify the CFG!");
      rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
      CFGChanged = true;
    }
    I->eraseFromParent();
  }

  for (User *U : make_early_inc_range(SI.users()))
    cast<BitCastInst>(U)->eraseFromParent();
  SI.eraseFromParent();
  return CFGChanged;
}

/// Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
                             APInt Offset, Type *PointerTy,
                             const Twine &NamePrefix) {
  if (Offset != 0)
    Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
                                   NamePrefix + "sroa_idx");
  return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
                                                 NamePrefix + "sroa_cast");
}

/// Compute the adjusted alignment for a load or store from an offset.
static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
  return commonAlignment(getLoadStoreAlignment(I), Offset);
}
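
// Illustrative (hypothetical) example: for a memory access whose original
// alignment is 16, rewritten at byte offset 4 into its slice,
// commonAlignment(16, 4) yields an alignment of 4.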

/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
  if (OldTy == NewTy)
    return true;

  // For integer types, we can't handle any bit-width differences. This would
  // break both vector conversions with extension and introduce endianness
  // issues when in conjunction with loads and stores.
  if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
               cast<IntegerType>(NewTy)->getBitWidth() &&
           "We can't have the same bitwidth for different int types");
    return false;
  }

  if (DL.getTypeSizeInBits(NewTy).getFixedValue() !=
      DL.getTypeSizeInBits(OldTy).getFixedValue())
    return false;
  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
    return false;

  // We can convert pointers to integers and vice-versa. Same for vectors
  // of pointers and integers.
  OldTy = OldTy->getScalarType();
  NewTy = NewTy->getScalarType();
  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
    if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
      unsigned OldAS = OldTy->getPointerAddressSpace();
      unsigned NewAS = NewTy->getPointerAddressSpace();
      // Convert pointers if they are pointers from the same address space or
      // different integral (not non-integral) address spaces with the same
      // pointer size.
      return OldAS == NewAS ||
             (!DL.isNonIntegralAddressSpace(OldAS) &&
              !DL.isNonIntegralAddressSpace(NewAS) &&
              DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
    }

    // We can convert integers to integral pointers, but not to non-integral
    // pointers.
    if (OldTy->isIntegerTy())
      return !DL.isNonIntegralPointerType(NewTy);

    // We can convert integral pointers to integers, but non-integral pointers
    // need to remain pointers.
    if (!DL.isNonIntegralPointerType(OldTy))
      return NewTy->isIntegerTy();

    return false;
  }

  if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
    return false;

  return true;
}
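
// Illustrative (hypothetical) examples for the predicate above, assuming
// 64-bit integral pointers: i64 <-> ptr and <2 x i32> <-> i64 are convertible
// (same 64-bit total size), while i32 <-> i64 is rejected because the integer
// bit-widths differ.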

/// Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                           Type *NewTy) {
  Type *OldTy = V->getType();
  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");

  if (OldTy == NewTy)
    return V;

  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
         "Integer types must be the exact same to convert.");

  // See if we need inttoptr for this type pair. May require additional bitcast.
  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
    // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
    // Directly handle i64 to i8*
    return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
                              NewTy);
  }

  // See if we need ptrtoint for this type pair. May require additional bitcast.
  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
    // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
    // Expand i8* to i64 --> i8* to i64 to i64
    return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                             NewTy);
  }

  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
    unsigned OldAS = OldTy->getPointerAddressSpace();
    unsigned NewAS = NewTy->getPointerAddressSpace();
    // To convert pointers with different address spaces (they are already
    // checked convertible, i.e. they have the same pointer size), so far we
    // cannot use `bitcast` (which has restrict on the same address space) or
    // `addrspacecast` (which is not always no-op casting). Instead, use a pair
    // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
    // width.
    if (OldAS != NewAS) {
      assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
      return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                                NewTy);
    }
  }

  return IRB.CreateBitCast(V, NewTy);
}

/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                            VectorType *Ty,
                                            uint64_t ElementSize,
                                            const DataLayout &DL) {
  // First validate the slice offsets.
  uint64_t BeginOffset =
      std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset ||
      BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
    return false;
  uint64_t EndOffset =
      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset ||
      EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
    return false;

  assert(EndIndex > BeginIndex && "Empty vector!");
  uint64_t NumElements = EndIndex - BeginIndex;
  Type *SliceTy = (NumElements == 1)
                      ? Ty->getElementType()
                      : FixedVectorType::get(Ty->getElementType(), NumElements);

  Type *SplitIntTy =
      Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);

  Use *U = S.getUse();

  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile())
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
      return false;
  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    Type *LTy = LI->getType();
    // Disable vector promotion when there are loads or stores of an FCA.
    if (LTy->isStructTy())
      return false;
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(LTy->isIntegerTy());
      LTy = SplitIntTy;
    }
    if (!canConvertValue(DL, SliceTy, LTy))
      return false;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    if (SI->isVolatile())
      return false;
    Type *STy = SI->getValueOperand()->getType();
    // Disable vector promotion when there are loads or stores of an FCA.
    if (STy->isStructTy())
      return false;
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(STy->isIntegerTy());
      STy = SplitIntTy;
    }
    if (!canConvertValue(DL, STy, SliceTy))
      return false;
  } else {
    return false;
  }

  return true;
}

/// Test whether a vector type is viable for promotion.
///
/// This implements the necessary checking for \c checkVectorTypesForPromotion
/// (and thus isVectorPromotionViable) over all slices of the alloca for the
/// given VectorType.
static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
                                        const DataLayout &DL) {
  uint64_t ElementSize =
      DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();

  // While the definition of LLVM vectors is bitpacked, we don't support sizes
  // that aren't byte sized.
  if (ElementSize % 8)
    return false;
  assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
         "vector size not a multiple of element size?");
  ElementSize /= 8;

  for (const Slice &S : P)
    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
      return false;

  for (const Slice *S : P.splitSliceTails())
    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
      return false;

  return true;
}

/// Test whether any vector type in \p CandidateTys is viable for promotion.
///
/// This implements the necessary checking for \c isVectorPromotionViable over
/// all slices of the alloca for the given VectorType.
static VectorType *
checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
                             SmallVectorImpl<VectorType *> &CandidateTys,
                             bool HaveCommonEltTy, Type *CommonEltTy,
                             bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
                             VectorType *CommonVecPtrTy) {
  // If we didn't find a vector type, nothing to do here.
  if (CandidateTys.empty())
    return nullptr;

  // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
  // then we should choose it, not some other alternative.
  // But, we can't perform a no-op pointer address space change via bitcast,
  // so if we didn't have a common pointer element type, bail.
  if (HaveVecPtrTy && !HaveCommonVecPtrTy)
    return nullptr;

  // Try to pick the "best" element type out of the choices.
  if (!HaveCommonEltTy && HaveVecPtrTy) {
    // If there was a pointer element type, there's really only one choice.
    CandidateTys.clear();
    CandidateTys.push_back(CommonVecPtrTy);
  } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
    // Integer-ify vector types.
    for (VectorType *&VTy : CandidateTys) {
      if (!VTy->getElementType()->isIntegerTy())
        VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
            VTy->getContext(), VTy->getScalarSizeInBits())));
    }

    // Rank the remaining candidate vector types. This is easy because we know
    // they're all integer vectors. We sort by ascending number of elements.
    auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(RHSTy)->getNumElements() <
             cast<FixedVectorType>(LHSTy)->getNumElements();
    };
    auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(RHSTy)->getNumElements() ==
             cast<FixedVectorType>(LHSTy)->getNumElements();
    };
    llvm::sort(CandidateTys, RankVectorTypesComp);
    CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(),
                                   RankVectorTypesEq),
                       CandidateTys.end());
  } else {
    // The only way to have the same element type in every vector type is to
    // have the same vector type. Check that and remove all but one.
#ifndef NDEBUG
    for (VectorType *VTy : CandidateTys) {
      assert(VTy->getElementType() == CommonEltTy &&
             "Unaccounted for element type!");
      assert(VTy == CandidateTys[0] &&
             "Different vector types with the same element type!");
    }
#endif
    CandidateTys.resize(1);
  }

  // FIXME: hack. Do we have a named constant for this?
  // SDAG SDNode can't have more than 65535 operands.
  llvm::erase_if(CandidateTys, [](VectorType *VTy) {
    return cast<FixedVectorType>(VTy)->getNumElements() >
           std::numeric_limits<unsigned short>::max();
  });

  for (VectorType *VTy : CandidateTys)
    if (checkVectorTypeForPromotion(P, VTy, DL))
      return VTy;

  return nullptr;
}

/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
  // Collect the candidate types for vector-based promotion. Also track whether
  // we have different element types.
  SmallVector<VectorType *, 4> CandidateTys;
  SetVector<Type *> LoadStoreTys;
  Type *CommonEltTy = nullptr;
  VectorType *CommonVecPtrTy = nullptr;
  bool HaveVecPtrTy = false;
  bool HaveCommonEltTy = true;
  bool HaveCommonVecPtrTy = true;
  auto CheckCandidateType = [&](Type *Ty) {
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      // Return if bitcast to vectors is different for total size in bits.
      if (!CandidateTys.empty()) {
        VectorType *V = CandidateTys[0];
        if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
            DL.getTypeSizeInBits(V).getFixedValue()) {
          CandidateTys.clear();
          return;
        }
      }
      CandidateTys.push_back(VTy);
      Type *EltTy = VTy->getElementType();

      if (!CommonEltTy)
        CommonEltTy = EltTy;
      else if (CommonEltTy != EltTy)
        HaveCommonEltTy = false;

      if (EltTy->isPointerTy()) {
        HaveVecPtrTy = true;
        if (!CommonVecPtrTy)
          CommonVecPtrTy = VTy;
        else if (CommonVecPtrTy != VTy)
          HaveCommonVecPtrTy = false;
      }
    }
  };

  // Put load and store types into a set for de-duplication.
  for (const Slice &S : P) {
    Type *Ty;
    if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
      Ty = LI->getType();
    else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
      Ty = SI->getValueOperand()->getType();
    else
      continue;
    LoadStoreTys.insert(Ty);
    // Consider any loads or stores that are the exact size of the slice.
    if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
      CheckCandidateType(Ty);
  }

  if (auto *VTy = checkVectorTypesForPromotion(
          P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
          HaveCommonVecPtrTy, CommonVecPtrTy))
    return VTy;

  // Consider additional vector types where the element type size is a
  // multiple of load/store element size.
  for (Type *Ty : LoadStoreTys) {
    if (!VectorType::isValidElementType(Ty))
      continue;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
    // Make a copy of CandidateTys and iterate through it, because we might
    // append to CandidateTys in the loop.
    SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
    CandidateTys.clear();
    for (VectorType *&VTy : CandidateTysCopy) {
      unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
      unsigned ElementSize =
          DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
      if (TypeSize != VectorSize && TypeSize != ElementSize &&
          VectorSize % TypeSize == 0) {
        VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
        CheckCandidateType(NewVTy);
      }
    }
  }

  return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
                                      CommonEltTy, HaveVecPtrTy,
                                      HaveCommonVecPtrTy, CommonVecPtrTy);
}
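
// Illustrative (hypothetical) example: a 16-byte partition accessed by whole
// <4 x float> loads and by i32 stores at 4-byte offsets can be promoted with
// <4 x float> as the candidate type, since each i32 access maps onto exactly
// one vector element and i32 is convertible to float.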

/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
static bool isIntegerWideningViableForSlice(const Slice &S,
                                            uint64_t AllocBeginOffset,
                                            Type *AllocaTy,
                                            const DataLayout &DL,
                                            bool &WholeAllocaOp) {
  uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();

  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;

  Use *U = S.getUse();

  // Lifetime intrinsics operate over the whole alloca whose sizes are usually
  // larger than other load/store slices (RelEnd > Size). But lifetimes are
  // always promotable and should not impact other slices' promotability of the
  // partition.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->isLifetimeStartOrEnd() || II->isDroppable())
      return true;
  }

  // We can't reasonably handle cases where the load or store extends past
  // the end of the alloca's type and into its padding.
  if (RelEnd > Size)
    return false;

  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    // We can't handle loads that extend past the allocated memory.
    if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerLoad.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, AllocaTy, LI->getType())) {
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    Type *ValueTy = SI->getValueOperand()->getType();
    if (SI->isVolatile())
      return false;
    // We can't handle stores that extend past the allocated memory.
    if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerStore.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, ValueTy, AllocaTy)) {
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else {
    return false;
  }

  return true;
}

/// Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
                                    const DataLayout &DL) {
  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
  // Don't create integer types larger than the maximum bitwidth.
  if (SizeInBits > IntegerType::MAX_INT_BITS)
    return false;

  // Don't try to handle allocas with bit-padding.
  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
    return false;

  // We need to ensure that an integer type with the appropriate bitwidth can
  // be converted to the alloca type, whatever that is. We don't want to force
  // the alloca itself to have an integer type if there is a more suitable one.
  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
  if (!canConvertValue(DL, AllocaTy, IntTy) ||
      !canConvertValue(DL, IntTy, AllocaTy))
    return false;

  // While examining uses, we ensure that the alloca has a covering load or
  // store. We don't want to widen the integer operations only to fail to
  // promote due to some other unsplittable entry (which we may make splittable
  // later). However, if there are only splittable uses, go ahead and assume
  // that we cover the alloca.
  // FIXME: We shouldn't consider split slices that happen to start in the
  // partition here...
  bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);

  for (const Slice &S : P)
    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  for (const Slice *S : P.splitSliceTails())
    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  return WholeAllocaOp;
}
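
// Illustrative (hypothetical) example: an i32 alloca read by one full-width
// i32 load and written by two i16 stores at offsets 0 and 2 is viable here;
// the i16 stores are later rewritten as masked read-modify-write operations
// on the single i32 value.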

static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                             IntegerType *Ty, uint64_t Offset,
                             const Twine &Name) {
  LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
  IntegerType *IntTy = cast<IntegerType>(V->getType());
  assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
             DL.getTypeStoreSize(IntTy).getFixedValue() &&
         "Element extends past full value");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
                 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
  if (ShAmt) {
    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
    LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
  }
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot extract to a larger integer!");
  if (Ty != IntTy) {
    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
    LLVM_DEBUG(dbgs() << "     trunced: " << *V << "\n");
  }
  return V;
}

static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
                            Value *V, uint64_t Offset, const Twine &Name) {
  IntegerType *IntTy = cast<IntegerType>(Old->getType());
  IntegerType *Ty = cast<IntegerType>(V->getType());
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot insert a larger integer!");
  LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
  if (Ty != IntTy) {
    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
    LLVM_DEBUG(dbgs() << "    extended: " << *V << "\n");
  }
  assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
             DL.getTypeStoreSize(IntTy).getFixedValue() &&
         "Element store outside of alloca store");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
                 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
  if (ShAmt) {
    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
    LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
  }

  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
    LLVM_DEBUG(dbgs() << "      masked: " << *Old << "\n");
    V = IRB.CreateOr(Old, V, Name + ".insert");
    LLVM_DEBUG(dbgs() << "    inserted: " << *V << "\n");
  }
  return V;
}
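
// Illustrative (hypothetical) example of the endianness handling above:
// inserting an i8 at byte Offset 1 into an i32 uses ShAmt = 8 on a
// little-endian target, masking with ~(0xFF << 8) = 0xFFFF00FF; on a
// big-endian target ShAmt = 8 * (4 - 1 - 1) = 16, masking with 0xFF00FFFF.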

static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
                            unsigned EndIndex, const Twine &Name) {
  auto *VecTy = cast<FixedVectorType>(V->getType());
  unsigned NumElements = EndIndex - BeginIndex;
  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

  if (NumElements == VecTy->getNumElements())
    return V;

  if (NumElements == 1) {
    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
                                 Name + ".extract");
    LLVM_DEBUG(dbgs() << "     extract: " << *V << "\n");
    return V;
  }

  auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
  V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
  LLVM_DEBUG(dbgs() << "     shuffle: " << *V << "\n");
  return V;
}

static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
                           unsigned BeginIndex, const Twine &Name) {
  VectorType *VecTy = cast<VectorType>(Old->getType());
  assert(VecTy && "Can only insert a vector into a vector");

  VectorType *Ty = dyn_cast<VectorType>(V->getType());
  if (!Ty) {
    // Single element to insert.
    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
                                Name + ".insert");
    LLVM_DEBUG(dbgs() << "     insert: " << *V << "\n");
    return V;
  }

  assert(cast<FixedVectorType>(Ty)->getNumElements() <=
             cast<FixedVectorType>(VecTy)->getNumElements() &&
         "Too many elements!");
  if (cast<FixedVectorType>(Ty)->getNumElements() ==
      cast<FixedVectorType>(VecTy)->getNumElements()) {
    assert(V->getType() == VecTy && "Vector type mismatch");
    return V;
  }
  unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();

  // When inserting a smaller vector into the larger to store, we first
  // use a shuffle vector to widen it with undef elements, and then
  // a second shuffle vector to select between the loaded vector and the
  // incoming vector.
  SmallVector<int, 8> Mask;
  Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
    if (i >= BeginIndex && i < EndIndex)
      Mask.push_back(i - BeginIndex);
    else
      Mask.push_back(-1);
  V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
  LLVM_DEBUG(dbgs() << "    shuffle: " << *V << "\n");

  SmallVector<Constant *, 8> Mask2;
  Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
  for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
    Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));

  V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");

  LLVM_DEBUG(dbgs() << "    blend: " << *V << "\n");
  return V;
}
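
// Illustrative (hypothetical) example: inserting a <2 x i32> at BeginIndex 1
// of a <4 x i32> uses the expand mask <-1, 0, 1, -1> followed by the select
// mask <false, true, true, false>, so only elements 1 and 2 of the old vector
// are replaced.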

/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
/// lives here.
class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class InstVisitor<AllocaSliceRewriter, bool>;

  using Base = InstVisitor<AllocaSliceRewriter, bool>;

  const DataLayout &DL;
  AllocaSlices &AS;
  SROA &Pass;
  AllocaInst &OldAI, &NewAI;
  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
  Type *NewAllocaTy;

  // This is a convenience and flag variable that will be null unless the new
  // alloca's integer operations should be widened to this integer type due to
  // passing isIntegerWideningViable above. If it is non-null, the desired
  // integer type will be stored here for easy access during rewriting.
  IntegerType *IntTy;

  // If we are rewriting an alloca partition which can be written as pure
  // vector operations, we stash extra information here. When VecTy is
  // non-null, we have some strict guarantees about the rewritten alloca:
  //   - The new alloca is exactly the size of the vector type here.
  //   - The accesses all either map to the entire vector or to a single
  //     element.
  //   - The set of accessing instructions is only one of those handled above
  //     in isVectorPromotionViable. Generally these are the same access kinds
  //     which are promotable via mem2reg.
  VectorType *VecTy;
  Type *ElementTy;
  uint64_t ElementSize;

  // The original offset of the slice currently being rewritten relative to
  // the original alloca.
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

  // The new offsets of the slice currently being rewritten relative to the
  // original alloca.
  uint64_t NewBeginOffset = 0, NewEndOffset = 0;

  uint64_t SliceSize = 0;
  bool IsSplittable = false;
  bool IsSplit = false;
  Use *OldUse = nullptr;
  Instruction *OldPtr = nullptr;

  // Track post-rewrite users which are PHI nodes and Selects.
  SmallSetVector<PHINode *, 8> &PHIUsers;
  SmallSetVector<SelectInst *, 8> &SelectUsers;

  // Utility IR builder, whose name prefix is setup for each visited use, and
  // the insertion point is set to point to the user.
  IRBuilderTy IRB;

  // Return the new alloca, addrspacecasted if required to avoid changing the
  // addrspace of a volatile access.
  Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
    if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
      return &NewAI;

    Type *AccessTy = IRB.getPtrTy(AddrSpace);
    return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
  }

public:
  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                      AllocaInst &OldAI, AllocaInst &NewAI,
                      uint64_t NewAllocaBeginOffset,
                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                      VectorType *PromotableVecTy,
                      SmallSetVector<PHINode *, 8> &PHIUsers,
                      SmallSetVector<SelectInst *, 8> &SelectUsers)
      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
        NewAllocaBeginOffset(NewAllocaBeginOffset),
        NewAllocaEndOffset(NewAllocaEndOffset),
        NewAllocaTy(NewAI.getAllocatedType()),
        IntTy(IsIntegerPromotable
                  ? Type::getIntNTy(NewAI.getContext(),
                                    DL.getTypeSizeInBits(NewAI.getAllocatedType())
                                        .getFixedValue())
                  : nullptr),
        VecTy(PromotableVecTy),
        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
                          : 0),
        PHIUsers(PHIUsers), SelectUsers(SelectUsers),
        IRB(NewAI.getContext(), ConstantFolder()) {
    if (VecTy) {
      assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
             "Only multiple-of-8 sized vector elements are viable");
      ++NumVectorized;
    }
    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
  }

  bool visit(AllocaSlices::const_iterator I) {
    bool CanSROA = true;
    BeginOffset = I->beginOffset();
    EndOffset = I->endOffset();
    IsSplittable = I->isSplittable();
    IsSplit =
        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
    LLVM_DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
    LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
    LLVM_DEBUG(dbgs() << "\n");

    // Compute the intersecting offset range.
    assert(BeginOffset < NewAllocaEndOffset);
    assert(EndOffset > NewAllocaBeginOffset);
    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);

    SliceSize = NewEndOffset - NewBeginOffset;
    LLVM_DEBUG(dbgs() << "   Begin:(" << BeginOffset << ", " << EndOffset
                      << ") NewBegin:(" << NewBeginOffset << ", "
                      << NewEndOffset << ") NewAllocaBegin:("
                      << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
                      << ")\n");
    assert(IsSplit || NewBeginOffset == BeginOffset);
    OldUse = I->getUse();
    OldPtr = cast<Instruction>(OldUse->get());

    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
    IRB.SetInsertPoint(OldUserI);
    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
    IRB.getInserter().SetNamePrefix(
        Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");

    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
    if (VecTy || IntTy)
      assert(CanSROA);
    return CanSROA;
  }

private:
  // Make sure the other visit overloads are visible.
  using Base::visit;

  // Every instruction which can end up as a user must have a rewrite rule.
  bool visitInstruction(Instruction &I) {
    LLVM_DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }

  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
    // Note that the offset computation can use BeginOffset or NewBeginOffset
    // interchangeably for unsplit slices.
    assert(IsSplit || BeginOffset == NewBeginOffset);
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;

#ifndef NDEBUG
    StringRef OldName = OldPtr->getName();
    // Skip through the last '.sroa.' component of the name.
    size_t LastSROAPrefix = OldName.rfind(".sroa.");
    if (LastSROAPrefix != StringRef::npos) {
      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
      // Look for an SROA slice index.
      size_t IndexEnd = OldName.find_first_not_of("0123456789");
      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
        // Strip the index and look for the offset.
        OldName = OldName.substr(IndexEnd + 1);
        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
          // Strip the offset.
          OldName = OldName.substr(OffsetEnd + 1);
      }
    }
    // Strip any SROA suffixes as well.
    OldName = OldName.substr(0, OldName.find(".sroa_"));
#endif

    return getAdjustedPtr(IRB, DL, &NewAI,
                          APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
                          PointerTy,
#ifndef NDEBUG
                          Twine(OldName) + "."
#else
                          Twine()
#endif
    );
  }

  /// Compute suitable alignment to access this slice of the *new*
  /// alloca.
  ///
  /// You can optionally pass a type to this routine and if that type's ABI
  /// alignment is itself suitable, this will return zero.
  Align getSliceAlign() {
    return commonAlignment(NewAI.getAlign(),
                           NewBeginOffset - NewAllocaBeginOffset);
  }

  unsigned getIndex(uint64_t Offset) {
    assert(VecTy && "Can only call getIndex when rewriting a vector");
    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
    uint32_t Index = RelOffset / ElementSize;
    assert(Index * ElementSize == RelOffset);
    return Index;
  }

  void deleteIfTriviallyDead(Value *V) {
    Instruction *I = cast<Instruction>(V);
    if (isInstructionTriviallyDead(I))
      Pass.DeadInsts.push_back(I);
  }

  Value *rewriteVectorizedLoadInst(LoadInst &LI) {
    unsigned BeginIndex = getIndex(NewBeginOffset);
    unsigned EndIndex = getIndex(NewEndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");

    LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                           NewAI.getAlign(), "load");

    Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
                            LLVMContext::MD_access_group});
    return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
  }

  Value *rewriteIntegerLoad(LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                     NewAI.getAlign(), "load");
    V = convertValue(DL, IRB, V, IntTy);
    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
      IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
    }
    // It is possible that the extracted type is not the load type. This
    // happens if there is a load past the end of the alloca, and as
    // a consequence the slice is narrower but still a candidate for integer
    // lowering. To handle this case, we just zero extend the extracted
    // integer.
    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
           "Can only handle an extract for an overly wide load");
    if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
      V = IRB.CreateZExt(V, LI.getType());
    return V;
  }

  bool visitLoadInst(LoadInst &LI) {
    LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
    Value *OldOp = LI.getOperand(0);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = LI.getAAMetadata();

    unsigned AS = LI.getPointerAddressSpace();

    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                             : LI.getType();
    const bool IsLoadPastEnd =
        DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize;
    bool IsPtrAdjusted = false;
    Value *V;
    if (VecTy) {
      V = rewriteVectorizedLoadInst(LI);
    } else if (IntTy && LI.getType()->isIntegerTy()) {
      V = rewriteIntegerLoad(LI);
    } else if (NewBeginOffset == NewAllocaBeginOffset &&
               NewEndOffset == NewAllocaEndOffset &&
               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
                 TargetTy->isIntegerTy() && !LI.isVolatile()))) {
      Value *NewPtr =
          getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
      LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
                                              NewAI.getAlign(), LI.isVolatile(),
                                              LI.getName());
      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
      if (NewLI->isAtomic())
        NewLI->setAlignment(LI.getAlign());

      // Copy any metadata that is valid for the new load. This may require
      // conversion to a different kind of metadata, e.g. !nonnull might change
      // to !range or vice versa.
      copyMetadataForLoad(*NewLI, LI);

      // Do this after copyMetadataForLoad() to preserve the TBAA shift.
      if (AATags)
        NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

      // Try to preserve nonnull metadata
      V = NewLI;

      // If this is an integer load past the end of the slice (which means the
      // bytes outside the slice are undef or this load is dead) just forcibly
      // fix the integer size with correct handling of endianness.
      if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
        if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
          if (AITy->getBitWidth() < TITy->getBitWidth()) {
            V = IRB.CreateZExt(V, TITy, "load.ext");
            if (DL.isBigEndian())
              V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
                                "endian_shift");
          }
    } else {
      Type *LTy = IRB.getPtrTy(AS);
      LoadInst *NewLI =
          IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
                                getSliceAlign(), LI.isVolatile(), LI.getName());
      if (AATags)
        NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
      if (LI.isVolatile())
        NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
      NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
                               LLVMContext::MD_access_group});

      V = NewLI;
      IsPtrAdjusted = true;
    }
    V = convertValue(DL, IRB, V, TargetTy);

    if (IsSplit) {
      assert(!LI.isVolatile());
      assert(LI.getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
             "Split load isn't smaller than original load");
      assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
             "Non-byte-multiple bit width");
      // Move the insertion point just past the load so that we can refer to it.
      IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
      // Create a placeholder value with the same type as LI to use as the
      // basis for the new value. This allows us to replace the uses of LI with
      // the computed value, and then replace the placeholder with LI, leaving
      // LI only used for this computation.
      Value *Placeholder =
          new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
                       false, Align(1));
      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                        "insert");
      LI.replaceAllUsesWith(V);
      Placeholder->replaceAllUsesWith(&LI);
      Placeholder->deleteValue();
    } else {
      LI.replaceAllUsesWith(V);
    }

    Pass.DeadInsts.push_back(&LI);
    deleteIfTriviallyDead(OldOp);
    LLVM_DEBUG(dbgs() << "          to: " << *V << "\n");
    return !LI.isVolatile() && !IsPtrAdjusted;
  }

  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
                                  AAMDNodes AATags) {
    // Capture V for the purpose of debug-info accounting once it's converted
    // to a vector store.
    Value *OrigV = V;
    if (V->getType() != VecTy) {
      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");
      Type *SliceTy = (NumElements == 1)
                          ? ElementTy
                          : FixedVectorType::get(ElementTy, NumElements);
      if (V->getType() != SliceTy)
        V = convertValue(DL, IRB, V, SliceTy);

      // Mix in the existing elements.
      Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                         NewAI.getAlign(), "load");
      V = insertVector(IRB, Old, V, BeginIndex, "vec");
    }
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
    Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
    Pass.DeadInsts.push_back(&SI);

    // NOTE: Careful to use OrigV rather than V.
    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     Store, Store->getPointerOperand(), OrigV, DL);
    LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
    return true;
  }

  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
    assert(IntTy && "We cannot extract an integer from the alloca");
    assert(!SI.isVolatile());
    if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
        IntTy->getBitWidth()) {
      Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                         NewAI.getAlign(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
    }
    V = convertValue(DL, IRB, V, NewAllocaTy);
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
    Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     Store, Store->getPointerOperand(),
                     Store->getValueOperand(), DL);

    Pass.DeadInsts.push_back(&SI);
    LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
    return true;
  }

  bool visitStoreInst(StoreInst &SI) {
    LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
    Value *OldOp = SI.getOperand(1);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = SI.getAAMetadata();
    Value *V = SI.getValueOperand();

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after promoting this alloca.
    if (V->getType()->isPointerTy())
      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
        Pass.PostPromotionWorklist.insert(AI);

    if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) {
      assert(!SI.isVolatile());
      assert(V->getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
             "Non-byte-multiple bit width");
      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
                         "extract");
    }

    if (VecTy)
      return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
    if (IntTy && V->getType()->isIntegerTy())
      return rewriteIntegerStore(V, SI, AATags);

    StoreInst *NewSI;
    if (NewBeginOffset == NewAllocaBeginOffset &&
        NewEndOffset == NewAllocaEndOffset &&
        canConvertValue(DL, V->getType(), NewAllocaTy)) {
      V = convertValue(DL, IRB, V, NewAllocaTy);
      Value *NewPtr =
          getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());

      NewSI =
          IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
    } else {
      unsigned AS = SI.getPointerAddressSpace();
      Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
      NewSI =
          IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
    }
    NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
    if (SI.isVolatile())
      NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
    if (NewSI->isAtomic())
      NewSI->setAlignment(SI.getAlign());

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
                     NewSI, NewSI->getPointerOperand(),
                     NewSI->getValueOperand(), DL);

    Pass.DeadInsts.push_back(&SI);
    deleteIfTriviallyDead(OldOp);

    LLVM_DEBUG(dbgs() << "          to: " << *NewSI << "\n");
    return NewSI->getPointerOperand() == &NewAI &&
           NewSI->getValueOperand()->getType() == NewAllocaTy &&
           !SI.isVolatile();
  }
  /// Compute an integer value from splatting an i8 across the given
  /// number of bytes.
  ///
  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
  /// call this routine.
  /// FIXME: Heed the advice above.
  ///
  /// \param V The i8 value to splat.
  /// \param Size The number of bytes in the output (assuming i8 is one byte)
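  ///
  /// For example, splatting the byte 0xAB across Size == 4 produces the i32
  /// value 0xABABABAB: the byte is zero-extended and multiplied by
  /// 0xFFFFFFFF / 0xFF == 0x01010101, which is what the code below computes.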
  Value *getIntegerSplat(Value *V, unsigned Size) {
    assert(Size > 0 && "Expected a positive number of bytes.");
    IntegerType *VTy = cast<IntegerType>(V->getType());
    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
    if (Size == 1)
      return V;

    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
    V = IRB.CreateMul(
        IRB.CreateZExt(V, SplatIntTy, "zext"),
        IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
                       IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
                                      SplatIntTy)),
        "isplat");
    return V;
  }

  /// Compute a vector splat for a given element value.
  Value *getVectorSplat(Value *V, unsigned NumElements) {
    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
    LLVM_DEBUG(dbgs() << "       splat: " << *V << "\n");
    return V;
  }
  bool visitMemSetInst(MemSetInst &II) {
    LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
    assert(II.getRawDest() == OldPtr);

    AAMDNodes AATags = II.getAAMetadata();

    // If the memset has a variable size, it cannot be split, just adjust the
    // pointer to the new alloca.
    if (!isa<ConstantInt>(II.getLength())) {
      assert(!IsSplit);
      assert(NewBeginOffset == BeginOffset);
      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
      II.setDestAlignment(getSliceAlign());
      // In theory we should call migrateDebugInfo here. However, we do not
      // emit dbg.assign intrinsics for mem intrinsics storing through non-
      // constant geps, or storing a variable number of bytes.
      assert(at::getAssignmentMarkers(&II).empty() &&
             at::getDPVAssignmentMarkers(&II).empty() &&
             "AT: Unexpected link to non-const GEP");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    Type *AllocaTy = NewAI.getAllocatedType();
    Type *ScalarTy = AllocaTy->getScalarType();

    const bool CanContinue = [&]() {
      if (VecTy || IntTy)
        return true;
      if (BeginOffset > NewAllocaBeginOffset ||
          EndOffset < NewAllocaEndOffset)
        return false;
      // Length must be in range for FixedVectorType.
      auto *C = cast<ConstantInt>(II.getLength());
      const uint64_t Len = C->getLimitedValue();
      if (Len > std::numeric_limits<unsigned>::max())
        return false;
      auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
      auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
      return canConvertValue(DL, SrcTy, AllocaTy) &&
             DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
    }();

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memset.
    if (!CanContinue) {
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
      MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
          MaybeAlign(getSliceAlign()), II.isVolatile()));
      if (AATags)
        New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

      migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                       New, New->getRawDest(), nullptr, DL);

      LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
      return false;
    }

    // If we can represent this as a simple value, we have to build the actual
    // value to store, which requires expanding the byte present in memset to
    // a sensible representation for the alloca type. This is essentially
    // splatting the byte to a sufficiently wide integer, splatting it across
    // any desired vector width, and bitcasting to the final type.
    Value *V;

    if (VecTy) {
      // If this is a memset of a vectorized alloca, insert it.
      assert(ElementTy == ScalarTy);

      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");

      Value *Splat = getIntegerSplat(
          II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
      Splat = convertValue(DL, IRB, Splat, ElementTy);
      if (NumElements > 1)
        Splat = getVectorSplat(Splat, NumElements);

      Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                         NewAI.getAlign(), "oldload");
      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
    } else if (IntTy) {
      // If this is a memset on an alloca where we can widen stores, insert the
      // splat integer.
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(II.getValue(), Size);

      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                    EndOffset != NewAllocaEndOffset)) {
        Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                           NewAI.getAlign(), "oldload");
        Old = convertValue(DL, IRB, Old, IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = convertValue(DL, IRB, V, AllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(II.getValue(),
                          DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
        V = getVectorSplat(
            V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());

      V = convertValue(DL, IRB, V, AllocaTy);
    }

    Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
    StoreInst *New =
        IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
    New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                           LLVMContext::MD_access_group});
    if (AATags)
      New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

    migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                     New, New->getPointerOperand(), V, DL);

    LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
    return !II.isVolatile();
  }
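  /// Rewrite a memcpy/memmove that has this slice as its source or
  /// destination.
  ///
  /// Unsplittable transfers only have their pointer operand and alignment
  /// rewritten in place; splittable ones are narrowed to the slice and, where
  /// the new alloca has a usable vector or integer type, turned into a plain
  /// load/store pair through that type.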
  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    LLVM_DEBUG(dbgs() << "    original: " << II << "\n");

    AAMDNodes AATags = II.getAAMetadata();

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    Align SliceAlign = getSliceAlign();
    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      if (IsDest) {
        // Update the address component of linked dbg.assigns.
        auto UpdateAssignAddress = [&](auto *DbgAssign) {
          if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
              DbgAssign->getAddress() == II.getDest())
            DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
        };
        for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
        for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress);
        II.setDest(AdjustedPtr);
        II.setDestAlignment(SliceAlign);
      } else {
        II.setSource(AdjustedPtr);
        II.setSourceAlignment(SliceAlign);
      }

      LLVM_DEBUG(dbgs() << "          to: " << II << "\n");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy =
        !VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize !=
             DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
         !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
         !NewAI.getAllocatedType()->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // just a single unmodified copy.
    if (EmitMemCpy && &OldAI == &NewAI) {
      // Ensure the start lines up.
      assert(NewBeginOffset == BeginOffset);

      // Rewrite the size as needed.
      if (NewEndOffset != EndOffset)
        II.setLength(ConstantInt::get(II.getLength()->getType(),
                                      NewEndOffset - NewBeginOffset));
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI =
            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
      assert(AI != &OldAI && AI != &NewAI &&
             "Splittable transfers cannot reach the same alloca on both ends.");
      Pass.Worklist.insert(AI);
    }

    Type *OtherPtrTy = OtherPtr->getType();
    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();

    // Compute the relative offset for the other pointer within the transfer.
    unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
    APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
    Align OtherAlign =
        (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
    OtherAlign =
        commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());

    if (EmitMemCpy) {
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                OtherPtr->getName() + ".");

      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);

      Value *DestPtr, *SrcPtr;
      MaybeAlign DestAlign, SrcAlign;
      // Note: IsDest is true iff we're copying into the new alloca slice
      if (IsDest) {
        DestPtr = OurPtr;
        DestAlign = SliceAlign;
        SrcPtr = OtherPtr;
        SrcAlign = OtherAlign;
      } else {
        DestPtr = OtherPtr;
        DestAlign = OtherAlign;
        SrcPtr = OurPtr;
        SrcAlign = SliceAlign;
      }
      CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
                                       Size, II.isVolatile());
      if (AATags)
        New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

      APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
      if (IsDest) {
        migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
                         &II, New, DestPtr, nullptr, DL);
      } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                     DestPtr->stripAndAccumulateConstantOffsets(
                         DL, Offset, /*AllowNonInbounds*/ true))) {
        migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
                         SliceSize * 8, &II, New, DestPtr, nullptr, DL);
      }
      LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
      return false;
    }

    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                         NewEndOffset == NewAllocaEndOffset;
    uint64_t Size = NewEndOffset - NewBeginOffset;
    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
    unsigned NumElements = EndIndex - BeginIndex;
    IntegerType *SubIntTy =
        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;

    // Reset the other pointer type to match the register type we're going to
    // use, but using the address space of the original other pointer.
    Type *OtherTy;
    if (VecTy && !IsWholeAlloca) {
      if (NumElements == 1)
        OtherTy = VecTy->getElementType();
      else
        OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
    } else if (IntTy && !IsWholeAlloca) {
      OtherTy = SubIntTy;
    } else {
      OtherTy = NewAllocaTy;
    }

    Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                   OtherPtr->getName() + ".");
    MaybeAlign SrcAlign = OtherAlign;
    MaybeAlign DstAlign = SliceAlign;
    if (!IsDest)
      std::swap(SrcAlign, DstAlign);

    Value *SrcPtr;
    Value *DstPtr;

    if (IsDest) {
      DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
      SrcPtr = AdjPtr;
    } else {
      DstPtr = AdjPtr;
      SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
    }

    Value *Src;
    if (VecTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                  NewAI.getAlign(), "load");
      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                  NewAI.getAlign(), "load");
      Src = convertValue(DL, IRB, Src, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
    } else {
      LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
                                             II.isVolatile(), "copyload");
      Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
      if (AATags)
        Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
      Src = Load;
    }

    if (VecTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                         NewAI.getAlign(), "oldload");
      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                         NewAI.getAlign(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
      Src = convertValue(DL, IRB, Src, NewAllocaTy);
    }

    StoreInst *Store = cast<StoreInst>(
        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
    Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

    APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
    if (IsDest) {
      migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
                       Store, DstPtr, Src, DL);
    } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                   DstPtr->stripAndAccumulateConstantOffsets(
                       DL, Offset, /*AllowNonInbounds*/ true))) {
      migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
                       &II, Store, DstPtr, Src, DL);
    }

    LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
    return !II.isVolatile();
  }
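  /// Rewrite the non-memory intrinsics SROA understands: lifetime markers,
  /// launder/strip.invariant.group, and droppable intrinsics such as
  /// llvm.assume. Lifetime markers that no longer cover the whole new alloca
  /// are simply dropped.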
  bool visitIntrinsicInst(IntrinsicInst &II) {
    assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
            II.isDroppable()) &&
           "Unexpected intrinsic!");
    LLVM_DEBUG(dbgs() << "    original: " << II << "\n");

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(&II);

    if (II.isDroppable()) {
      assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
      // TODO For now we forget assumed information, this can be improved.
      OldPtr->dropDroppableUsesIn(II);
      return true;
    }

    if (II.isLaunderOrStripInvariantGroup())
      return true;

    assert(II.getArgOperand(1) == OldPtr);
    // Lifetime intrinsics are only promotable if they cover the whole alloca.
    // Therefore, we drop lifetime intrinsics which don't cover the whole
    // alloca.
    // (In theory, intrinsics which partially cover an alloca could be
    // promoted, but PromoteMemToReg doesn't handle that case.)
    // FIXME: Check whether the alloca is promotable before dropping the
    // lifetime intrinsics?
    if (NewBeginOffset != NewAllocaBeginOffset ||
        NewEndOffset != NewAllocaEndOffset)
      return true;

    ConstantInt *Size =
        ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                         NewEndOffset - NewBeginOffset);
    // Lifetime intrinsics always expect an i8* so directly get such a pointer
    // for the new alloca slice.
    Type *PointerTy =
        IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
    Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr, Size);
    else
      New = IRB.CreateLifetimeEnd(Ptr, Size);

    (void)New;
    LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");

    return true;
  }
  void fixLoadStoreAlign(Instruction &Root) {
    // This algorithm implements the same visitor loop as
    // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
    // and store along the way.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<Instruction *, 4> Uses;
    Visited.insert(&Root);
    Uses.push_back(&Root);
    do {
      Instruction *I = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
        continue;
      }

      assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
             isa<PHINode>(I) || isa<SelectInst>(I) ||
             isa<GetElementPtrInst>(I));
      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(cast<Instruction>(U));
    } while (!Uses.empty());
  }
  bool visitPHINode(PHINode &PN) {
    LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");
    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilderBase::InsertPointGuard Guard(IRB);
    if (isa<PHINode>(OldPtr))
      IRB.SetInsertPoint(OldPtr->getParent(),
                         OldPtr->getParent()->getFirstInsertionPt());
    else
      IRB.SetInsertPoint(OldPtr);
    IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());

    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);

    LLVM_DEBUG(dbgs() << "          to: " << PN << "\n");
    deleteIfTriviallyDead(OldPtr);

    // Fix the alignment of any loads or stores using this PHI node.
    fixLoadStoreAlign(PN);

    // PHIs can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    PHIUsers.insert(&PN);
    return true;
  }
  bool visitSelectInst(SelectInst &SI) {
    LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
           "Pointer isn't an operand!");
    assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");

    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    if (SI.getOperand(1) == OldPtr)
      SI.setOperand(1, NewPtr);
    if (SI.getOperand(2) == OldPtr)
      SI.setOperand(2, NewPtr);

    LLVM_DEBUG(dbgs() << "          to: " << SI << "\n");
    deleteIfTriviallyDead(OldPtr);

    // Fix the alignment of any loads or stores using this select.
    fixLoadStoreAlign(SI);

    // Selects can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    SelectUsers.insert(&SI);
    return true;
  }
};
/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class InstVisitor<AggLoadStoreRewriter, bool>;

  /// Queue of pointer uses to analyze and potentially rewrite.
  SmallVector<Use *, 8> Queue;

  /// Set to prevent us from cycling with phi nodes and loops.
  SmallPtrSet<User *, 8> Visited;

  /// The current pointer use being rewritten. This is used to dig up the used
  /// value (as opposed to the user).
  Use *U = nullptr;

  /// Used to calculate offsets, and hence alignment, of subobjects.
  const DataLayout &DL;

  IRBuilderTy &IRB;

public:
  AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
      : DL(DL), IRB(IRB) {}

  /// Rewrite loads and stores through a pointer and all pointers derived from
  /// it.
  bool rewrite(Instruction &I) {
    LLVM_DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(cast<Instruction>(U->getUser()));
    }
    return Changed;
  }

private:
  /// Enqueue all the users of the given instruction for further processing.
  /// This uses a set to de-duplicate users.
  void enqueueUsers(Instruction &I) {
    for (Use &U : I.uses())
      if (Visited.insert(U.getUser()).second)
        Queue.push_back(&U);
  }

  // Conservative default is to not rewrite anything.
  bool visitInstruction(Instruction &I) { return false; }
  /// Generic recursive split emission class.
  template <typename Derived> class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilderTy &IRB;

    /// The indices to be used with insert- or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;

    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;

    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// The base pointee type being GEPed into.
    Type *BaseTy;

    /// Known alignment of the base pointer.
    Align BaseAlign;

    /// To calculate offset of each component so we can correctly deduce
    /// alignments.
    const DataLayout &DL;

    /// Initialize the splitter with an insertion point, Ptr and start with a
    /// single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
               Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
        : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
          BaseAlign(BaseAlign), DL(DL) {
      IRB.SetInsertPoint(InsertionPoint);
    }

  public:
    /// Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
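    ///
    /// For example, splitting a load of { i32, [2 x float] } emits three leaf
    /// operations, for the index lists {0}, {1, 0} and {1, 1}, each through
    /// its own GEP of the base pointer.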
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType()) {
        unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
        return static_cast<Derived *>(this)->emitFunc(
            Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
      }

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };
  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    AAMDNodes AATags;

    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
                   AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
                   IRBuilderTy &IRB)
        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
                                     IRB),
          AATags(AATags) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *GEP =
          IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
      LoadInst *Load =
          IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");

      APInt Offset(
          DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
      if (AATags &&
          GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
        Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));

      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      LLVM_DEBUG(dbgs() << "          to: " << *Load << "\n");
    }
  };

  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
    LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
                            getAdjustedAlignment(&LI, 0), DL, IRB);
    Value *V = PoisonValue::get(LI.getType());
    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
    Visited.erase(&LI);
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }
  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
                    AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
                    const DataLayout &DL, IRBuilderTy &IRB)
        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
                                      DL, IRB),
          AATags(AATags), AggStore(AggStore) {}
    AAMDNodes AATags;
    StoreInst *AggStore;
    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      //
      // The gep and extractvalue values are factored out of the CreateStore
      // call to make the output independent of the argument evaluation order.
      Value *ExtractValue =
          IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
      Value *InBoundsGEP =
          IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
      StoreInst *Store =
          IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);

      APInt Offset(
          DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
      GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
      if (AATags)
        Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));

      // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
      // If we cannot (because there's an intervening non-const or unbounded
      // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
      // this instruction.
      Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
      if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
        uint64_t SizeInBits =
            DL.getTypeSizeInBits(Store->getValueOperand()->getType());
        migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
                         SizeInBits, AggStore, Store,
                         Store->getPointerOperand(), Store->getValueOperand(),
                         DL);
      } else {
        assert(at::getAssignmentMarkers(Store).empty() &&
               at::getDPVAssignmentMarkers(Store).empty() &&
               "AT: unexpected debug.assign linked to store through "
               "unbounded GEP");
      }
      LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
    }
  };

  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
                             getAdjustedAlignment(&SI, 0), DL, IRB);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    Visited.erase(&SI);
    // The stores replacing SI each have markers describing fragments of the
    // assignment so delete the assignment markers linked to SI.
    at::deleteAssignmentMarkers(&SI);
    SI.eraseFromParent();
    return true;
  }
  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    enqueueUsers(ASC);
    return false;
  }
  // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
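  //
  // For example, a GEP with constant indices whose pointer operand is
  // "select i1 %c, ptr %a, ptr %b" is rewritten into one GEP on %a, one GEP
  // on %b, and a new select between the two results.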
  bool foldGEPSelect(GetElementPtrInst &GEPI) {
    if (!GEPI.hasAllConstantIndices())
      return false;

    SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());

    LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):"
                      << "\n    original: " << *Sel
                      << "\n              " << GEPI);

    IRB.SetInsertPoint(&GEPI);
    SmallVector<Value *, 4> Index(GEPI.indices());
    bool IsInBounds = GEPI.isInBounds();

    Type *Ty = GEPI.getSourceElementType();
    Value *True = Sel->getTrueValue();
    Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep",
                                 IsInBounds);

    Value *False = Sel->getFalseValue();

    Value *NFalse = IRB.CreateGEP(Ty, False, Index,
                                  False->getName() + ".sroa.gep", IsInBounds);

    Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
                                   Sel->getName() + ".sroa.sel");
    Visited.erase(&GEPI);
    GEPI.replaceAllUsesWith(NSel);
    GEPI.eraseFromParent();
    Instruction *NSelI = cast<Instruction>(NSel);
    Visited.insert(NSelI);
    enqueueUsers(*NSelI);

    LLVM_DEBUG(dbgs() << "\n          to: " << *NTrue
                      << "\n              " << *NFalse
                      << "\n              " << *NSel << '\n');

    return true;
  }
  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
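  //
  // Analogously to the select case above, a GEP is materialized next to each
  // incoming value (when its block is legal to hoist into) and the original
  // GEP is replaced by a PHI of those per-predecessor GEPs.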
  bool foldGEPPhi(GetElementPtrInst &GEPI) {
    if (!GEPI.hasAllConstantIndices())
      return false;

    PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
    if (GEPI.getParent() != PHI->getParent() ||
        llvm::any_of(PHI->incoming_values(), [](Value *In)
          { Instruction *I = dyn_cast<Instruction>(In);
            return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
                   succ_empty(I->getParent()) ||
                   !I->getParent()->isLegalToHoistInto();
          }))
      return false;

    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):"
                      << "\n    original: " << *PHI
                      << "\n              " << GEPI
                      << "\n          to: ");

    SmallVector<Value *, 4> Index(GEPI.indices());
    bool IsInBounds = GEPI.isInBounds();
    IRB.SetInsertPoint(GEPI.getParent(), GEPI.getParent()->getFirstNonPHIIt());
    PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
                                   PHI->getName() + ".sroa.phi");
    for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
      BasicBlock *B = PHI->getIncomingBlock(I);
      Value *NewVal = nullptr;
      int Idx = NewPN->getBasicBlockIndex(B);
      if (Idx >= 0) {
        NewVal = NewPN->getIncomingValue(Idx);
      } else {
        Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));

        IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
        Type *Ty = GEPI.getSourceElementType();
        NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep",
                               IsInBounds);
      }
      NewPN->addIncoming(NewVal, B);
    }

    Visited.erase(&GEPI);
    GEPI.replaceAllUsesWith(NewPN);
    GEPI.eraseFromParent();
    Visited.insert(NewPN);
    enqueueUsers(*NewPN);

    LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
                 dbgs() << "\n              " << *In;
               dbgs() << "\n              " << *NewPN << '\n');

    return true;
  }
  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (isa<SelectInst>(GEPI.getPointerOperand()) &&
        foldGEPSelect(GEPI))
      return true;

    if (isa<PHINode>(GEPI.getPointerOperand()) &&
        foldGEPPhi(GEPI))
      return true;

    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};

} // end anonymous namespace
/// Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
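///
/// For example, { [1 x i32] } is stripped all the way down to i32, since each
/// wrapper has the same size and alloc size as the inner type.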
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
      TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}
/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset cause either end of a type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
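///
/// For example, asking for offset 4 and size 8 within { i32, i32, i32, i32 }
/// yields the sub-struct type { i32, i32 }, while offset 2 and size 4 yields
/// nullptr because no element boundary matches.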
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
                              uint64_t Size) {
  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
    return stripAggregateTypeWrapping(DL, Ty);
  if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
      (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
    return nullptr;

  if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
    Type *ElementTy;
    uint64_t TyNumElements;
    if (auto *AT = dyn_cast<ArrayType>(Ty)) {
      ElementTy = AT->getElementType();
      TyNumElements = AT->getNumElements();
    } else {
      // FIXME: This isn't right for vectors with non-byte-sized or
      // non-power-of-two sized elements.
      auto *VT = cast<FixedVectorType>(Ty);
      ElementTy = VT->getElementType();
      TyNumElements = VT->getNumElements();
    }
    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (NumSkippedElements >= TyNumElements)
      return nullptr;
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return nullptr;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(DL, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(DL, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return nullptr;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);

  if (SL->getSizeInBits().isScalable())
    return nullptr;

  if (Offset >= SL->getSizeInBytes())
    return nullptr;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return nullptr;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
  if (Offset >= ElementSize)
    return nullptr; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return nullptr;
    return getTypePartition(DL, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(DL, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return nullptr; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return nullptr;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy =
      StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
  const StructLayout *SubSL = DL.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return nullptr; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}
/// Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
/// start rewriting the accesses to partitions of the alloca we lose the
/// necessary information to correctly split apart paired loads and stores
/// which both point into this alloca. The case to consider is something like
/// the following:
///
///   %a = alloca [12 x i8]
///   %gep1 = getelementptr i8, ptr %a, i32 0
///   %gep2 = getelementptr i8, ptr %a, i32 4
///   %gep3 = getelementptr i8, ptr %a, i32 8
///   store float 0.0, ptr %gep1
///   store float 1.0, ptr %gep2
///   %v = load i64, ptr %gep1
///   store i64 %v, ptr %gep2
///   %f1 = load float, ptr %gep2
///   %f2 = load float, ptr %gep3
///
/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
/// promote everything so we recover the 2 SSA values that should have been
/// there all along.
///
/// \returns true if any changes are made.
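///
/// In the example above, the paired i64 load and store straddle the 4-byte
/// partitions, so they are pre-split into two i32-wide accesses each; after
/// that, every partition is covered only by 4-byte accesses and can be
/// promoted.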
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector<LoadInst *, 4> Loads;
  SmallVector<StoreInst *, 4> Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector<uint64_t> Splits;
  };
  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;

  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;

  LLVM_DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
  for (auto &P : AS.partitions()) {
    for (Slice &S : P) {
      Instruction *I = cast<Instruction>(S.getUse()->getUser());
      if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
        // If this is a load we have to track that it can't participate in any
        // pre-splitting. If this is a store of a load we have to track that
        // that load also can't participate in any pre-splitting.
        if (auto *LI = dyn_cast<LoadInst>(I))
          UnsplittableLoads.insert(LI);
        else if (auto *SI = dyn_cast<StoreInst>(I))
          if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
            UnsplittableLoads.insert(LI);
        continue;
      }
      assert(P.endOffset() > S.beginOffset() &&
             "Empty or backwards partition!");

      // Determine if this is a pre-splittable slice.
      if (auto *LI = dyn_cast<LoadInst>(I)) {
        assert(!LI->isVolatile() && "Cannot split volatile loads!");

        // The load must be used exclusively to store into other pointers for
        // us to be able to arbitrarily pre-split it. The stores must also be
        // simple to avoid changing semantics.
        auto IsLoadSimplyStored = [](LoadInst *LI) {
          for (User *LU : LI->users()) {
            auto *SI = dyn_cast<StoreInst>(LU);
            if (!SI || !SI->isSimple())
              return false;
          }
          return true;
        };
        if (!IsLoadSimplyStored(LI)) {
          UnsplittableLoads.insert(LI);
          continue;
        }

        Loads.push_back(LI);
      } else if (auto *SI = dyn_cast<StoreInst>(I)) {
        if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
          // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
          continue;
        auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
        if (!StoredLoad || !StoredLoad->isSimple())
          continue;
        assert(!SI->isVolatile() && "Cannot split volatile stores!");

        Stores.push_back(SI);
      } else {
        // Other uses cannot be pre-split.
        continue;
      }

      // Record the initial split.
      LLVM_DEBUG(dbgs() << "    Candidate: " << *I << "\n");
      auto &Offsets = SplitOffsetsMap[I];
      assert(Offsets.Splits.empty() &&
             "Should not have splits the first time we see an instruction!");
      Offsets.S = &S;
      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
    }

    // Now scan the already split slices, and add a split for any of them which
    // we're going to pre-split.
    for (Slice *S : P.splitSliceTails()) {
      auto SplitOffsetsMapI =
          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
      if (SplitOffsetsMapI == SplitOffsetsMap.end())
        continue;
      auto &Offsets = SplitOffsetsMapI->second;

      assert(Offsets.S == S && "Found a mismatched slice!");
      assert(!Offsets.Splits.empty() &&
             "Cannot have an empty set of splits on the second partition!");
      assert(Offsets.Splits.back() ==
                 P.beginOffset() - Offsets.S->beginOffset() &&
             "Previous split does not end where this one begins!");

      // Record each split. The last partition's end isn't needed as the size
      // of the slice dictates that.
      if (S->endOffset() > P.endOffset())
        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
    }
  }

  // We may have split loads where some of their stores are split stores. For
  // such loads and stores, we can only pre-split them if their splits exactly
  // match relative to their starting offset. We have to verify this prior to
  // any rewriting.
  llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
    // Lookup the load we are storing in our map of split
    // offsets.
    auto *LI = cast<LoadInst>(SI->getValueOperand());
    // If it was completely unsplittable, then we're done,
    // and this store can't be pre-split.
    if (UnsplittableLoads.count(LI))
      return true;

    auto LoadOffsetsI = SplitOffsetsMap.find(LI);
    if (LoadOffsetsI == SplitOffsetsMap.end())
      return false; // Unrelated loads are definitely safe.
    auto &LoadOffsets = LoadOffsetsI->second;

    // Now lookup the store's offsets.
    auto &StoreOffsets = SplitOffsetsMap[SI];

    // If the relative offsets of each split in the load and
    // store match exactly, then we can split them and we
    // don't need to remove them here.
    if (LoadOffsets.Splits == StoreOffsets.Splits)
      return false;

    LLVM_DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
                      << "      " << *LI << "\n"
                      << "      " << *SI << "\n");

    // We've found a store and load that we need to split
    // with mismatched relative splits. Just give up on them
    // and remove both instructions from our list of
    // candidates.
    UnsplittableLoads.insert(LI);
    return true;
  });
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
    auto *LI = cast<LoadInst>(SI->getValueOperand());
    return UnsplittableLoads.count(LI);
  });
  // Once we've established all the loads that can't be split for some reason,
  // filter any that made it into our list out.
  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
    return UnsplittableLoads.count(LI);
  });

  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;

  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector<Slice, 4> NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;

  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order in order to handle the case when one of the
  // loads is the value operand of one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
  std::vector<LoadInst *> SplitLoads;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  for (LoadInst *LI : Loads) {
    SplitLoads.clear();

    auto &Offsets = SplitOffsetsMap[LI];
    unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
    assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
           "Load must have type size equal to store size");
    assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
           "Load must be >= slice size");

    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + SliceSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
    IRB.SetInsertPoint(LI);

    LLVM_DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");

    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
      auto AS = LI->getPointerAddressSpace();
      auto *PartPtrTy = LI->getPointerOperandType();
      LoadInst *PLoad = IRB.CreateAlignedLoad(
          PartTy,
          getAdjustedPtr(IRB, DL, BasePtr,
                         APInt(DL.getIndexSizeInBits(AS), PartOffset),
                         PartPtrTy, BasePtr->getName() + "."),
          getAdjustedAlignment(LI, PartOffset),
          /*IsVolatile*/ false, LI->getName());
      PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
                                LLVMContext::MD_access_group});

      // Append this load onto the list of split loads so we can find it later
      // to rewrite the stores.
      SplitLoads.push_back(PLoad);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                        << ", " << NewSlices.back().endOffset()
                        << "): " << *PLoad << "\n");

      // See if we've handled all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
    }

    // Now that we have the split loads, do the slow walk over all uses of the
    // load and rewrite them as split stores, or save the split loads to use
    // below if the store is going to be split there anyways.
    bool DeferredStores = false;
    for (User *LU : LI->users()) {
      StoreInst *SI = cast<StoreInst>(LU);
      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
        DeferredStores = true;
        LLVM_DEBUG(dbgs() << "    Deferred splitting of store: " << *SI
                          << "\n");
        continue;
      }

      Value *StoreBasePtr = SI->getPointerOperand();
      IRB.SetInsertPoint(SI);

      LLVM_DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");

      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
        LoadInst *PLoad = SplitLoads[Idx];
        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
        auto *PartPtrTy = SI->getPointerOperandType();

        auto AS = SI->getPointerAddressSpace();
        StoreInst *PStore = IRB.CreateAlignedStore(
            PLoad,
            getAdjustedPtr(IRB, DL, StoreBasePtr,
                           APInt(DL.getIndexSizeInBits(AS), PartOffset),
                           PartPtrTy, StoreBasePtr->getName() + "."),
            getAdjustedAlignment(SI, PartOffset),
            /*IsVolatile*/ false);
        PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
                                   LLVMContext::MD_access_group,
                                   LLVMContext::MD_DIAssignID});
        LLVM_DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
      }

      // We want to immediately iterate on any allocas impacted by splitting
      // this store, and we have to track any promotable alloca (indicated by
      // a direct store) as needing to be resplit because it is no longer
      // promotable.
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     StoreBasePtr->stripInBoundsOffsets())) {
        Worklist.insert(OtherAI);
      }

      // Mark the original store as dead.
      DeadInsts.push_back(SI);
    }

    // Save the split loads if there are deferred stores among the users.
    if (DeferredStores)
      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));

    // Mark the original load as dead and kill the original slice.
    DeadInsts.push_back(LI);
    Offsets.S->kill();
  }

4556 // Second, we rewrite all of the split stores. At this point, we know that
4557 // all loads from this alloca have been split already. For stores of such
4558 // loads, we can simply look up the pre-existing split loads. For stores of
4559 // other loads, we split those loads first and then write split stores of
4561 for (StoreInst
*SI
: Stores
) {
4562 auto *LI
= cast
<LoadInst
>(SI
->getValueOperand());
4563 IntegerType
*Ty
= cast
<IntegerType
>(LI
->getType());
4564 assert(Ty
->getBitWidth() % 8 == 0);
4565 uint64_t StoreSize
= Ty
->getBitWidth() / 8;
4566 assert(StoreSize
> 0 && "Cannot have a zero-sized integer store!");
4568 auto &Offsets
= SplitOffsetsMap
[SI
];
4569 assert(StoreSize
== Offsets
.S
->endOffset() - Offsets
.S
->beginOffset() &&
4570 "Slice size should always match load size exactly!");
4571 uint64_t BaseOffset
= Offsets
.S
->beginOffset();
4572 assert(BaseOffset
+ StoreSize
> BaseOffset
&&
4573 "Cannot represent alloca access size using 64-bit integers!");
4575 Value
*LoadBasePtr
= LI
->getPointerOperand();
4576 Instruction
*StoreBasePtr
= cast
<Instruction
>(SI
->getPointerOperand());
4578 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI
<< "\n");
4580 // Check whether we have an already split load.
4581 auto SplitLoadsMapI
= SplitLoadsMap
.find(LI
);
4582 std::vector
<LoadInst
*> *SplitLoads
= nullptr;
4583 if (SplitLoadsMapI
!= SplitLoadsMap
.end()) {
4584 SplitLoads
= &SplitLoadsMapI
->second
;
4585 assert(SplitLoads
->size() == Offsets
.Splits
.size() + 1 &&
4586 "Too few split loads for the number of splits in the store!");
4588 LLVM_DEBUG(dbgs() << " of load: " << *LI
<< "\n");
4591 uint64_t PartOffset
= 0, PartSize
= Offsets
.Splits
.front();
4592 int Idx
= 0, Size
= Offsets
.Splits
.size();
4594 auto *PartTy
= Type::getIntNTy(Ty
->getContext(), PartSize
* 8);
4595 auto *LoadPartPtrTy
= LI
->getPointerOperandType();
4596 auto *StorePartPtrTy
= SI
->getPointerOperandType();
4598 // Either lookup a split load or create one.
4601 PLoad
= (*SplitLoads
)[Idx
];
4603 IRB
.SetInsertPoint(LI
);
4604 auto AS
= LI
->getPointerAddressSpace();
4605 PLoad
= IRB
.CreateAlignedLoad(
4607 getAdjustedPtr(IRB
, DL
, LoadBasePtr
,
4608 APInt(DL
.getIndexSizeInBits(AS
), PartOffset
),
4609 LoadPartPtrTy
, LoadBasePtr
->getName() + "."),
4610 getAdjustedAlignment(LI
, PartOffset
),
4611 /*IsVolatile*/ false, LI
->getName());
4612 PLoad
->copyMetadata(*LI
, {LLVMContext::MD_mem_parallel_loop_access
,
4613 LLVMContext::MD_access_group
});
4616 // And store this partition.
4617 IRB
.SetInsertPoint(SI
);
4618 auto AS
= SI
->getPointerAddressSpace();
4619 StoreInst
*PStore
= IRB
.CreateAlignedStore(
4621 getAdjustedPtr(IRB
, DL
, StoreBasePtr
,
4622 APInt(DL
.getIndexSizeInBits(AS
), PartOffset
),
4623 StorePartPtrTy
, StoreBasePtr
->getName() + "."),
4624 getAdjustedAlignment(SI
, PartOffset
),
4625 /*IsVolatile*/ false);
4626 PStore
->copyMetadata(*SI
, {LLVMContext::MD_mem_parallel_loop_access
,
4627 LLVMContext::MD_access_group
});
      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                        << ", " << NewSlices.back().endOffset()
                        << "): " << *PStore << "\n");
      if (!SplitLoads)
        LLVM_DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");

      // See if we've finished all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
    }

    // We want to immediately iterate on any allocas impacted by splitting
    // this load, which is only relevant if it isn't a load of this alloca and
    // thus we didn't already split the loads above. We also have to keep track
    // of any promotable allocas we split loads on as they can no longer be
    // promoted.
    if (!SplitLoads) {
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     LoadBasePtr->stripInBoundsOffsets())) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        Worklist.insert(OtherAI);
      }
    }

    // Mark the original store as dead now that we've split it up and kill its
    // slice. Note that we leave the original load in place unless this store
    // was its only use. It may in turn be split up if it is an alloca load
    // for some other alloca, but it may be a normal load. This may introduce
    // redundant loads, but where those can be merged the rest of the optimizer
    // should handle the merging, and this uncovers SSA splits which is more
    // important. In practice, the original loads will almost always be fully
    // split and removed eventually, and the splits will be merged by any
    // trivial CSE, including instcombine.
    if (LI->hasOneUse()) {
      assert(*LI->user_begin() == SI && "Single use isn't this store!");
      DeadInsts.push_back(LI);
    }
    DeadInsts.push_back(SI);
    Offsets.S->kill();
  }

  // Remove the killed slices that have been pre-split.
  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
#ifndef NDEBUG
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    LLVM_DEBUG(AS.print(dbgs(), I, "  "));
#endif

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
    return ResplitPromotableAllocas.count(AI);
  });

  return true;
}

/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and, if it was successful, queues the alloca to be
/// promoted.
AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                   Partition &P) {
  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *SliceTy = nullptr;
  VectorType *SliceVecTy = nullptr;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  std::pair<Type *, IntegerType *> CommonUseTy =
      findCommonType(P.begin(), P.end(), P.endOffset());
  // Do all uses operate on the same type?
  if (CommonUseTy.first)
    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
      SliceTy = CommonUseTy.first;
      SliceVecTy = dyn_cast<VectorType>(SliceTy);
    }

  // If not, can we find an appropriate subtype in the original allocated type?
  if (!SliceTy)
    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size()))
      SliceTy = TypePartitionTy;

  // If still not, can we use the largest bitwidth integer type used?
  if (!SliceTy && CommonUseTy.second)
    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
      SliceTy = CommonUseTy.second;
      SliceVecTy = dyn_cast<VectorType>(SliceTy);
    }
  if ((!SliceTy || (SliceTy->isArrayTy() &&
                    SliceTy->getArrayElementType()->isIntegerTy())) &&
      DL.isLegalInteger(P.size() * 8)) {
    SliceTy = Type::getIntNTy(*C, P.size() * 8);
  }
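
  // For example, on a target where i32 is a legal integer type, a 4-byte
  // partition that found no better common type above ends up with an i32
  // SliceTy here.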

  // If the common use types are not viable for promotion then attempt to find
  // another type that is viable.
  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size())) {
      VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
      if (TypePartitionVecTy &&
          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
        SliceTy = TypePartitionTy;
    }

  if (!SliceTy)
    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
  assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());

  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);

  VectorType *VecTy =
      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
  if (VecTy)
    SliceTy = VecTy;
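
  // Integer widening and vector promotion are mutually exclusive strategies
  // for a partition: vector viability is only computed once integer widening
  // has been ruled out.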

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  // P.beginOffset() can be non-zero even with the same type in a case with
  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
  AllocaInst *NewAI;
  if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
    NewAI = &AI;
    // FIXME: We should be able to bail at this point with "nothing changed".
    // FIXME: We might want to defer PHI speculation until after here.
    // FIXME: return nullptr;
  } else {
    // Make sure the alignment is compatible with P.beginOffset().
    const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
    NewAI = new AllocaInst(
        SliceTy, AI.getAddressSpace(), nullptr,
        IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
    // Copy the old AI debug location over to the new one.
    NewAI->setDebugLoc(AI.getDebugLoc());
    ++NumNewAllocas;
  }
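
  // The replacement alloca is named after the original with a ".sroa." suffix
  // and the index of the partition's first slice, so an alloca "%x" may, for
  // example, produce "%x.sroa.3".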

  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
                    << "[" << P.beginOffset() << "," << P.endOffset()
                    << ") to: " << *NewAI << "\n");

  // Track the high watermark on the worklist as it is only relevant for
  // promoted allocas. We will reset it to this point if the alloca is not in
  // fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();
  unsigned NumUses = 0;
  SmallSetVector<PHINode *, 8> PHIUsers;
  SmallSetVector<SelectInst *, 8> SelectUsers;

  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
                               P.endOffset(), IsIntegerPromotable, VecTy,
                               PHIUsers, SelectUsers);
  bool Promotable = true;
  for (Slice *S : P.splitSliceTails()) {
    Promotable &= Rewriter.visit(S);
    ++NumUses;
  }
  for (Slice &S : P) {
    Promotable &= Rewriter.visit(&S);
    ++NumUses;
  }

  NumAllocaPartitionUses += NumUses;
  MaxUsesPerAllocaPartition.updateMax(NumUses);

  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (PHINode *PHI : PHIUsers)
    if (!isSafePHIToSpeculate(*PHI)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
      NewSelectsToRewrite;
  NewSelectsToRewrite.reserve(SelectUsers.size());
  for (SelectInst *Sel : SelectUsers) {
    std::optional<RewriteableMemOps> Ops =
        isSafeSelectToSpeculate(*Sel, PreserveCFG);
    if (!Ops) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      NewSelectsToRewrite.clear();
      break;
    }
    NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
  }

  if (Promotable) {
    for (Use *U : AS.getDeadUsesIfPromotable()) {
      auto *OldInst = dyn_cast<Instruction>(U->get());
      Value::dropDroppableUse(*U);
      ++NumUses;
      if (isInstructionTriviallyDead(OldInst))
        DeadInsts.push_back(OldInst);
    }
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.push_back(NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration.
      for (PHINode *PHIUser : PHIUsers)
        SpeculatablePHIs.insert(PHIUser);
      SelectsToRewrite.reserve(SelectsToRewrite.size() +
                               NewSelectsToRewrite.size());
      for (auto &&KV : llvm::make_range(
               std::make_move_iterator(NewSelectsToRewrite.begin()),
               std::make_move_iterator(NewSelectsToRewrite.end())))
        SelectsToRewrite.insert(std::move(KV));
      Worklist.insert(NewAI);
    }
  } else {
    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

    // We couldn't promote and we didn't create a new partition, nothing
    // happened.
    if (NewAI == &AI)
      return nullptr;

    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(NewAI);
  }

  return NewAI;
}
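
// The insertNewDbgInst overloads below insert the debug record describing one
// new partition, matching the form of the original record: a dbg.declare
// intrinsic, a dbg.assign intrinsic, or a DPValue.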
static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig,
                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
                             Instruction *BeforeInst) {
  DIB.insertDeclare(NewAddr, Orig->getVariable(), NewFragmentExpr,
                    Orig->getDebugLoc(), BeforeInst);
}

static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
                             AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
                             Instruction *BeforeInst) {
  (void)BeforeInst;
  if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
    NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                         DIAssignID::getDistinct(NewAddr->getContext()));
  }
  auto *NewAssign = DIB.insertDbgAssign(
      NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
      Orig->getAddressExpression(), Orig->getDebugLoc());
  LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
  (void)NewAssign;
}

static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
                             DIExpression *NewFragmentExpr,
                             Instruction *BeforeInst) {
  (void)DIB;
  if (Orig->isDbgDeclare()) {
    DPValue *DPV = DPValue::createDPVDeclare(
        NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
    BeforeInst->getParent()->insertDPValueBefore(DPV,
                                                 BeforeInst->getIterator());
    return;
  }
  if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
    NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                         DIAssignID::getDistinct(NewAddr->getContext()));
  }
  auto *NewAssign = DPValue::createLinkedDPVAssign(
      NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
      Orig->getAddressExpression(), Orig->getDebugLoc());
  LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
  (void)NewAssign;
}

/// Walks the slices of an alloca and forms partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities,
  // mark loads and stores unsplittable except for the following case.
  // We leave a slice splittable if all other slices are disjoint or fully
  // included in the slice, such as whole-alloca loads and stores.
  // If we fail to split these during pre-splitting, we want to force them
  // to be rewritten into a partition.
  bool IsSorted = true;

  uint64_t AllocaSize =
      DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
  const uint64_t MaxBitVectorSize = 1024;
  if (AllocaSize <= MaxBitVectorSize) {
    // If a byte boundary is included in any load or store, a slice starting or
    // ending at the boundary is not splittable.
    SmallBitVector SplittableOffset(AllocaSize + 1, true);
    for (Slice &S : AS)
      for (unsigned O = S.beginOffset() + 1;
           O < S.endOffset() && O < AllocaSize; O++)
        SplittableOffset.reset(O);
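
    // For example, a load covering bytes [4,8) clears offsets 5, 6 and 7, so
    // any other load or store slice that begins or ends at one of those
    // interior offsets is marked unsplittable below.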
    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
        continue;

      if (isa<LoadInst>(S.getUse()->getUser()) ||
          isa<StoreInst>(S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  } else {
    // We only allow whole-alloca splittable loads and stores
    // for a large alloca to avoid creating too large a BitVector.
    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
        continue;

      if (isa<LoadInst>(S.getUse()->getUser()) ||
          isa<StoreInst>(S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  }

  if (!IsSorted)
    llvm::sort(AS);

  /// Describes the allocas introduced by rewritePartition in order to migrate
  /// the debug info.
  struct Fragment {
    AllocaInst *Alloca;
    uint64_t Offset;
    uint64_t Size;
    Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
        : Alloca(AI), Offset(O), Size(S) {}
  };
  SmallVector<Fragment, 4> Fragments;

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
      Changed = true;
      if (NewAI != &AI) {
        uint64_t SizeOfByte = 8;
        uint64_t AllocaSize =
            DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
        // Don't include any padding.
        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
        Fragments.push_back(
            Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
      }
    }
    ++NumPartitions;
  }

  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca.updateMax(NumPartitions);
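
  // For example, splitting an alloca backing a 64-bit variable into two 32-bit
  // partitions records two Fragments at bit offsets 0 and 32, each 32 bits
  // wide (assuming no padding); the lambda below then emits one debug record
  // per Fragment.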

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  auto MigrateOne = [&](auto *DbgVariable) {
    auto *Expr = DbgVariable->getExpression();
    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
    uint64_t AllocaSize =
        DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue();
    for (auto Fragment : Fragments) {
      // Create a fragment expression describing the new partition or reuse AI's
      // expression if there is only one partition.
      auto *FragmentExpr = Expr;
      if (Fragment.Size < AllocaSize || Expr->isFragment()) {
        // If this alloca is already a scalar replacement of a larger aggregate,
        // Fragment.Offset describes the offset inside the scalar.
        auto ExprFragment = Expr->getFragmentInfo();
        uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
        uint64_t Start = Offset + Fragment.Offset;
        uint64_t Size = Fragment.Size;
        if (ExprFragment) {
          uint64_t AbsEnd =
              ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
          if (Start >= AbsEnd) {
            // No need to describe a SROAed padding.
            continue;
          }
          Size = std::min(Size, AbsEnd - Start);
        }
        // The new, smaller fragment is stenciled out from the old fragment.
        if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
          assert(Start >= OrigFragment->OffsetInBits &&
                 "new fragment is outside of original fragment");
          Start -= OrigFragment->OffsetInBits;
        }

        // The alloca may be larger than the variable.
        auto VarSize = DbgVariable->getVariable()->getSizeInBits();
        if (VarSize) {
          if (Size > *VarSize)
            Size = *VarSize;
          if (Size == 0 || Start + Size > *VarSize)
            continue;
        }

        // Avoid creating a fragment expression that covers the entire variable.
        if (!VarSize || *VarSize != Size) {
          if (auto E =
                  DIExpression::createFragmentExpression(Expr, Start, Size))
            FragmentExpr = *E;
          else
            continue;
        }
      }

      // Remove any existing intrinsics on the new alloca describing
      // the variable fragment.
      auto RemoveOne = [DbgVariable](auto *OldDII) {
        auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
          return LHS->getVariable() == RHS->getVariable() &&
                 LHS->getDebugLoc()->getInlinedAt() ==
                     RHS->getDebugLoc()->getInlinedAt();
        };
        if (SameVariableFragment(OldDII, DbgVariable))
          OldDII->eraseFromParent();
      };
      for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDPVDeclares(Fragment.Alloca), RemoveOne);

      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
    }
  };

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  for_each(findDbgDeclares(&AI), MigrateOne);
  for_each(findDPVDeclares(&AI), MigrateOne);
  for_each(at::getAssignmentMarkers(&AI), MigrateOne);
  for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne);

  return Changed;
}

/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;

  // Replace the use with a poison value.
  U = PoisonValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.push_back(OldI);
    }
}

/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
std::pair<bool /*Changed*/, bool /*CFGChanged*/>
SROA::runOnAlloca(AllocaInst &AI) {
  bool Changed = false;
  bool CFGChanged = false;

  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    Changed = true;
    return {Changed, CFGChanged};
  }
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  auto *AT = AI.getAllocatedType();
  TypeSize Size = DL.getTypeAllocSize(AT);
  if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
      Size.getFixedValue() == 0)
    return {Changed, CFGChanged};

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  IRBuilderTy IRB(&AI);
  AggLoadStoreRewriter AggRewriter(DL, IRB);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  LLVM_DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return {Changed, CFGChanged};

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.push_back(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return {Changed, CFGChanged};

  Changed |= splitAlloca(AI, AS);
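
  // rewritePartition only queues speculatable PHIs and rewriteable selects (in
  // SpeculatablePHIs and SelectsToRewrite); they are actually speculated and
  // rewritten here, once every partition of this alloca has been processed.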
  LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());

  LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
  while (!RemainingSelectsToRewrite.empty()) {
    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
    CFGChanged |=
        rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
  }

  return {Changed, CFGChanged};
}

/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
bool SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
    if (!I)
      continue;
    LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // If the instruction is an alloca, find the possible dbg.declare connected
    // to it, and remove it too. We must do this before calling RAUW or we will
    // not be able to find it.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      DeletedAllocas.insert(AI);
      for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
        OldDII->eraseFromParent();
      for (DPValue *OldDII : findDPVDeclares(AI))
        OldDII->eraseFromParent();
    }

    at::deleteAssignmentMarkers(I);
    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    ++NumDeleted;
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (SROASkipMem2Reg) {
    LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
  } else {
    LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC);
  }

  PromotableAllocas.clear();
  return true;
}

std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");

  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
          isAllocaPromotable(AI))
        PromotableAllocas.push_back(AI);
      else
        Worklist.insert(AI);
    }
  }

  bool Changed = false;
  bool CFGChanged = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
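
  // Iterate to a fixed point: promotion can expose follow-up work (tracked in
  // PostPromotionWorklist), which is fed back into Worklist until no allocas
  // remain to process.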
  do {
    while (!Worklist.empty()) {
      auto [IterationChanged, IterationCFGChanged] =
          runOnAlloca(*Worklist.pop_back_val());
      Changed |= IterationChanged;
      CFGChanged |= IterationCFGChanged;

      Changed |= deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
        Worklist.remove_if(IsInSet);
        PostPromotionWorklist.remove_if(IsInSet);
        llvm::erase_if(PromotableAllocas, IsInSet);
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
  assert((!CFGChanged || !PreserveCFG) &&
         "Should not have modified the CFG when told to preserve it.");

  if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F) {
      RemoveRedundantDbgInstrs(&BB);
    }
  }

  return {Changed, CFGChanged};
}

PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  auto [Changed, CFGChanged] =
      SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  if (!CFGChanged)
    PA.preserveSet<CFGAnalyses>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

void SROAPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
      OS, MapClassName2PassName);
  OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
                                                 : "<modify-cfg>");
}

SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}

namespace {

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
class SROALegacyPass : public FunctionPass {
  SROAOptions PreserveCFG;

public:
  static char ID;

  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    auto [Changed, _] =
        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  StringRef getPassName() const override { return "SROA"; }
};

} // end anonymous namespace

char SROALegacyPass::ID = 0;

FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
  return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
                                        : SROAOptions::ModifyCFG);
}

INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)