//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases that this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, memmove, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;
#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling"
             " with -Os/-Oz"),
    cl::init(true), cl::Hidden);
namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               unsigned StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool NegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  /// @}
};
class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();

  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
        "at a higher level");

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
                         *ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}
bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable"
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  bool MadeChange = false;

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return MadeChange;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}
/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
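///
/// For example (illustrative): a strided store of the i32 constant 0x01020304
/// is not byte-splattable, so it cannot become a plain memset, but replicating
/// the constant 16/4 = 4 times produces the 16-byte pattern this function
/// returns for memset_pattern16.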
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Don't convert stores of non-integral pointer types to memsets (which
  // stores integers).
  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Reject stores that are so large that they overflow an unsigned.
  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  } else if (!UnorderedAtomic && HasMemsetPattern &&
             // Don't create memset_pattern16s with address spaces.
             StorePtr->getType()->getPointerAddressSpace() == 0 &&
             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}
/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
    if (!DT->dominates(BB, ExitBlocks[i]))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memset instructions, which may be optimized to a larger memset.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!processLoopMemSet(MSI, BECount))
        continue;
      MadeChange = true;

      // If processing the memset invalidated our iterator, start over from the
      // top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }

  return MadeChange;
}

/// See if this set of stores can be promoted to a memset.
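///
/// For example (illustrative): a hand-unrolled loop such as
///   for (i = 0; i != n; i += 2) { A[i] = 0; A[i+1] = 0; }
/// has two stores whose individual 8-byte strides do not match their 4-byte
/// store sizes, but the consecutive pair jointly covers every byte, so the
/// chain can still become a single memset.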
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;

    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
    return false;

  // Reject memsets that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check to see if the stride matches the size of the memset. If so, then we
  // know that every byte is touched in the loop.
  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  if (!ConstStride)
    return false;

  APInt Stride = ConstStride->getAPInt();
  if (SizeInBytes != Stride && SizeInBytes != -Stride)
    return false;

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  bool NegStride = SizeInBytes == -Stride;
  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
                                 MSI->getDestAlignment(), SplatValue, MSI, MSIs,
                                 Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, unsigned StoreSize,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredStores) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::unknown();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       StoreSize);

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. Store to &A[i] of 100 will always return may alias
  // with store of &A[100]; we need StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (Instruction &I : **BI)
      if (IgnoredStores.count(&I) == 0 &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;

  return false;
}
// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
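//
// For example (illustrative): a loop storing 4-byte elements downward from
// &A[n-1] has Start == &A[n-1]; with BECount == n-1 the lowest byte written
// is Start - (n-1)*4, i.e. &A[0], which is the base pointer a memset needs.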
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, unsigned StoreSize,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (StoreSize != 1)
    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
                           SCEV::FlagNUW);
  return SE->getMinusSCEV(Start, Index);
}
/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
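///
/// For example (illustrative): with a backedge-taken count of n-1 and 4-byte
/// stores, the result is ((n-1)+1)*4 = n*4 bytes, built in the pointer width
/// so the +1 and the multiply fold cleanly with the expanded trip count.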
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               unsigned StoreSize, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *NumBytesS;
  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    NumBytesS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                               SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  // And scale it based on the store size.
  if (StoreSize != 1) {
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                               SCEV::FlagNUW);
  }
  return NumBytesS;
}
/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
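///
/// For example (illustrative): "for (i = 0; i != n; ++i) p[i] = 0;" becomes a
/// single memset of n*4 bytes emitted in the preheader, provided nothing else
/// in the loop reads or writes the region.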
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);

  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (NegStride)
    Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(Start, *SE))
    return false;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
    return false;
  }

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return false;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return false;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall =
        Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
  } else {
    // Everything is emitted in default address space
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                                Int8PtrTy, Int8PtrTy, IntPtr);
    inferLibFuncAttributes(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(16);
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
                              NewCall->getDebugLoc(), Preheader)
           << "Transformed loop-strided store into a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() function";
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores)
    deleteDeadInstruction(I);
  ++NumMemSet;
  return true;
}
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  APInt Stride = getStoreStride(StoreEv);
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  bool NegStride = StoreSize == -Stride;

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  const SCEVAddRecExpr *LoadEv =
      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = SI->getPointerAddressSpace();
  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);

  // Handle negative strided loops.
  if (NegStride)
    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  SmallPtrSet<Instruction *, 1> Stores;
  Stores.insert(SI);
  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
    return false;
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = LI->getPointerAddressSpace();

  // Handle negative strided loops.
  if (NegStride)
    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
    return false;
  }

  if (avoidLIRForMultiBlockLoop())
    return false;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!SI->isAtomic() && !LI->isAtomic())
    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
                                   LoadBasePtr, LI->getAlignment(), NumBytes);
  else {
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
    if (Align < StoreSize)
      return false;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call. If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return false;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
        NumBytes, StoreSize);
  }
  NewCall->setDebugLoc(SI->getDebugLoc());

  LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *SI
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() function";
  });

  // Okay, the memcpy has been formed. Zap the original store and anything that
  // feeds into it.
  deleteDeadInstruction(SI);
  ++NumMemCpy;
  return true;
}
// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS();
}
/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
/// true), the control yields to the loop entry. If the branch matches the
/// behavior, the variable involved in the comparison is returned. This function
/// will be called to see if the precondition and postcondition of the loop are
/// in desirable form.
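///
/// For example (illustrative): given "br i1 (icmp ne i32 %x, 0), label %loop,
/// label %exit" and LoopEntry == %loop, this returns %x; with JmpOnZero set,
/// the two successors are considered swapped.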
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}
// Check if the recurrence variable `VarX` is in the right form to create
// the idiom. Returns the value coerced to a PHINode if so.
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
                                 BasicBlock *LoopEntry) {
  auto *PhiX = dyn_cast<PHINode>(VarX);
  if (PhiX && PhiX->getParent() == LoopEntry &&
      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
    return PhiX;
  return nullptr;
}
/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the population bit.
/// 2) \p CntPhi is set to the corresponding phi node.
/// 3) \p Var is set to the value whose population bits are being counted.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x1 = phi (x0, x2);
///       cnt1 = phi(cnt0, cnt2);
///
///       cnt2 = cnt1 + 1;
///        ...
///       x2 = x1 & (x1 - 1);
///        ...
///    } while(x != 0);
///
/// loop-exit:
/// \endcode
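///
/// For example (illustrative): for x0 = 0b1011 the body runs three times
/// (x: 1011 -> 1010 -> 1000 -> 0), so cnt ends as init-val + popcount(x0).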
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
  // step 1: Check to see if the look-back branch matches this pattern:
  //    "if (a!=0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  {
    if (Value *T = matchCondition(
            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;

    BinaryOperator *SubOneOp;

    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
      return false;

    ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
    if (!Dec ||
        !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubOneOp->getOpcode() == Instruction::Add &&
           Dec->isMinusOne()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
  {
    CountInst = nullptr;
    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                              IterE = LoopEntry->end();
         Iter != IterE; Iter++) {
      Instruction *Inst = &*Iter;
      if (Inst->getOpcode() != Instruction::Add)
        continue;

      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;

      PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
      if (!Phi)
        continue;

      // Check if the result of the instruction is live out of the loop.
      bool LiveOutLoop = false;
      for (User *U : Inst->users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true;
          break;
        }
      }

      if (LiveOutLoop) {
        CountInst = Inst;
        CountPhi = Phi;
        break;
      }
    }

    if (!CountInst)
      return false;
  }

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  {
    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;

    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }

  return true;
}
/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
///       or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
///       or nullptr if there is no such.
/// 3) \p Var is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating the loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x = phi (x0, x.next);   //PhiX
///       cnt = phi(cnt0, cnt.next);
///
///       cnt.next = cnt + 1;
///        ...
///       x.next = x >> 1;   // DefX
///        ...
///    } while(x.next != 0);
///
/// loop-exit:
/// \endcode
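///
/// For example (illustrative): for a 32-bit x0 != 0 with x.next = x >> 1, the
/// body executes 32 - ctlz(x0) times, which is exactly the countable trip
/// count that transformLoopToCountable materializes below.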
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
                                      Intrinsic::ID &IntrinID, Value *&InitX,
                                      Instruction *&CntInst, PHINode *&CntPhi,
                                      Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;

  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  if (!DefX || !DefX->isShift())
    return false;
  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
                                                     Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);

  // step 3: Check the recurrence of variable X
  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;

  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());

  // Make sure the initial value can't be negative otherwise the ashr in the
  // loop might never reach zero which would make the loop infinite.
  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
    return false;

  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  // TODO: We can skip the step. If loop trip count is known (CTLZ),
  //       then all uses of "cnt.next" could be optimized to the trip count
  //       plus "cnt0". Currently it is not optimized.
  //       This step could be used to detect POPCNT instruction:
  //       cnt.next = cnt + (x.next & 1)
  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                            IterE = LoopEntry->end();
       Iter != IterE; Iter++) {
    Instruction *Inst = &*Iter;
    if (Inst->getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
    if (!Inc || !Inc->isOne())
      continue;

    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ is inserted as a
/// new trip count, returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;
  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
  // this is always 6.
  size_t IdiomCanonicalSize = 6;

  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
                                 CntInst, CntPhi, DefX))
    return false;

  bool IsCntPhiUsedOutsideLoop = false;
  for (User *U : CntPhi->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntPhiUsedOutsideLoop = true;
      break;
    }
  bool IsCntInstUsedOutsideLoop = false;
  for (User *U : CntInst->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntInstUsedOutsideLoop = true;
      break;
    }
  // If both CntInst and CntPhi are used outside the loop the profitability
  // is questionable.
  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
    return false;

  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
  // when X is 0. If we can not guarantee X != 0, we need to check this
  // when expanding the intrinsic.
  bool ZeroCheck = false;
  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function runOnLoop.
  BasicBlock *PH = CurLoop->getLoopPreheader();

  // If we are using the count instruction outside the loop, make sure we
  // have a zero check as a precondition. Without the check the loop would run
  // one iteration before any check of the input value. This means 0 and 1
  // would have identical behavior in the original loop, so a zero check is
  // required to distinguish them.
  if (!IsCntPhiUsedOutsideLoop) {
    auto *PreCondBB = PH->getSinglePredecessor();
    if (!PreCondBB)
      return false;
    auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    if (!PreCondBI)
      return false;
    if (matchCondition(PreCondBI, PH) != InitX)
      return false;
    ZeroCheck = true;
  }

  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
  // profitable if we delete the loop.

  // the loop has only 6 instructions:
  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  //  %shr = ashr %n.addr.0, 1
  //  %tobool = icmp eq %shr, 0
  //  %inc = add nsw %i.0, 1
  //  br i1 %tobool

  const Value *Args[] =
      {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
                        : ConstantInt::getFalse(InitX->getContext())};

  // @llvm.dbg doesn't count as they have no semantic effect.
  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  uint32_t HeaderSize =
      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());

  if (HeaderSize != IdiomCanonicalSize &&
      TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
          TargetTransformInfo::TCC_Basic)
    return false;

  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                           DefX->getDebugLoc(), ZeroCheck,
                           IsCntPhiUsedOutsideLoop);
  return true;
}
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
/// function call, and returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizePopcount() {
  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false;

  // Counting population is usually done with a few arithmetic instructions.
  // Such instructions can be easily "absorbed" by vacant slots in a
  // non-compact loop. Therefore, recognizing the popcount idiom only makes
  // sense in a compact loop.

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  BasicBlock *LoopBody = *(CurLoop->block_begin());
  if (LoopBody->size() >= 20) {
    // The loop is too big, bail out.
    return false;
  }

  // It should have a preheader containing nothing but an unconditional branch.
  BasicBlock *PH = CurLoop->getLoopPreheader();
  if (!PH || &PH->front() != PH->getTerminator())
    return false;
  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  if (!EntryBI || EntryBI->isConditional())
    return false;

  // It should have a precondition block where the generated popcount intrinsic
  // function can be inserted.
  auto *PreCondBB = PH->getSinglePredecessor();
  if (!PreCondBB)
    return false;
  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  if (!PreCondBI || PreCondBI->isUnconditional())
    return false;

  Instruction *CntInst;
  PHINode *CntPhi;
  Value *Val;
  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
    return false;

  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  return true;
}
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                       const DebugLoc &DL) {
  Value *Ops[] = {Val};
  Type *Tys[] = {Val->getType()};

  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);

  return CI;
}
*createFFSIntrinsic(IRBuilder
<> &IRBuilder
, Value
*Val
,
1586 const DebugLoc
&DL
, bool ZeroCheck
,
1587 Intrinsic::ID IID
) {
1588 Value
*Ops
[] = {Val
, ZeroCheck
? IRBuilder
.getTrue() : IRBuilder
.getFalse()};
1589 Type
*Tys
[] = {Val
->getType()};
1591 Module
*M
= IRBuilder
.GetInsertBlock()->getParent()->getParent();
1592 Function
*Func
= Intrinsic::getDeclaration(M
, IID
, Tys
);
1593 CallInst
*CI
= IRBuilder
.CreateCall(Func
, Ops
);
1594 CI
->setDebugLoc(DL
);
/// Transform the following loop (Using CTLZ, CTTZ is similar):
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   LOOP_BODY
///   Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
/// Into:
/// If CntPhi used outside the loop:
///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
///   Count = CountPrev + 1
/// else:
///   Count = BitWidth(InitX) - CTLZ(InitX)
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   PhiCount = PHI [Count, Dec]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   Dec = PhiCount - 1
///   LOOP_BODY
///   Br: loop if (Dec != 0)
/// Use(CountPrev + Cnt0) // Use(CntPhi)
/// or
/// Use(Count + Cnt0) // Use(CntInst)
///
/// If LOOP_BODY is empty the loop will be deleted.
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());

  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
  IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);
  Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;

  //   Count = BitWidth - CTLZ(InitX);
  // If there are uses of CntPhi create:
  //   CountPrev = BitWidth - CTLZ(InitX >> 1);
  if (IsCntPhiUsedOutsideLoop) {
    if (DefX->getOpcode() == Instruction::AShr)
      InitXNext =
          Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
    else if (DefX->getOpcode() == Instruction::LShr)
      InitXNext =
          Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
    else if (DefX->getOpcode() == Instruction::Shl) // cttz
      InitXNext =
          Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
    else
      llvm_unreachable("Unexpected opcode!");
  } else
    InitXNext = InitX;
  FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  Count = Builder.CreateSub(
      ConstantInt::get(FFS->getType(),
                       FFS->getType()->getIntegerBitWidth()),
      FFS);
  if (IsCntPhiUsedOutsideLoop) {
    CountPrev = Count;
    Count = Builder.CreateAdd(
        CountPrev,
        ConstantInt::get(CountPrev->getType(), 1));
  }

  NewCount = Builder.CreateZExtOrTrunc(
                      IsCntPhiUsedOutsideLoop ? CountPrev : Count,
                      cast<IntegerType>(CntInst->getType()));

  // If the counter's initial value is not zero, insert Add Inst.
  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  if (!InitConst || !InitConst->isZero())
    NewCount = Builder.CreateAdd(NewCount, CntInitVal);

  // Step 2: Insert new IV and loop condition:
  // loop:
  //   ...
  //   PhiCount = PHI [Count, Dec]
  //   ...
  //   Dec = PhiCount - 1
  //   ...
  //   Br: loop if (Dec != 0)
  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  Type *Ty = Count->getType();

  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(
      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                        "tcdec", false, true));

  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(Ty, 0));

  // Step 3: All the references to the original counter outside
  //  the loop are replaced with the NewCount
  if (IsCntPhiUsedOutsideLoop)
    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  else
    CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // step 4: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
                                                 Instruction *CntInst,
                                                 PHINode *CntPhi, Value *Var) {
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  const DebugLoc &DL = CntInst->getDebugLoc();

  // Assuming before transformation, the loop looks like the following:
  //  if (x) // the precondition
  //     do { cnt++; x &= x - 1; } while(x);

  // Step 1: Insert the ctpop instruction at the end of the precondition block
  IRBuilder<> Builder(PreCondBr);
  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  {
    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
    NewCount = PopCntZext =
        Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));

    if (NewCount != PopCnt)
      (cast<Instruction>(NewCount))->setDebugLoc(DL);

    // TripCnt is exactly the number of iterations the loop has
    TripCnt = NewCount;

    // If the population counter's initial value is not zero, insert Add Inst.
    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero()) {
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
      (cast<Instruction>(NewCount))->setDebugLoc(DL);
    }
  }

  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
  //   function would be partial dead code, and downstream passes will drag
  //   it back from the precondition block to the preheader.
  {
    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());

    Value *Opnd0 = PopCntZext;
    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
    if (PreCond->getOperand(0) != Var)
      std::swap(Opnd0, Opnd1);

    ICmpInst *NewPreCond = cast<ICmpInst>(
        Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
    PreCondBr->setCondition(NewPreCond);

    RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
  }

  // Step 3: Note that the population count is exactly the trip count of the
  // loop in question, which enables us to convert the loop from a noncountable
  // loop into a countable one. The benefit is twofold:
  //
  //  - If the loop only counts population, the entire loop becomes dead after
  //    the transformation. It is a lot easier to prove a countable loop dead
  //    than to prove a noncountable one. (In some C dialects, an infinite loop
  //    isn't dead even if it computes nothing useful. In general, DCE needs
  //    to prove a noncountable loop finite before safely deleting it.)
  //
  //  - If the loop also performs something else, it remains alive.
  //    Since it is transformed to countable form, it can be aggressively
  //    optimized by some optimizations which are in general not applicable
  //    to a noncountable loop.
  //
  // After this step, this loop (conceptually) would look like the following:
  //   newcnt = __builtin_ctpop(x);
  //   t = newcnt;
  //   if (x)
  //     do { cnt++; x &= x-1; t--; } while (t > 0);
  BasicBlock *Body = *(CurLoop->block_begin());
  {
    auto *LbBr = cast<BranchInst>(Body->getTerminator());
    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
    Type *Ty = TripCnt->getType();

    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

    Builder.SetInsertPoint(LbCond);
    Instruction *TcDec = cast<Instruction>(
        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                          "tcdec", false, true));

    TcPhi->addIncoming(TripCnt, PreHead);
    TcPhi->addIncoming(TcDec, Body);

    CmpInst::Predicate Pred =
        (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
    LbCond->setPredicate(Pred);
    LbCond->setOperand(0, TcDec);
    LbCond->setOperand(1, ConstantInt::get(Ty, 0));
  }

  // Step 4: All the references to the original population counter outside
  //  the loop are replaced with the NewCount -- the value returned from
  //  __builtin_ctpop().
  CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}