//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form.  In cases that this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop.  One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions.  The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//
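//
// As a concrete illustration of the kind of rewrite this pass performs, a
// simple zero-filling loop such as
//
//   for (unsigned i = 0; i != n; ++i)
//     A[i] = 0;
//
// can be collapsed into a single call to memset(A, 0, n * sizeof(A[0])), and
// an element-by-element copy such as
//
//   for (unsigned i = 0; i != n; ++i)
//     A[i] = B[i];
//
// can become a single memcpy (or a memmove when the source and destination
// may overlap within the same object).
//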
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cstdint>
#include <memory>
#include <optional>
#include <vector>
using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bittest' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");
bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "for size"),
    cl::init(true), cl::Hidden);
namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;

  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum class LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
                               bool ZeroCheck, size_t CanonicalSize);
  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                             Instruction *DefX, PHINode *CntPhi,
                             Instruction *CntInst);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  bool recognizeShiftUntilLessThan();
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop,
                                bool InsertSub = false);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

} // end anonymous namespace
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass.  Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}
static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}
//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}
bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}
/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in.  Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
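///
/// For example, a strided store of the i32 constant 0x01020304 is not
/// byte-splattable, so it cannot become a plain memset; it can, however,
/// become a memset_pattern16 whose 16-byte pattern is that 4-byte constant
/// replicated four times (ArraySize = 16 / Size below).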
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array.  We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // stores integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store.  If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored.  A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic is not yet supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant.  If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with non-zero address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store.  If so, then
    // we know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load.  If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success.  This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}
/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count.  This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop.  For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop.  Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern).  The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}
/// See if the given store(s) can be promoted to a memset.
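///
/// As an illustration, a hand-unrolled loop such as
///
///   for (i = 0; i < n; i += 2) {
///     A[i] = 0;
///     A[i + 1] = 0;
///   }
///
/// produces two stores per iteration to consecutive locations; such stores are
/// chained together below and then handed to processLoopStridedStore as one
/// wider strided store covering both.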
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because usually pairing with the immediately succeeding or
    // preceding candidate creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores.  If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}
/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
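///
/// For example, a loop of the form
///
///   for (i = 0; i < n; ++i)
///     memcpy(&A[i], &B[i], sizeof(A[0]));
///
/// copies one element per iteration with a stride equal to the memcpy size,
/// so the whole loop can be collapsed into a single memcpy of
/// n * sizeof(A[0]) bytes.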
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store.  If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}
/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store.  If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant.  If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access.  The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop.  Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize) {
    std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue();
    std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue();
    // FIXME: Should this check for overflow?
    if (BEInt && SizeInt)
      AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
  }

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store.  A store to &A[i] of 100 will always return may alias
  // with a store to &A[100]; we need StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}
// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset.  Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
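//
// For example, a loop that stores 4-byte elements downward starting at &A[3]
// with a backedge-taken count of 3 has Start == &A[3]; the lowest address
// actually written is &A[3] - 3 * 4 bytes == &A[0], which is the base pointer
// this helper computes.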
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index (the store size is already folded into Index)
  return SE->getMinusSCEV(Start, Index);
}
/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
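//
// Concretely, a loop with backedge-taken count BECount executes
// (BECount + 1) iterations, so the byte count produced here is
// (BECount + 1) * StoreSize, i.e. TripCount * StoreSize.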
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV =
      SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
/// processLoopStridedStore - We see a strided store of some value.  If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header.  This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
  // this into a memset in the loop preheader now if we want.  However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location.  Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
    return Changed;

  AAMDNodes AATags = TheStore->getAAMetadata();
  for (Instruction *Store : Stores)
    AATags = AATags.merge(Store->getAAMetadata());
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else {
    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
    // Everything is emitted in default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16.  PatternValue is known to be
    // a constant array of 16-bytes.  Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = GV;
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});

    // Set the TBAA info if present.
    if (AATags.TBAA)
      NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);

    if (AATags.Scope)
      NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope);

    if (AATags.NoAlias)
      NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
  }

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << "  at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed.  Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy.  This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load.  If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}
namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.  LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
    Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
    MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
    Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
    const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {

  // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
  // conservatively bail here, since otherwise we may have to transform
  // llvm.memcpy.inline into llvm.memcpy which is illegal.
  if (isa<MemCpyInlineInst>(TheStore))
    return false;

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header.  This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  SCEVExpanderCleaner ExpCleaner(Expander);

  bool Changed = false;
  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
  Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

  APInt Stride = getStoreStride(StoreEv);
  const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);

  // TODO: Deal with non-constant size; currently we expect a constant store
  // size.
  assert(ConstStoreSize && "store size is expected to be a constant");

  int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
  bool IsNegStride = StoreSize == -Stride;

  // Handle negative strided loops.
  if (IsNegStride)
    StrStart =
        getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
  // this into a memcpy in the loop preheader now if we want.  However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to.  This includes the load that
  // feeds the stores.  Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  SmallPtrSet<Instruction *, 2> IgnoredInsts;
  IgnoredInsts.insert(TheStore);

  bool IsMemCpy = isa<MemCpyInst>(TheStore);
  const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

  bool LoopAccessStore =
      mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts);
  if (LoopAccessStore) {
    // For the memmove case it's not enough to guarantee that the loop doesn't
    // access TheStore and TheLoad. Additionally we need to make sure that
    // TheStore is the only user of TheLoad.
    if (!TheLoad->hasOneUse())
      return Changed;
    IgnoredInsts.insert(TheLoad);
    if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                              BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
                                        TheStore)
               << ore::NV("Inst", InstRemark) << " in "
               << ore::NV("Function", TheStore->getFunction())
               << " function will not be hoisted: "
               << ore::NV("Reason", "The loop may access store location");
      });
      return Changed;
    }
    IgnoredInsts.erase(TheLoad);
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

  // Handle negative strided loops.
  if (IsNegStride)
    LdStart =
        getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
                                              Preheader->getTerminator());

  // If the store is a memcpy instruction, we must check if it will write to
  // the load memory locations. So remove it from the ignored stores.
  MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
  if (IsMemCpy && !Verifier.IsSameObject)
    IgnoredInsts.erase(TheStore);
  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSizeSCEV, *AA, IgnoredInsts)) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
             << ore::NV("Inst", InstRemark) << " in "
             << ore::NV("Function", TheStore->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "The loop may access load location");
    });
    return Changed;
  }

  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
  if (UseMemMove)
    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
                                             IsMemCpy))
      return Changed;

  if (avoidLIRForMultiBlockLoop())
    return Changed;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  AAMDNodes AATags = TheLoad->getAAMetadata();
  AAMDNodes StoreAATags = TheStore->getAAMetadata();
  AATags = AATags.merge(StoreAATags);
  if (auto CI = dyn_cast<ConstantInt>(NumBytes))
    AATags = AATags.extendTo(CI->getZExtValue());
  else
    AATags = AATags.extendTo(-1);

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store are atomic, then they must necessarily be unordered
  //  by previous checks.
  if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
    if (UseMemMove)
      NewCall = Builder.CreateMemMove(
          StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
          /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
    else
      NewCall =
          Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
                               NumBytes, /*isVolatile=*/false, AATags.TBAA,
                               AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  } else {
    // For now don't support unordered atomic memmove.
    if (UseMemMove)
      return Changed;
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    assert((StoreAlign && LoadAlign) &&
           "Expect unordered load/store to have align.");
    if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
      return Changed;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call.  If the lib call doesn't exist for our store size, then
    // we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return Changed;

    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
        AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
  }
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed new call: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
                    << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() intrinsic from " << ore::NV("Inst", InstRemark)
           << " instruction in " << ore::NV("Function", TheStore->getFunction())
           << " function"
           << ore::setExtraArgs()
           << ore::NV("FromBlock", TheStore->getParent()->getName())
           << ore::NV("ToBlock", Preheader->getName());
  });

  // Okay, a new call to memcpy/memmove has been formed.  Zap the original store
  // and anything that feeds into it.
  if (MSSAU)
    MSSAU->removeMemoryAccess(TheStore, true);
  deleteDeadInstruction(TheStore);
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  if (UseMemMove)
    ++NumMemMove;
  else
    ++NumMemCpy;
  ExpCleaner.markResultUsed();
  return true;
}
// When compiling for codesize we avoid idiom recognition for a multi-block loop
// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
//
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }
  return false;
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizePopcount() || recognizeAndInsertFFS() ||
         recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
         recognizeShiftUntilLessThan();
}
/// Check if the given conditional branch is based on the comparison between
/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
/// true), the control yields to the loop entry. If the branch matches the
/// behavior, the variable involved in the comparison is returned. This function
/// will be called to see if the precondition and postcondition of the loop are
/// in desirable form.
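///
/// For example, with \p JmpOnZero == false this matches a branch of the form
///
///   br (icmp ne X, 0), LoopEntry, LoopExit
///
/// and returns X.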
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}
/// Check if the given conditional branch is based on an unsigned less-than
/// comparison between a variable and a constant, and if the comparison is false
/// the control yields to the loop entry. If the branch matches the behaviour,
/// the variable involved in the comparison is returned.
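///
/// For example, this matches
///
///   br (icmp ult X, 4), LoopExit, LoopEntry
///
/// returning X and setting \p Threshold to 4.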
static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
                                     APInt &Threshold) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpConst)
    return nullptr;

  BasicBlock *FalseSucc = BI->getSuccessor(1);
  ICmpInst::Predicate Pred = Cond->getPredicate();

  if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
    Threshold = CmpConst->getValue();
    return Cond->getOperand(0);
  }

  return nullptr;
}
// Check if the recurrence variable `VarX` is in the right form to create
// the idiom. Returns the value coerced to a PHINode if so.
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
                                 BasicBlock *LoopEntry) {
  auto *PhiX = dyn_cast<PHINode>(VarX);
  if (PhiX && PhiX->getParent() == LoopEntry &&
      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
    return PhiX;
  return nullptr;
}
/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
///    or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
///    or nullptr if there is no such.
/// 3) \p InitX is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating the loop exit condition.
/// 5) \p Threshold is set to the constant involved in the unsigned less-than
///    comparison.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 < 2)
///      goto loop-exit         // the precondition of the loop
///    cnt0 = init-val
///    do {
///      x = phi (x0, x.next);      // PhiX
///      cnt = phi (cnt0, cnt.next)
///
///      cnt.next = cnt + 1;
///       ...
///      x.next = x >> 1;           // DefX
///    } while (x >= 4)
/// loop-exit:
/// \endcode
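///
/// For illustration, the C-level source that typically lowers to this shape
/// (after loop rotation and instcombine) is:
/// \code
///    unsigned cnt = 0;
///    while (x >= 2) { x >>= 1; cnt++; }
/// \endcode
/// Testing the pre-shift value against 4 is equivalent to testing the
/// post-shift value against 2, which is why the loop-back threshold is 4
/// while the guarding precondition (checked in recognizeShiftUntilLessThan)
/// uses 2.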
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
                                          Intrinsic::ID &IntrinID,
                                          Value *&InitX, Instruction *&CntInst,
                                          PHINode *&CntPhi, Instruction *&DefX,
                                          APInt &Threshold) {
  BasicBlock *LoopEntry;

  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchShiftULTCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
          Threshold))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: Check the recurrence of variable X
  if (!DefX || !isa<PHINode>(DefX))
    return false;

  PHINode *VarPhi = cast<PHINode>(DefX);
  int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
  if (Idx == -1)
    return false;

  DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
  if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
    return false;

  // step 3: detect instructions corresponding to "x.next = x >> 1"
  if (DefX->getOpcode() != Instruction::LShr)
    return false;

  IntrinID = Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;

  InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());

  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  //         or cnt.next = cnt + -1.
  // TODO: We can skip the step. If the loop trip count is known (CTLZ),
  //       then all uses of "cnt.next" could be optimized to the trip count
  //       plus "cnt0". Currently it is not optimized.
  //       This step could be used to detect the POPCNT instruction:
  //       cnt.next = cnt + (x.next & 1)
  for (Instruction &Inst : llvm::make_range(
           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
    if (Inst.getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
      continue;

    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = &Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}
/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the population bit.
/// 2) \p CntPhi is set to the corresponding phi node.
/// 3) \p Var is set to the value whose population bits are being counted.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit         // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x1 = phi (x0, x2);
///       cnt1 = phi(cnt0, cnt2);
///
///       cnt2 = cnt1 + 1;
///        ...
///       x2 = x1 & (x1 - 1);
///        ...
///    } while(x != 0);
/// loop-exit:
/// \endcode
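///
/// For example, for x0 = 0b101100 the loop body runs three times (each
/// iteration clears the lowest set bit), so cnt ends at cnt0 + 3, i.e.
/// cnt0 + ctpop(x0). transformLoopToPopcount() below exploits exactly this
/// equivalence.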
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
  // step 1: Check to see if the loop-back branch matches this pattern:
  //    "if (a != 0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX2 = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
    return false;

  BinaryOperator *SubOneOp;

  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
    VarX1 = DefX2->getOperand(1);
  else {
    VarX1 = DefX2->getOperand(0);
    SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
  }
  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
    return false;

  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
  if (!Dec ||
      !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
        (SubOneOp->getOpcode() == Instruction::Add &&
         Dec->isMinusOne()))) {
    return false;
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
  CountInst = nullptr;
  for (Instruction &Inst : llvm::make_range(
           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
    if (Inst.getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
    if (!Inc || !Inc->isOne())
      continue;

    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
    if (!Phi)
      continue;

    // Check if the result of the instruction is live out of the loop.
    bool LiveOutLoop = false;
    for (User *U : Inst.users()) {
      if ((cast<Instruction>(U))->getParent() != LoopEntry) {
        LiveOutLoop = true;
        break;
      }
    }

    if (LiveOutLoop) {
      CountInst = &Inst;
      CountPhi = Phi;
      break;
    }
  }
  if (!CountInst)
    return false;

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
    return false;

  CntInst = CountInst;
  CntPhi = CountPhi;
  Var = T;

  return true;
}
/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
///    or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
///    or nullptr if there is no such.
/// 3) \p InitX is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating the loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit         // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x = phi (x0, x.next);   // PhiX
///       cnt = phi(cnt0, cnt.next);
///
///       cnt.next = cnt + 1;
///        ...
///       x.next = x >> 1;        // DefX
///        ...
///    } while(x.next != 0);
/// loop-exit:
/// \endcode
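///
/// For example, for i32 x0 = 16 the loop above runs five times
/// (x.next: 8, 4, 2, 1, 0), and 32 - ctlz(16) = 32 - 27 = 5.
/// transformLoopToCountable() relies on exactly this identity to compute the
/// trip count up front.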
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
                                      Intrinsic::ID &IntrinID, Value *&InitX,
                                      Instruction *&CntInst, PHINode *&CntPhi,
                                      Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;

  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  if (!DefX || !DefX->isShift())
    return false;
  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
                                                     Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);

  // step 3: Check the recurrence of variable X
  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;

  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());

  // Make sure the initial value can't be negative, otherwise the ashr in the
  // loop might never reach zero, which would make the loop infinite.
  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
    return false;

  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  //         or cnt.next = cnt + -1.
  // TODO: We can skip the step. If the loop trip count is known (CTLZ),
  //       then all uses of "cnt.next" could be optimized to the trip count
  //       plus "cnt0". Currently it is not optimized.
  //       This step could be used to detect the POPCNT instruction:
  //       cnt.next = cnt + (x.next & 1)
  for (Instruction &Inst : llvm::make_range(
           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
    if (Inst.getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
      continue;

    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = &Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}
// Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
// profitable if we delete the loop.
bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
                                                 Value *InitX, bool ZeroCheck,
                                                 size_t CanonicalSize) {
  const Value *Args[] = {InitX,
                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};

  // @llvm.dbg doesn't count, as it has no semantic effect.
  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
  uint32_t HeaderSize =
      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());

  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
  InstructionCost Cost = TTI->getIntrinsicInstrCost(
      Attrs, TargetTransformInfo::TCK_SizeAndLatency);
  if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
    return false;

  return true;
}
/// Convert a CTLZ / CTTZ idiom loop into a countable loop.
/// If a CTLZ / CTTZ intrinsic is inserted as the new trip count, returns true;
/// otherwise, returns false.
bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
                                               Value *InitX, Instruction *DefX,
                                               PHINode *CntPhi,
                                               Instruction *CntInst) {
  bool IsCntPhiUsedOutsideLoop = false;
  for (User *U : CntPhi->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntPhiUsedOutsideLoop = true;
      break;
    }
  bool IsCntInstUsedOutsideLoop = false;
  for (User *U : CntInst->users())
    if (!CurLoop->contains(cast<Instruction>(U))) {
      IsCntInstUsedOutsideLoop = true;
      break;
    }
  // If both CntInst and CntPhi are used outside the loop the profitability
  // is questionable.
  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
    return false;

  // For some CPUs the result of the CTLZ(X) intrinsic is undefined
  // when X is 0. If we can not guarantee X != 0, we need to check this
  // when expanding the intrinsic.
  bool ZeroCheck = false;
  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function RunOnLoop.
  BasicBlock *PH = CurLoop->getLoopPreheader();

  // If we are using the count instruction outside the loop, make sure we
  // have a zero check as a precondition. Without the check the loop would run
  // one iteration before any check of the input value. This means 0 and 1
  // would have identical behavior in the original loop, and thus the
  // CTLZ/CTTZ-based count is only valid if a precondition rules out zero.
  if (!IsCntPhiUsedOutsideLoop) {
    auto *PreCondBB = PH->getSinglePredecessor();
    if (!PreCondBB)
      return false;
    auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    if (!PreCondBI)
      return false;
    if (matchCondition(PreCondBI, PH) != InitX)
      return false;
    ZeroCheck = true;
  }

  // The FFS idiom loop has only 6 instructions:
  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  //  %shr = ashr %n.addr.0, 1
  //  %tobool = icmp eq %shr, 0
  //  %inc = add nsw %i.0, 1
  //  br i1 %tobool
  size_t IdiomCanonicalSize = 6;
  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
    return false;

  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                           DefX->getDebugLoc(), ZeroCheck,
                           IsCntPhiUsedOutsideLoop);
  return true;
}
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ / CTTZ trip count). If a CTLZ / CTTZ intrinsic is
/// inserted as the new trip count, returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;

  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
                                 DefX))
    return false;

  return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
}
bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;

  APInt LoopThreshold;
  if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
                                     CntPhi, DefX, LoopThreshold))
    return false;

  if (LoopThreshold == 2) {
    // Treat as a regular FFS.
    return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
  }

  // Look for the Floor Log2 idiom.
  if (LoopThreshold != 4)
    return false;

  // Abort if CntPhi is used outside of the loop.
  for (User *U : CntPhi->users())
    if (!CurLoop->contains(cast<Instruction>(U)))
      return false;

  // It is safe to assume the Preheader exists, as it was checked in the
  // parent function RunOnLoop.
  BasicBlock *PH = CurLoop->getLoopPreheader();
  auto *PreCondBB = PH->getSinglePredecessor();
  if (!PreCondBB)
    return false;
  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  if (!PreCondBI)
    return false;

  APInt PreLoopThreshold;
  if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
      PreLoopThreshold != 2)
    return false;

  bool ZeroCheck = true;

  // The loop has only 6 instructions:
  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
  //  %shr = ashr %n.addr.0, 1
  //  %tobool = icmp ult %n.addr.0, C
  //  %inc = add nsw %i.0, 1
  //  br i1 %tobool
  size_t IdiomCanonicalSize = 6;
  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
    return false;

  // log2(x) = w − 1 − clz(x)
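  // Worked example of that identity: for i32 x = 20 (0b10100), clz(x) = 27,
  // so log2(x) = 32 - 1 - 27 = 4 = floor(log2(20)). The extra "- 1" is what
  // the InsertSub flag below asks transformLoopToCountable() to emit.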
  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                           DefX->getDebugLoc(), ZeroCheck,
                           /*IsCntPhiUsedOutsideLoop=*/false,
                           /*InsertSub=*/true);
  return true;
}
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
/// function call, and returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizePopcount() {
  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false;

  // Counting the population is usually done with a few arithmetic
  // instructions. Such instructions can be easily "absorbed" by vacant slots
  // in a non-compact loop. Therefore, recognizing the popcount idiom only
  // makes sense in a compact loop.

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  BasicBlock *LoopBody = *(CurLoop->block_begin());
  if (LoopBody->size() >= 20) {
    // The loop is too big, bail out.
    return false;
  }

  // It should have a preheader containing nothing but an unconditional branch.
  BasicBlock *PH = CurLoop->getLoopPreheader();
  if (!PH || &PH->front() != PH->getTerminator())
    return false;
  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
  if (!EntryBI || EntryBI->isConditional())
    return false;

  // It should have a precondition block where the generated popcount intrinsic
  // function can be inserted.
  auto *PreCondBB = PH->getSinglePredecessor();
  if (!PreCondBB)
    return false;
  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
  if (!PreCondBI || PreCondBI->isUnconditional())
    return false;

  Instruction *CntInst;
  PHINode *CntPhi;
  Value *Val;
  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
    return false;

  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
  return true;
}
static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                       const DebugLoc &DL) {
  Value *Ops[] = {Val};
  Type *Tys[] = {Val->getType()};

  CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
  CI->setDebugLoc(DL);

  return CI;
}
static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
                                    const DebugLoc &DL, bool ZeroCheck,
                                    Intrinsic::ID IID) {
  Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
  Type *Tys[] = {Val->getType()};

  CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
  CI->setDebugLoc(DL);

  return CI;
}
/// Transform the following loop (Using CTLZ, CTTZ is similar):
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   LOOP_BODY
///   Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
/// Into:
/// If CntPhi used outside the loop:
///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
///   Count = CountPrev + 1
/// else
///   Count = BitWidth(InitX) - CTLZ(InitX)
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   PhiCount = PHI [Count, Dec]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   Dec = PhiCount - 1
///   LOOP_BODY
///   Br: loop if (Dec != 0)
/// Use(CountPrev + Cnt0) // Use(CntPhi)
/// or
/// Use(Count + Cnt0) // Use(CntInst)
///
/// If LOOP_BODY is empty the loop will be deleted.
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
    Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
    PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());

  // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
  IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);

  // If there are no uses of CntPhi, create:
  //   Count = BitWidth - CTLZ(InitX);
  //   NewCount = Count;
  // If there are uses of CntPhi, create:
  //   NewCount = BitWidth - CTLZ(InitX >> 1);
  //   Count = NewCount + 1;
  Value *InitXNext;
  if (IsCntPhiUsedOutsideLoop) {
    if (DefX->getOpcode() == Instruction::AShr)
      InitXNext = Builder.CreateAShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::LShr)
      InitXNext = Builder.CreateLShr(InitX, 1);
    else if (DefX->getOpcode() == Instruction::Shl) // cttz
      InitXNext = Builder.CreateShl(InitX, 1);
    else
      llvm_unreachable("Unexpected opcode!");
  } else
    InitXNext = InitX;
  Value *Count =
      createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
  Type *CountTy = Count->getType();
  Count = Builder.CreateSub(
      ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
  if (InsertSub)
    Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
  Value *NewCount = Count;
  if (IsCntPhiUsedOutsideLoop)
    Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
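
  // Worked example (assuming a 32-bit InitX of 16 and a logical right shift):
  // Count = 32 - ctlz(16) = 5, the number of times the original loop executed
  // CntInst. If CntPhi is the value used outside the loop, the code above
  // instead computes NewCount = 32 - ctlz(16 >> 1) = 4 and Count = 5, since
  // CntPhi lags CntInst by one increment.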
  NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());

  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
    // If the counter was being incremented in the loop, add NewCount to the
    // counter's initial value, but only if the initial value is not zero.
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero())
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
  } else {
    // If the count was being decremented in the loop, subtract NewCount from
    // the counter's initial value.
    NewCount = Builder.CreateSub(CntInitVal, NewCount);
  }

  // Step 2: Insert the new IV and loop condition:
  //   loop:
  //     ...
  //     PhiCount = PHI [Count, Dec]
  //     ...
  //     Dec = PhiCount - 1
  //     ...
  //     Br: loop if (Dec != 0)
  BasicBlock *Body = *(CurLoop->block_begin());
  auto *LbBr = cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());

  PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
  TcPhi->insertBefore(Body->begin());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
      TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));

  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(CountTy, 0));

  // Step 3: All the references to the original counter outside
  //  the loop are replaced with the NewCount.
  if (IsCntPhiUsedOutsideLoop)
    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  else
    CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // step 4: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
                                                 Instruction *CntInst,
                                                 PHINode *CntPhi, Value *Var) {
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
  const DebugLoc &DL = CntInst->getDebugLoc();

  // Assuming that before the transformation the loop looks like:
  //   if (x) // the precondition
  //     do { cnt++; x &= x - 1; } while(x);

  // Step 1: Insert the ctpop instruction at the end of the precondition block
  IRBuilder<> Builder(PreCondBr);
  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;

  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
  NewCount = PopCntZext =
      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));

  if (NewCount != PopCnt)
    (cast<Instruction>(NewCount))->setDebugLoc(DL);

  // TripCnt is exactly the number of iterations the loop has
  TripCnt = NewCount;

  // If the population counter's initial value is not zero, insert an Add.
  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  if (!InitConst || !InitConst->isZero()) {
    NewCount = Builder.CreateAdd(NewCount, CntInitVal);
    (cast<Instruction>(NewCount))->setDebugLoc(DL);
  }

  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
  //   function would be partially dead code, and downstream passes will drag
  //   it back from the precondition block to the preheader.
  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());

  Value *Opnd0 = PopCntZext;
  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
  if (PreCond->getOperand(0) != Var)
    std::swap(Opnd0, Opnd1);

  ICmpInst *NewPreCond = cast<ICmpInst>(
      Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
  PreCondBr->setCondition(NewPreCond);

  RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);

  // Step 3: Note that the population count is exactly the trip count of the
  //   loop in question, which enables us to convert the loop from a
  //   noncountable loop into a countable one. The benefit is twofold:
  //
  //  - If the loop only counts population, the entire loop becomes dead after
  //    the transformation. It is a lot easier to prove a countable loop dead
  //    than to prove a noncountable one. (In some C dialects, an infinite loop
  //    isn't dead even if it computes nothing useful. In general, DCE needs
  //    to prove a noncountable loop finite before safely deleting it.)
  //
  //  - If the loop also performs something else, it remains alive.
  //    Since it is transformed to countable form, it can be aggressively
  //    optimized by some optimizations which are in general not applicable
  //    to a noncountable loop.
  //
  // After this step, this loop (conceptually) would look like the following:
  //   newcnt = __builtin_ctpop(x);
  //   t = newcnt;
  //   if (x)
  //     do { cnt++; x &= x-1; t--; } while (t > 0);
  BasicBlock *Body = *(CurLoop->block_begin());

  auto *LbBr = cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  Type *Ty = TripCnt->getType();

  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
  TcPhi->insertBefore(Body->begin());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(
      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                        "tcdec", false, true));

  TcPhi->addIncoming(TripCnt, PreHead);
  TcPhi->addIncoming(TcDec, Body);

  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(Ty, 0));

  // Step 4: All the references to the original population counter outside
  //  the loop are replaced with the NewCount -- the value returned from
  //  __builtin_ctpop().
  CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}
/// Match loop-invariant value.
template <typename SubPattern_t> struct match_LoopInvariant {
  SubPattern_t SubPattern;
  const Loop *L;

  match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
      : SubPattern(SP), L(L) {}

  template <typename ITy> bool match(ITy *V) {
    return L->isLoopInvariant(V) && SubPattern.match(V);
  }
};

/// Matches if the value is loop-invariant.
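/// For example (illustrative usage, mirroring the matchers below):
/// \code
///   match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop))
/// \endcode
/// only succeeds when V is a `shl 1, %bitpos` that is invariant in CurLoop.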
template <typename Ty>
inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
  return match_LoopInvariant<Ty>(M, L);
}
/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     br label %loop
///
///   loop:
///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
///     %x.next = shl i32 %x.curr, 1
///     <...>
///     br i1 %x.curr.isbitunset, label %loop, label %end
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
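///
/// For illustration, the C-level shape of this idiom (with hypothetical
/// variable names) is roughly:
/// \code
///   while ((x & (1u << bitpos)) == 0)
///     x <<= 1;
/// \endcode
/// i.e. shift x left until the tested bit becomes set.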
static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
                                         Value *&BitMask, Value *&BitPos,
                                         Value *&CurrX, Instruction *&NextX) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-bittest idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge is in desirable form.

  ICmpInst::Predicate Pred;
  Value *CmpLHS, *CmpRHS;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
                  m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the backedge's condition is in desirable form.

  auto MatchVariableBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS,
                 m_c_And(m_Value(CurrX),
                         m_CombineAnd(
                             m_Value(BitMask),
                             m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
                                             CurLoop))));
  };

  auto MatchConstantBitMask = [&]() {
    return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
           match(CmpLHS, m_And(m_Value(CurrX),
                               m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
           (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
  };

  auto MatchDecomposableConstantBitMask = [&]() {
    auto Res = llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred);
    if (Res && Res->Mask.isPowerOf2()) {
      assert(ICmpInst::isEquality(Res->Pred));
      Pred = Res->Pred;
      CurrX = Res->X;
      BitMask = ConstantInt::get(CurrX->getType(), Res->Mask);
      BitPos = ConstantInt::get(CurrX->getType(), Res->Mask.logBase2());
      return true;
    }
    return false;
  };

  if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
      !MatchDecomposableConstantBitMask()) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
    return false;
  }

  // Step 3: Check if the recurrence is in desirable form.
  auto *CurrXPN = dyn_cast<PHINode>(CurrX);
  if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
  NextX =
      dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));

  assert(CurLoop->isLoopInvariant(BaseX) &&
         "Expected BaseX to be available in the preheader!");

  if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
    // FIXME: support right-shift?
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 4: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  if (Pred != ICmpInst::Predicate::ICMP_EQ) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit the loop when the comparison yields false,
  // so when it yields true we should branch back to the loop header.
  if (TrueBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}
/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     br label %loop
///
///   loop:
///     %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
///     %x.curr.bitmasked = and i32 %x.curr, %bitmask
///     %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
///     %x.next = shl i32 %x.curr, 1
///     <...>
///     br i1 %x.curr.isbitunset, label %loop, label %end
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     <...>
///     %bitmask = shl i32 1, %bitpos
///     %lowbitmask = add i32 %bitmask, -1
///     %mask = or i32 %lowbitmask, %bitmask
///     %x.masked = and i32 %x, %mask
///     %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
///                                                         i1 true)
///     %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
///     %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
///     %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
///     %tripcount = add i32 %backedgetakencount, 1
///     %x.curr = shl i32 %x, %backedgetakencount
///     %x.next = shl i32 %x, %tripcount
///     br label %loop
///
///   loop:
///     %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add nuw i32 %loop.iv, 1
///     %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
///     %x.next.res = phi i32 [ %x.next, %loop ] <...>
///     <...>
/// \endcode
bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
  bool MadeChange = false;

  Value *X, *BitMask, *BitPos, *XCurr;
  Instruction *XNext;
  if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
                                    XNext)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-bittest idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");

  // Ok, it is the idiom we were looking for, we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::ctlz;
  Type *Ty = X->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }
  if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
      TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
    // BitMask may be computed from BitPos; freeze BitPos so we can increase
    // its use count.
    std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
    if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
      InsertPt = BitPosI->getInsertionPointAfterDef();
    else
      InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
    if (!InsertPt)
      return false;
    FreezeInst *BitPosFrozen =
        new FreezeInst(BitPos, BitPos->getName() + ".fr", *InsertPt);
    BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
      return U.getUser() != BitPosFrozen;
    });
    BitPos = BitPosFrozen;
  }

  // Step 1: Compute the loop trip count.

  Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
                                        BitPos->getName() + ".lowbitmask");
  Value *Mask =
      Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
  Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
  CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
      /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
  Value *XMaskedNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
      XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);
  Value *XMaskedLeadingOnePos =
      Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
                        XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
                        /*HasNSW=*/Bitwidth > 2);

  Value *LoopBackedgeTakenCount = Builder.CreateSub(
      BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/true, /*HasNSW=*/true);
  // We know the loop's backedge-taken count, but what's the loop's trip count?
  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
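
  // Worked numeric example of the computation above, for i32 x = 1 and
  // bitpos = 5: bitmask = 0b100000, mask = 0b111111, x.masked = 1,
  // numleadingzeros = 31, numactivebits = 1, leadingonepos = 0, so the
  // backedge-taken count is 5 - 0 = 5 and the trip count is 6. The original
  // loop indeed runs its header six times (x.curr = 1, 2, 4, 8, 16, 32), and
  // the final values computed below are x.curr = 1 << 5 and x.next = 1 << 6.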
  // Step 2: Compute the recurrence's final value without a loop.

  // NewX is always safe to compute, because `LoopBackedgeTakenCount`
  // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
  Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
  NewX->takeName(XCurr);
  if (auto *I = dyn_cast<Instruction>(NewX))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  Value *NewXNext;
  // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
  // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
  // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
  // that isn't the case, we'll need to emit an alternative, safe IR.
  if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
      PatternMatch::match(
          BitPos, PatternMatch::m_SpecificInt_ICMP(
                      ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
                                               Ty->getScalarSizeInBits() - 1))))
    NewXNext = Builder.CreateShl(X, LoopTripCount);
  else
    // Otherwise, just additionally shift by one. It's the smallest solution;
    // alternatively, we could check that NewX is INT_MIN (or that BitPos is
    // bitwidth(X) - 1) and select 0 instead.
    NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));

  NewXNext->takeName(XNext);
  if (auto *I = dyn_cast<Instruction>(NewXNext))
    I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);

  // Step 3: Adjust the successor basic block to receive the computed
  //         recurrence's final value instead of the recurrence itself.

  XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
  XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);

  // Step 4: Rewrite the loop into a countable form, with canonical IV.

  // The new canonical induction variable.
  Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
  auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");

  // The induction itself.
  // Note that while NUW is always safe, NSW is only safe for bitwidths != 2.
  Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
  auto *IVNext =
      Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
                        /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);

  // The loop trip count check.
  auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
                                       CurLoop->getName() + ".ivcheck");
  Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
  LoopHeaderBB->getTerminator()->eraseFromParent();

  // Populate the IV PHI.
  IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
  IV->addIncoming(IVNext, LoopHeaderBB);

  // Step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.

  SE->forgetLoop(CurLoop);

  // Other passes will take care of actually deleting the loop if possible.

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");

  ++NumShiftUntilBitTest;
  return MadeChange;
}
/// Return true if the idiom is detected in the loop.
///
/// The core idiom we are trying to detect is:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %for.cond
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
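///
/// For illustration, the C-level shape of this idiom (with hypothetical
/// variable names) is roughly:
/// \code
///   while ((val >> (iv + extraoffset)) != 0)
///     iv++;
/// \endcode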
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
                                      Instruction *&ValShiftedIsZero,
                                      Intrinsic::ID &IntrinID, Instruction *&IV,
                                      Value *&Start, Value *&Val,
                                      const SCEV *&ExtraOffsetExpr,
                                      bool &InvertedCond) {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE
             " Performing shift-until-zero idiom detection.\n");

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
    return false;
  }

  Instruction *ValShifted, *NBits, *IVNext;

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  using namespace PatternMatch;

  // Step 1: Check if the loop backedge / condition is in desirable form.

  ICmpInst::Predicate Pred;
  BasicBlock *TrueBB, *FalseBB;
  if (!match(LoopHeaderBB->getTerminator(),
             m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
                  m_BasicBlock(FalseBB))) ||
      !match(ValShiftedIsZero,
             m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
      !ICmpInst::isEquality(Pred)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
    return false;
  }

  // Step 2: Check if the comparison's operand is in desirable form.
  // FIXME: Val could be a one-input PHI node, which we should look past.
  if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
                                 m_Instruction(NBits)))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
    return false;
  }
  IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
                                                         : Intrinsic::ctlz;

  // Step 3: Check if the shift amount is in desirable form.
  Value *ExtraOffset;
  if (match(NBits, m_c_Add(m_Instruction(IV),
                           m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
      (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
    ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
  else if (match(NBits,
                 m_Sub(m_Instruction(IV),
                       m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
           NBits->hasNoSignedWrap())
    ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
  else {
    IV = NBits;
    ExtraOffsetExpr = SE->getZero(NBits->getType());
  }

  // Step 4: Check if the recurrence is in desirable form.
  auto *IVPN = dyn_cast<PHINode>(IV);
  if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
    return false;
  }

  Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
  IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));

  if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
    return false;
  }

  // Step 5: Check if the backedge's destinations are in desirable form.

  assert(ICmpInst::isEquality(Pred) &&
         "Should only get equality predicates here.");

  // cmp-br is commutative, so canonicalize to a single variant.
  InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
  if (InvertedCond) {
    Pred = ICmpInst::getInversePredicate(Pred);
    std::swap(TrueBB, FalseBB);
  }

  // We expect to exit the loop when the comparison yields true,
  // so when it yields false we should branch back to the loop header.
  if (FalseBB != LoopHeaderBB) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
    return false;
  }

  // The new, countable, loop will certainly only run a known number of
  // iterations; it won't be infinite. But the old loop might be infinite
  // under certain conditions. For logical shifts, the value will become zero
  // after at most bitwidth(%Val) loop iterations. However, for an arithmetic
  // right-shift, if the sign bit was set, the value will never become zero,
  // and the loop may never finish.
  if (ValShifted->getOpcode() == Instruction::AShr &&
      !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
    return false;
  }

  // Okay, idiom checks out.
  return true;
}
/// Look for the following loop:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     br label %for.cond
///
///   loop:
///     %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
///     %nbits = add nsw i8 %iv, %extraoffset
///     %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
///     %val.shifted.iszero = icmp eq i8 %val.shifted, 0
///     %iv.next = add i8 %iv, 1
///     <...>
///     br i1 %val.shifted.iszero, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv, %loop ] <...>
///     %nbits.res = phi i8 [ %nbits, %loop ] <...>
///     %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
///     %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
///     %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
///     <...>
/// \endcode
///
/// And transform it into:
/// \code
///   entry:
///     <...>
///     %start = <...>
///     %extraoffset = <...>
///     <...>
///     %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
///     %val.numactivebits = sub i8 8, %val.numleadingzeros
///     %extraoffset.neg = sub i8 0, %extraoffset
///     %tmp = add i8 %val.numactivebits, %extraoffset.neg
///     %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
///     %loop.tripcount = sub i8 %iv.final, %start
///     br label %loop
///
///   loop:
///     %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
///     %loop.iv.next = add i8 %loop.iv, 1
///     %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
///     %iv = add i8 %loop.iv, %start
///     <...>
///     br i1 %loop.ivcheck, label %end, label %loop
///
///   end:
///     %iv.res = phi i8 [ %iv.final, %loop ] <...>
///     <...>
/// \endcode
bool LoopIdiomRecognize::recognizeShiftUntilZero() {
  bool MadeChange = false;

  Instruction *ValShiftedIsZero;
  Intrinsic::ID IntrID;
  Instruction *IV;
  Value *Start, *Val;
  const SCEV *ExtraOffsetExpr;
  bool InvertedCond;
  if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
                                 Start, Val, ExtraOffsetExpr, InvertedCond)) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " shift-until-zero idiom detection failed.\n");
    return MadeChange;
  }
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");

  // Ok, it is the idiom we were looking for, we *could* transform this loop,
  // but is it profitable to transform?

  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
  BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
  assert(LoopPreheaderBB && "There is always a loop preheader.");

  BasicBlock *SuccessorBB = CurLoop->getExitBlock();
  assert(SuccessorBB && "There is only a single successor.");

  IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
  Builder.SetCurrentDebugLocation(IV->getDebugLoc());

  Type *Ty = Val->getType();
  unsigned Bitwidth = Ty->getScalarSizeInBits();

  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // The rewrite is considered to be unprofitable if and only if the
  // intrinsic we'll use is not cheap. Note that we are okay with *just*
  // making the loop countable, even if nothing else changes.
  IntrinsicCostAttributes Attrs(
      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
  InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
  if (Cost > TargetTransformInfo::TCC_Basic) {
    LLVM_DEBUG(dbgs() << DEBUG_TYPE
               " Intrinsic is too costly, not beneficial\n");
    return MadeChange;
  }

  // Ok, transform appears worthwhile.
  MadeChange = true;

  bool OffsetIsZero = false;
  if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
    OffsetIsZero = ExtraOffsetExprC->isZero();

  // Step 1: Compute the loop's final IV value / trip count.

  CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
      IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
      /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
  Value *ValNumActiveBits = Builder.CreateSub(
      ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
      Val->getName() + ".numactivebits", /*HasNUW=*/true,
      /*HasNSW=*/Bitwidth != 2);

  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  Expander.setInsertPoint(&*Builder.GetInsertPoint());
  Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);

  Value *ValNumActiveBitsOffset = Builder.CreateAdd(
      ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
  Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
                                           {ValNumActiveBitsOffset, Start},
                                           /*FMFSource=*/nullptr, "iv.final");

  auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
      IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
      /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
  // FIXME: or when the offset was `add nuw`

  // We know the loop's backedge-taken count, but what's the loop's trip count?
  Value *LoopTripCount =
      Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
                        CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
                        /*HasNSW=*/Bitwidth != 2);
3030 // induction variable's final value instead of the orig. IV itself.
3032 IV
->replaceUsesOutsideBlock(IVFinal
, LoopHeaderBB
);
3034 // Step 3: Rewrite the loop into a countable form, with canonical IV.
3036 // The new canonical induction variable.
3037 Builder
.SetInsertPoint(LoopHeaderBB
, LoopHeaderBB
->begin());
3038 auto *CIV
= Builder
.CreatePHI(Ty
, 2, CurLoop
->getName() + ".iv");
3040 // The induction itself.
3041 Builder
.SetInsertPoint(LoopHeaderBB
, LoopHeaderBB
->getFirstNonPHIIt());
3043 Builder
.CreateAdd(CIV
, ConstantInt::get(Ty
, 1), CIV
->getName() + ".next",
3044 /*HasNUW=*/true, /*HasNSW=*/Bitwidth
!= 2);
3046 // The loop trip count check.
3047 auto *CIVCheck
= Builder
.CreateICmpEQ(CIVNext
, LoopTripCount
,
3048 CurLoop
->getName() + ".ivcheck");
3049 auto *NewIVCheck
= CIVCheck
;
3051 NewIVCheck
= Builder
.CreateNot(CIVCheck
);
3052 NewIVCheck
->takeName(ValShiftedIsZero
);
3055 // The original IV, but rebased to be an offset to the CIV.
3056 auto *IVDePHId
= Builder
.CreateAdd(CIV
, Start
, "", /*HasNUW=*/false,
3057 /*HasNSW=*/true); // FIXME: what about NUW?
3058 IVDePHId
->takeName(IV
);
3060 // The loop terminator.
3061 Builder
.SetInsertPoint(LoopHeaderBB
->getTerminator());
3062 Builder
.CreateCondBr(CIVCheck
, SuccessorBB
, LoopHeaderBB
);
3063 LoopHeaderBB
->getTerminator()->eraseFromParent();
3065 // Populate the IV PHI.
3066 CIV
->addIncoming(ConstantInt::get(Ty
, 0), LoopPreheaderBB
);
3067 CIV
->addIncoming(CIVNext
, LoopHeaderBB
);
3069 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
3070 // loop. The loop would otherwise not be deleted even if it becomes empty.
3072 SE
->forgetLoop(CurLoop
);
3074 // Step 5: Try to cleanup the loop's body somewhat.
3075 IV
->replaceAllUsesWith(IVDePHId
);
3076 IV
->eraseFromParent();
3078 ValShiftedIsZero
->replaceAllUsesWith(NewIVCheck
);
3079 ValShiftedIsZero
->eraseFromParent();
3081 // Other passes will take care of actually deleting the loop if possible.
3083 LLVM_DEBUG(dbgs() << DEBUG_TYPE
" shift-until-zero idiom optimized!\n");
3085 ++NumShiftUntilZero
;