1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass implements an idiom recognizer that transforms simple loops into a
10 // non-loop form. In cases that this kicks in, it can be a significant
13 // If compiling for code size we avoid idiom recognition if the resulting
14 // code could be larger than the code for the original loop. One way this could
15 // happen is if the loop is not removable after idiom recognition due to the
16 // presence of non-idiom instructions. The initial implementation of the
17 // heuristics applies to idioms in multi-block loops.
19 //===----------------------------------------------------------------------===//
23 // Future loop memory idioms to recognize:
24 // memcmp, memmove, strlen, etc.
25 // Future floating point idioms to recognize in -ffast-math mode:
27 // Future integer operation idioms to recognize:
30 // Beware that isel's default lowering for ctpop is highly inefficient for
31 // i64 and larger types when i64 is legal and the value has few bits set. It
32 // would be good to enhance isel to emit a loop for ctpop in this case.
34 // This could recognize common matrix multiplies and dot product idioms and
35 // replace them with calls to BLAS (if linked in??).
37 //===----------------------------------------------------------------------===//
39 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40 #include "llvm/ADT/APInt.h"
41 #include "llvm/ADT/ArrayRef.h"
42 #include "llvm/ADT/DenseMap.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/STLExtras.h"
45 #include "llvm/ADT/SetVector.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/ADT/StringRef.h"
50 #include "llvm/Analysis/AliasAnalysis.h"
51 #include "llvm/Analysis/LoopAccessAnalysis.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/LoopPass.h"
54 #include "llvm/Analysis/MemoryLocation.h"
55 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
56 #include "llvm/Analysis/ScalarEvolution.h"
57 #include "llvm/Analysis/ScalarEvolutionExpander.h"
58 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
59 #include "llvm/Analysis/TargetLibraryInfo.h"
60 #include "llvm/Analysis/TargetTransformInfo.h"
61 #include "llvm/Analysis/ValueTracking.h"
62 #include "llvm/IR/Attributes.h"
63 #include "llvm/IR/BasicBlock.h"
64 #include "llvm/IR/Constant.h"
65 #include "llvm/IR/Constants.h"
66 #include "llvm/IR/DataLayout.h"
67 #include "llvm/IR/DebugLoc.h"
68 #include "llvm/IR/DerivedTypes.h"
69 #include "llvm/IR/Dominators.h"
70 #include "llvm/IR/GlobalValue.h"
71 #include "llvm/IR/GlobalVariable.h"
72 #include "llvm/IR/IRBuilder.h"
73 #include "llvm/IR/InstrTypes.h"
74 #include "llvm/IR/Instruction.h"
75 #include "llvm/IR/Instructions.h"
76 #include "llvm/IR/IntrinsicInst.h"
77 #include "llvm/IR/Intrinsics.h"
78 #include "llvm/IR/LLVMContext.h"
79 #include "llvm/IR/Module.h"
80 #include "llvm/IR/PassManager.h"
81 #include "llvm/IR/PatternMatch.h"
82 #include "llvm/IR/Type.h"
83 #include "llvm/IR/User.h"
84 #include "llvm/IR/Value.h"
85 #include "llvm/IR/ValueHandle.h"
86 #include "llvm/IR/Verifier.h"
87 #include "llvm/Pass.h"
88 #include "llvm/Support/Casting.h"
89 #include "llvm/Support/CommandLine.h"
90 #include "llvm/Support/Debug.h"
91 #include "llvm/Support/raw_ostream.h"
92 #include "llvm/Transforms/Scalar.h"
93 #include "llvm/Transforms/Scalar/LoopPassManager.h"
94 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
95 #include "llvm/Transforms/Utils/BuildLibCalls.h"
96 #include "llvm/Transforms/Utils/Local.h"
97 #include "llvm/Transforms/Utils/LoopUtils.h"
104 using namespace llvm
;
106 #define DEBUG_TYPE "loop-idiom"
108 STATISTIC(NumMemSet
, "Number of memset's formed from loop stores");
109 STATISTIC(NumMemCpy
, "Number of memcpy's formed from loop load+stores");
110 STATISTIC(NumBCmp
, "Number of memcmp's formed from loop 2xload+eq-compare");
112 static cl::opt
<bool> UseLIRCodeSizeHeurs(
113 "use-lir-code-size-heurs",
114 cl::desc("Use loop idiom recognition code size heuristics when compiling"
116 cl::init(true), cl::Hidden
);
120 // FIXME: reinventing the wheel much? Is there a cleaner solution?
121 struct PMAbstraction
{
122 virtual void markLoopAsDeleted(Loop
*L
) = 0;
123 virtual ~PMAbstraction() = default;
125 struct LegacyPMAbstraction
: PMAbstraction
{
127 LegacyPMAbstraction(LPPassManager
&LPM
) : LPM(LPM
) {}
128 virtual ~LegacyPMAbstraction() = default;
129 void markLoopAsDeleted(Loop
*L
) override
{ LPM
.markLoopAsDeleted(*L
); }
131 struct NewPMAbstraction
: PMAbstraction
{
133 NewPMAbstraction(LPMUpdater
&Updater
) : Updater(Updater
) {}
134 virtual ~NewPMAbstraction() = default;
135 void markLoopAsDeleted(Loop
*L
) override
{
136 Updater
.markLoopAsDeleted(*L
, L
->getName());
140 class LoopIdiomRecognize
{
141 Loop
*CurLoop
= nullptr;
146 TargetLibraryInfo
*TLI
;
147 const TargetTransformInfo
*TTI
;
148 const DataLayout
*DL
;
149 PMAbstraction
&LoopDeleter
;
150 OptimizationRemarkEmitter
&ORE
;
151 bool ApplyCodeSizeHeuristics
;
154 explicit LoopIdiomRecognize(AliasAnalysis
*AA
, DominatorTree
*DT
,
155 LoopInfo
*LI
, ScalarEvolution
*SE
,
156 TargetLibraryInfo
*TLI
,
157 const TargetTransformInfo
*TTI
,
158 const DataLayout
*DL
, PMAbstraction
&LoopDeleter
,
159 OptimizationRemarkEmitter
&ORE
)
160 : AA(AA
), DT(DT
), LI(LI
), SE(SE
), TLI(TLI
), TTI(TTI
), DL(DL
),
161 LoopDeleter(LoopDeleter
), ORE(ORE
) {}
163 bool runOnLoop(Loop
*L
);
166 using StoreList
= SmallVector
<StoreInst
*, 8>;
167 using StoreListMap
= MapVector
<Value
*, StoreList
>;
169 StoreListMap StoreRefsForMemset
;
170 StoreListMap StoreRefsForMemsetPattern
;
171 StoreList StoreRefsForMemcpy
;
173 bool HasMemsetPattern
;
178 /// Return code for isLegalStore()
179 enum LegalStoreKind
{
184 UnorderedAtomicMemcpy
,
185 DontUse
// Dummy retval never to be used. Allows catching errors in retval
189 /// \name Countable Loop Idiom Handling
192 bool runOnCountableLoop();
193 bool runOnLoopBlock(BasicBlock
*BB
, const SCEV
*BECount
,
194 SmallVectorImpl
<BasicBlock
*> &ExitBlocks
);
196 void collectStores(BasicBlock
*BB
);
197 LegalStoreKind
isLegalStore(StoreInst
*SI
);
198 enum class ForMemset
{ No
, Yes
};
199 bool processLoopStores(SmallVectorImpl
<StoreInst
*> &SL
, const SCEV
*BECount
,
201 bool processLoopMemSet(MemSetInst
*MSI
, const SCEV
*BECount
);
203 bool processLoopStridedStore(Value
*DestPtr
, unsigned StoreSize
,
204 unsigned StoreAlignment
, Value
*StoredVal
,
205 Instruction
*TheStore
,
206 SmallPtrSetImpl
<Instruction
*> &Stores
,
207 const SCEVAddRecExpr
*Ev
, const SCEV
*BECount
,
208 bool NegStride
, bool IsLoopMemset
= false);
209 bool processLoopStoreOfLoopLoad(StoreInst
*SI
, const SCEV
*BECount
);
210 bool avoidLIRForMultiBlockLoop(bool IsMemset
= false,
211 bool IsLoopMemset
= false);
214 /// \name Noncountable Loop Idiom Handling
217 bool runOnNoncountableLoop();
219 struct CmpLoopStructure
{
220 Value
*BCmpValue
, *LatchCmpValue
;
221 BasicBlock
*HeaderBrEqualBB
, *HeaderBrUnequalBB
;
222 BasicBlock
*LatchBrFinishBB
, *LatchBrContinueBB
;
224 bool matchBCmpLoopStructure(CmpLoopStructure
&CmpLoop
) const;
226 ICmpInst::Predicate BCmpPred
;
227 Value
*LoadSrcA
, *LoadSrcB
;
228 Value
*LoadA
, *LoadB
;
230 bool matchBCmpOfLoads(Value
*BCmpValue
, CmpOfLoads
&CmpOfLoads
) const;
231 bool recognizeBCmpLoopControlFlow(const CmpOfLoads
&CmpOfLoads
,
232 CmpLoopStructure
&CmpLoop
) const;
233 bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes
, CmpOfLoads
&CmpOfLoads
,
234 const SCEV
*&SrcA
, const SCEV
*&SrcB
,
235 const SCEV
*&Iterations
) const;
236 bool detectBCmpIdiom(ICmpInst
*&BCmpInst
, CmpInst
*&LatchCmpInst
,
237 LoadInst
*&LoadA
, LoadInst
*&LoadB
, const SCEV
*&SrcA
,
238 const SCEV
*&SrcB
, const SCEV
*&NBytes
) const;
239 BasicBlock
*transformBCmpControlFlow(ICmpInst
*ComparedEqual
);
240 void transformLoopToBCmp(ICmpInst
*BCmpInst
, CmpInst
*LatchCmpInst
,
241 LoadInst
*LoadA
, LoadInst
*LoadB
, const SCEV
*SrcA
,
242 const SCEV
*SrcB
, const SCEV
*NBytes
);
243 bool recognizeBCmp();
245 bool recognizePopcount();
246 void transformLoopToPopcount(BasicBlock
*PreCondBB
, Instruction
*CntInst
,
247 PHINode
*CntPhi
, Value
*Var
);
248 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
249 void transformLoopToCountable(Intrinsic::ID IntrinID
, BasicBlock
*PreCondBB
,
250 Instruction
*CntInst
, PHINode
*CntPhi
,
251 Value
*Var
, Instruction
*DefX
,
252 const DebugLoc
&DL
, bool ZeroCheck
,
253 bool IsCntPhiUsedOutsideLoop
);
258 class LoopIdiomRecognizeLegacyPass
: public LoopPass
{
262 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID
) {
263 initializeLoopIdiomRecognizeLegacyPassPass(
264 *PassRegistry::getPassRegistry());
267 bool runOnLoop(Loop
*L
, LPPassManager
&LPM
) override
{
271 AliasAnalysis
*AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
272 DominatorTree
*DT
= &getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
273 LoopInfo
*LI
= &getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo();
274 ScalarEvolution
*SE
= &getAnalysis
<ScalarEvolutionWrapperPass
>().getSE();
275 TargetLibraryInfo
*TLI
=
276 &getAnalysis
<TargetLibraryInfoWrapperPass
>().getTLI(
277 *L
->getHeader()->getParent());
278 const TargetTransformInfo
*TTI
=
279 &getAnalysis
<TargetTransformInfoWrapperPass
>().getTTI(
280 *L
->getHeader()->getParent());
281 const DataLayout
*DL
= &L
->getHeader()->getModule()->getDataLayout();
282 LegacyPMAbstraction
LoopDeleter(LPM
);
284 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
285 // pass. Function analyses need to be preserved across loop transformations
286 // but ORE cannot be preserved (see comment before the pass definition).
287 OptimizationRemarkEmitter
ORE(L
->getHeader()->getParent());
289 LoopIdiomRecognize
LIR(AA
, DT
, LI
, SE
, TLI
, TTI
, DL
, LoopDeleter
, ORE
);
290 return LIR
.runOnLoop(L
);
293 /// This transformation requires natural loop information & requires that
294 /// loop preheaders be inserted into the CFG.
295 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
296 AU
.addRequired
<TargetLibraryInfoWrapperPass
>();
297 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
298 getLoopAnalysisUsage(AU
);
302 } // end anonymous namespace
304 char LoopIdiomRecognizeLegacyPass::ID
= 0;
306 PreservedAnalyses
LoopIdiomRecognizePass::run(Loop
&L
, LoopAnalysisManager
&AM
,
307 LoopStandardAnalysisResults
&AR
,
308 LPMUpdater
&Updater
) {
309 const auto *DL
= &L
.getHeader()->getModule()->getDataLayout();
312 AM
.getResult
<FunctionAnalysisManagerLoopProxy
>(L
, AR
).getManager();
313 Function
*F
= L
.getHeader()->getParent();
315 auto *ORE
= FAM
.getCachedResult
<OptimizationRemarkEmitterAnalysis
>(*F
);
316 // FIXME: This should probably be optional rather than required.
319 "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
320 "at a higher level");
322 NewPMAbstraction
LoopDeleter(Updater
);
323 LoopIdiomRecognize
LIR(&AR
.AA
, &AR
.DT
, &AR
.LI
, &AR
.SE
, &AR
.TLI
, &AR
.TTI
, DL
,
325 if (!LIR
.runOnLoop(&L
))
326 return PreservedAnalyses::all();
328 return getLoopPassPreservedAnalyses();
331 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass
, "loop-idiom",
332 "Recognize loop idioms", false, false)
333 INITIALIZE_PASS_DEPENDENCY(LoopPass
)
334 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
335 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
)
336 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
, "loop-idiom",
337 "Recognize loop idioms", false, false)
339 Pass
*llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
341 static void deleteDeadInstruction(Instruction
*I
) {
342 I
->replaceAllUsesWith(UndefValue::get(I
->getType()));
343 I
->eraseFromParent();
346 //===----------------------------------------------------------------------===//
348 // Implementation of LoopIdiomRecognize
350 //===----------------------------------------------------------------------===//
352 bool LoopIdiomRecognize::runOnLoop(Loop
*L
) {
354 // If the loop could not be converted to canonical form, it must have an
355 // indirectbr in it, just give up.
356 if (!L
->getLoopPreheader())
359 // Disable loop idiom recognition if the function's name is a common idiom.
360 StringRef Name
= L
->getHeader()->getParent()->getName();
361 if (Name
== "memset" || Name
== "memcpy" || Name
== "memcmp" ||
365 // Determine if code size heuristics need to be applied.
366 ApplyCodeSizeHeuristics
=
367 L
->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs
;
369 HasMemset
= TLI
->has(LibFunc_memset
);
370 HasMemsetPattern
= TLI
->has(LibFunc_memset_pattern16
);
371 HasMemcpy
= TLI
->has(LibFunc_memcpy
);
372 HasMemCmp
= TLI
->has(LibFunc_memcmp
);
373 HasBCmp
= TLI
->has(LibFunc_bcmp
);
375 if (HasMemset
|| HasMemsetPattern
|| HasMemcpy
|| HasMemCmp
|| HasBCmp
)
376 if (SE
->hasLoopInvariantBackedgeTakenCount(L
))
377 return runOnCountableLoop();
379 return runOnNoncountableLoop();
382 bool LoopIdiomRecognize::runOnCountableLoop() {
383 const SCEV
*BECount
= SE
->getBackedgeTakenCount(CurLoop
);
384 assert(!isa
<SCEVCouldNotCompute
>(BECount
) &&
385 "runOnCountableLoop() called on a loop without a predictable"
386 "backedge-taken count");
388 // If this loop executes exactly one time, then it should be peeled, not
389 // optimized by this pass.
390 if (const SCEVConstant
*BECst
= dyn_cast
<SCEVConstant
>(BECount
))
391 if (BECst
->getAPInt() == 0)
394 SmallVector
<BasicBlock
*, 8> ExitBlocks
;
395 CurLoop
->getUniqueExitBlocks(ExitBlocks
);
397 LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Scanning: F["
398 << CurLoop
->getHeader()->getParent()->getName()
399 << "] Countable Loop %" << CurLoop
->getHeader()->getName()
402 bool MadeChange
= false;
404 // The following transforms hoist stores/memsets into the loop pre-header.
405 // Give up if the loop has instructions may throw.
406 SimpleLoopSafetyInfo SafetyInfo
;
407 SafetyInfo
.computeLoopSafetyInfo(CurLoop
);
408 if (SafetyInfo
.anyBlockMayThrow())
411 // Scan all the blocks in the loop that are not in subloops.
412 for (auto *BB
: CurLoop
->getBlocks()) {
413 // Ignore blocks in subloops.
414 if (LI
->getLoopFor(BB
) != CurLoop
)
417 MadeChange
|= runOnLoopBlock(BB
, BECount
, ExitBlocks
);
422 static APInt
getStoreStride(const SCEVAddRecExpr
*StoreEv
) {
423 const SCEVConstant
*ConstStride
= cast
<SCEVConstant
>(StoreEv
->getOperand(1));
424 return ConstStride
->getAPInt();
427 /// getMemSetPatternValue - If a strided store of the specified value is safe to
428 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
429 /// be passed in. Otherwise, return null.
431 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
432 /// just replicate their input array and then pass on to memset_pattern16.
433 static Constant
*getMemSetPatternValue(Value
*V
, const DataLayout
*DL
) {
434 // FIXME: This could check for UndefValue because it can be merged into any
435 // other valid pattern.
437 // If the value isn't a constant, we can't promote it to being in a constant
438 // array. We could theoretically do a store to an alloca or something, but
439 // that doesn't seem worthwhile.
440 Constant
*C
= dyn_cast
<Constant
>(V
);
444 // Only handle simple values that are a power of two bytes in size.
445 uint64_t Size
= DL
->getTypeSizeInBits(V
->getType());
446 if (Size
== 0 || (Size
& 7) || (Size
& (Size
- 1)))
449 // Don't care enough about darwin/ppc to implement this.
450 if (DL
->isBigEndian())
453 // Convert to size in bytes.
456 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
457 // if the top and bottom are the same (e.g. for vectors and large integers).
461 // If the constant is exactly 16 bytes, just use it.
465 // Otherwise, we'll use an array of the constants.
466 unsigned ArraySize
= 16 / Size
;
467 ArrayType
*AT
= ArrayType::get(V
->getType(), ArraySize
);
468 return ConstantArray::get(AT
, std::vector
<Constant
*>(ArraySize
, C
));
471 LoopIdiomRecognize::LegalStoreKind
472 LoopIdiomRecognize::isLegalStore(StoreInst
*SI
) {
473 // Don't touch volatile stores.
474 if (SI
->isVolatile())
475 return LegalStoreKind::None
;
476 // We only want simple or unordered-atomic stores.
477 if (!SI
->isUnordered())
478 return LegalStoreKind::None
;
480 // Don't convert stores of non-integral pointer types to memsets (which stores
482 if (DL
->isNonIntegralPointerType(SI
->getValueOperand()->getType()))
483 return LegalStoreKind::None
;
485 // Avoid merging nontemporal stores.
486 if (SI
->getMetadata(LLVMContext::MD_nontemporal
))
487 return LegalStoreKind::None
;
489 Value
*StoredVal
= SI
->getValueOperand();
490 Value
*StorePtr
= SI
->getPointerOperand();
492 // Reject stores that are so large that they overflow an unsigned.
493 uint64_t SizeInBits
= DL
->getTypeSizeInBits(StoredVal
->getType());
494 if ((SizeInBits
& 7) || (SizeInBits
>> 32) != 0)
495 return LegalStoreKind::None
;
497 // See if the pointer expression is an AddRec like {base,+,1} on the current
498 // loop, which indicates a strided store. If we have something else, it's a
499 // random store we can't handle.
500 const SCEVAddRecExpr
*StoreEv
=
501 dyn_cast
<SCEVAddRecExpr
>(SE
->getSCEV(StorePtr
));
502 if (!StoreEv
|| StoreEv
->getLoop() != CurLoop
|| !StoreEv
->isAffine())
503 return LegalStoreKind::None
;
505 // Check to see if we have a constant stride.
506 if (!isa
<SCEVConstant
>(StoreEv
->getOperand(1)))
507 return LegalStoreKind::None
;
509 // See if the store can be turned into a memset.
511 // If the stored value is a byte-wise value (like i32 -1), then it may be
512 // turned into a memset of i8 -1, assuming that all the consecutive bytes
513 // are stored. A store of i32 0x01020304 can never be turned into a memset,
514 // but it can be turned into memset_pattern if the target supports it.
515 Value
*SplatValue
= isBytewiseValue(StoredVal
, *DL
);
516 Constant
*PatternValue
= nullptr;
518 // Note: memset and memset_pattern on unordered-atomic is yet not supported
519 bool UnorderedAtomic
= SI
->isUnordered() && !SI
->isSimple();
521 // If we're allowed to form a memset, and the stored value would be
522 // acceptable for memset, use it.
523 if (!UnorderedAtomic
&& HasMemset
&& SplatValue
&&
524 // Verify that the stored value is loop invariant. If not, we can't
525 // promote the memset.
526 CurLoop
->isLoopInvariant(SplatValue
)) {
527 // It looks like we can use SplatValue.
528 return LegalStoreKind::Memset
;
529 } else if (!UnorderedAtomic
&& HasMemsetPattern
&&
530 // Don't create memset_pattern16s with address spaces.
531 StorePtr
->getType()->getPointerAddressSpace() == 0 &&
532 (PatternValue
= getMemSetPatternValue(StoredVal
, DL
))) {
533 // It looks like we can use PatternValue!
534 return LegalStoreKind::MemsetPattern
;
537 // Otherwise, see if the store can be turned into a memcpy.
539 // Check to see if the stride matches the size of the store. If so, then we
540 // know that every byte is touched in the loop.
541 APInt Stride
= getStoreStride(StoreEv
);
542 unsigned StoreSize
= DL
->getTypeStoreSize(SI
->getValueOperand()->getType());
543 if (StoreSize
!= Stride
&& StoreSize
!= -Stride
)
544 return LegalStoreKind::None
;
546 // The store must be feeding a non-volatile load.
547 LoadInst
*LI
= dyn_cast
<LoadInst
>(SI
->getValueOperand());
549 // Only allow non-volatile loads
550 if (!LI
|| LI
->isVolatile())
551 return LegalStoreKind::None
;
552 // Only allow simple or unordered-atomic loads
553 if (!LI
->isUnordered())
554 return LegalStoreKind::None
;
556 // See if the pointer expression is an AddRec like {base,+,1} on the current
557 // loop, which indicates a strided load. If we have something else, it's a
558 // random load we can't handle.
559 const SCEVAddRecExpr
*LoadEv
=
560 dyn_cast
<SCEVAddRecExpr
>(SE
->getSCEV(LI
->getPointerOperand()));
561 if (!LoadEv
|| LoadEv
->getLoop() != CurLoop
|| !LoadEv
->isAffine())
562 return LegalStoreKind::None
;
564 // The store and load must share the same stride.
565 if (StoreEv
->getOperand(1) != LoadEv
->getOperand(1))
566 return LegalStoreKind::None
;
568 // Success. This store can be converted into a memcpy.
569 UnorderedAtomic
= UnorderedAtomic
|| LI
->isAtomic();
570 return UnorderedAtomic
? LegalStoreKind::UnorderedAtomicMemcpy
571 : LegalStoreKind::Memcpy
;
573 // This store can't be transformed into a memset/memcpy.
574 return LegalStoreKind::None
;
577 void LoopIdiomRecognize::collectStores(BasicBlock
*BB
) {
578 StoreRefsForMemset
.clear();
579 StoreRefsForMemsetPattern
.clear();
580 StoreRefsForMemcpy
.clear();
581 for (Instruction
&I
: *BB
) {
582 StoreInst
*SI
= dyn_cast
<StoreInst
>(&I
);
586 // Make sure this is a strided store with a constant stride.
587 switch (isLegalStore(SI
)) {
588 case LegalStoreKind::None
:
591 case LegalStoreKind::Memset
: {
592 // Find the base pointer.
593 Value
*Ptr
= GetUnderlyingObject(SI
->getPointerOperand(), *DL
);
594 StoreRefsForMemset
[Ptr
].push_back(SI
);
596 case LegalStoreKind::MemsetPattern
: {
597 // Find the base pointer.
598 Value
*Ptr
= GetUnderlyingObject(SI
->getPointerOperand(), *DL
);
599 StoreRefsForMemsetPattern
[Ptr
].push_back(SI
);
601 case LegalStoreKind::Memcpy
:
602 case LegalStoreKind::UnorderedAtomicMemcpy
:
603 StoreRefsForMemcpy
.push_back(SI
);
606 assert(false && "unhandled return value");
612 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
613 /// with the specified backedge count. This block is known to be in the current
614 /// loop and not in any subloops.
615 bool LoopIdiomRecognize::runOnLoopBlock(
616 BasicBlock
*BB
, const SCEV
*BECount
,
617 SmallVectorImpl
<BasicBlock
*> &ExitBlocks
) {
618 // We can only promote stores in this block if they are unconditionally
619 // executed in the loop. For a block to be unconditionally executed, it has
620 // to dominate all the exit blocks of the loop. Verify this now.
621 for (unsigned i
= 0, e
= ExitBlocks
.size(); i
!= e
; ++i
)
622 if (!DT
->dominates(BB
, ExitBlocks
[i
]))
625 bool MadeChange
= false;
626 // Look for store instructions, which may be optimized to memset/memcpy.
629 // Look for a single store or sets of stores with a common base, which can be
630 // optimized into a memset (memset_pattern). The latter most commonly happens
631 // with structs and handunrolled loops.
632 for (auto &SL
: StoreRefsForMemset
)
633 MadeChange
|= processLoopStores(SL
.second
, BECount
, ForMemset::Yes
);
635 for (auto &SL
: StoreRefsForMemsetPattern
)
636 MadeChange
|= processLoopStores(SL
.second
, BECount
, ForMemset::No
);
638 // Optimize the store into a memcpy, if it feeds an similarly strided load.
639 for (auto &SI
: StoreRefsForMemcpy
)
640 MadeChange
|= processLoopStoreOfLoopLoad(SI
, BECount
);
642 for (BasicBlock::iterator I
= BB
->begin(), E
= BB
->end(); I
!= E
;) {
643 Instruction
*Inst
= &*I
++;
644 // Look for memset instructions, which may be optimized to a larger memset.
645 if (MemSetInst
*MSI
= dyn_cast
<MemSetInst
>(Inst
)) {
646 WeakTrackingVH
InstPtr(&*I
);
647 if (!processLoopMemSet(MSI
, BECount
))
651 // If processing the memset invalidated our iterator, start over from the
662 /// See if this store(s) can be promoted to a memset.
663 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl
<StoreInst
*> &SL
,
664 const SCEV
*BECount
, ForMemset For
) {
665 // Try to find consecutive stores that can be transformed into memsets.
666 SetVector
<StoreInst
*> Heads
, Tails
;
667 SmallDenseMap
<StoreInst
*, StoreInst
*> ConsecutiveChain
;
669 // Do a quadratic search on all of the given stores and find
670 // all of the pairs of stores that follow each other.
671 SmallVector
<unsigned, 16> IndexQueue
;
672 for (unsigned i
= 0, e
= SL
.size(); i
< e
; ++i
) {
673 assert(SL
[i
]->isSimple() && "Expected only non-volatile stores.");
675 Value
*FirstStoredVal
= SL
[i
]->getValueOperand();
676 Value
*FirstStorePtr
= SL
[i
]->getPointerOperand();
677 const SCEVAddRecExpr
*FirstStoreEv
=
678 cast
<SCEVAddRecExpr
>(SE
->getSCEV(FirstStorePtr
));
679 APInt FirstStride
= getStoreStride(FirstStoreEv
);
680 unsigned FirstStoreSize
= DL
->getTypeStoreSize(SL
[i
]->getValueOperand()->getType());
682 // See if we can optimize just this store in isolation.
683 if (FirstStride
== FirstStoreSize
|| -FirstStride
== FirstStoreSize
) {
688 Value
*FirstSplatValue
= nullptr;
689 Constant
*FirstPatternValue
= nullptr;
691 if (For
== ForMemset::Yes
)
692 FirstSplatValue
= isBytewiseValue(FirstStoredVal
, *DL
);
694 FirstPatternValue
= getMemSetPatternValue(FirstStoredVal
, DL
);
696 assert((FirstSplatValue
|| FirstPatternValue
) &&
697 "Expected either splat value or pattern value.");
700 // If a store has multiple consecutive store candidates, search Stores
701 // array according to the sequence: from i+1 to e, then from i-1 to 0.
702 // This is because usually pairing with immediate succeeding or preceding
703 // candidate create the best chance to find memset opportunity.
705 for (j
= i
+ 1; j
< e
; ++j
)
706 IndexQueue
.push_back(j
);
707 for (j
= i
; j
> 0; --j
)
708 IndexQueue
.push_back(j
- 1);
710 for (auto &k
: IndexQueue
) {
711 assert(SL
[k
]->isSimple() && "Expected only non-volatile stores.");
712 Value
*SecondStorePtr
= SL
[k
]->getPointerOperand();
713 const SCEVAddRecExpr
*SecondStoreEv
=
714 cast
<SCEVAddRecExpr
>(SE
->getSCEV(SecondStorePtr
));
715 APInt SecondStride
= getStoreStride(SecondStoreEv
);
717 if (FirstStride
!= SecondStride
)
720 Value
*SecondStoredVal
= SL
[k
]->getValueOperand();
721 Value
*SecondSplatValue
= nullptr;
722 Constant
*SecondPatternValue
= nullptr;
724 if (For
== ForMemset::Yes
)
725 SecondSplatValue
= isBytewiseValue(SecondStoredVal
, *DL
);
727 SecondPatternValue
= getMemSetPatternValue(SecondStoredVal
, DL
);
729 assert((SecondSplatValue
|| SecondPatternValue
) &&
730 "Expected either splat value or pattern value.");
732 if (isConsecutiveAccess(SL
[i
], SL
[k
], *DL
, *SE
, false)) {
733 if (For
== ForMemset::Yes
) {
734 if (isa
<UndefValue
>(FirstSplatValue
))
735 FirstSplatValue
= SecondSplatValue
;
736 if (FirstSplatValue
!= SecondSplatValue
)
739 if (isa
<UndefValue
>(FirstPatternValue
))
740 FirstPatternValue
= SecondPatternValue
;
741 if (FirstPatternValue
!= SecondPatternValue
)
746 ConsecutiveChain
[SL
[i
]] = SL
[k
];
752 // We may run into multiple chains that merge into a single chain. We mark the
753 // stores that we transformed so that we don't visit the same store twice.
754 SmallPtrSet
<Value
*, 16> TransformedStores
;
755 bool Changed
= false;
757 // For stores that start but don't end a link in the chain:
758 for (SetVector
<StoreInst
*>::iterator it
= Heads
.begin(), e
= Heads
.end();
760 if (Tails
.count(*it
))
763 // We found a store instr that starts a chain. Now follow the chain and try
765 SmallPtrSet
<Instruction
*, 8> AdjacentStores
;
768 StoreInst
*HeadStore
= I
;
769 unsigned StoreSize
= 0;
771 // Collect the chain into a list.
772 while (Tails
.count(I
) || Heads
.count(I
)) {
773 if (TransformedStores
.count(I
))
775 AdjacentStores
.insert(I
);
777 StoreSize
+= DL
->getTypeStoreSize(I
->getValueOperand()->getType());
778 // Move to the next value in the chain.
779 I
= ConsecutiveChain
[I
];
782 Value
*StoredVal
= HeadStore
->getValueOperand();
783 Value
*StorePtr
= HeadStore
->getPointerOperand();
784 const SCEVAddRecExpr
*StoreEv
= cast
<SCEVAddRecExpr
>(SE
->getSCEV(StorePtr
));
785 APInt Stride
= getStoreStride(StoreEv
);
787 // Check to see if the stride matches the size of the stores. If so, then
788 // we know that every byte is touched in the loop.
789 if (StoreSize
!= Stride
&& StoreSize
!= -Stride
)
792 bool NegStride
= StoreSize
== -Stride
;
794 if (processLoopStridedStore(StorePtr
, StoreSize
, HeadStore
->getAlignment(),
795 StoredVal
, HeadStore
, AdjacentStores
, StoreEv
,
796 BECount
, NegStride
)) {
797 TransformedStores
.insert(AdjacentStores
.begin(), AdjacentStores
.end());
805 /// processLoopMemSet - See if this memset can be promoted to a large memset.
806 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst
*MSI
,
807 const SCEV
*BECount
) {
808 // We can only handle non-volatile memsets with a constant size.
809 if (MSI
->isVolatile() || !isa
<ConstantInt
>(MSI
->getLength()))
812 // If we're not allowed to hack on memset, we fail.
816 Value
*Pointer
= MSI
->getDest();
818 // See if the pointer expression is an AddRec like {base,+,1} on the current
819 // loop, which indicates a strided store. If we have something else, it's a
820 // random store we can't handle.
821 const SCEVAddRecExpr
*Ev
= dyn_cast
<SCEVAddRecExpr
>(SE
->getSCEV(Pointer
));
822 if (!Ev
|| Ev
->getLoop() != CurLoop
|| !Ev
->isAffine())
825 // Reject memsets that are so large that they overflow an unsigned.
826 uint64_t SizeInBytes
= cast
<ConstantInt
>(MSI
->getLength())->getZExtValue();
827 if ((SizeInBytes
>> 32) != 0)
830 // Check to see if the stride matches the size of the memset. If so, then we
831 // know that every byte is touched in the loop.
832 const SCEVConstant
*ConstStride
= dyn_cast
<SCEVConstant
>(Ev
->getOperand(1));
836 APInt Stride
= ConstStride
->getAPInt();
837 if (SizeInBytes
!= Stride
&& SizeInBytes
!= -Stride
)
840 // Verify that the memset value is loop invariant. If not, we can't promote
842 Value
*SplatValue
= MSI
->getValue();
843 if (!SplatValue
|| !CurLoop
->isLoopInvariant(SplatValue
))
846 SmallPtrSet
<Instruction
*, 1> MSIs
;
848 bool NegStride
= SizeInBytes
== -Stride
;
849 return processLoopStridedStore(Pointer
, (unsigned)SizeInBytes
,
850 MSI
->getDestAlignment(), SplatValue
, MSI
, MSIs
,
851 Ev
, BECount
, NegStride
, /*IsLoopMemset=*/true);
854 /// mayLoopAccessLocation - Return true if the specified loop might access the
855 /// specified pointer location, which is a loop-strided access. The 'Access'
856 /// argument specifies what the verboten forms of access are (read or write).
858 mayLoopAccessLocation(Value
*Ptr
, ModRefInfo Access
, Loop
*L
,
859 const SCEV
*BECount
, unsigned StoreSize
,
861 SmallPtrSetImpl
<Instruction
*> &IgnoredStores
) {
862 // Get the location that may be stored across the loop. Since the access is
863 // strided positively through memory, we say that the modified location starts
864 // at the pointer and has infinite size.
865 LocationSize AccessSize
= LocationSize::unknown();
867 // If the loop iterates a fixed number of times, we can refine the access size
868 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
869 if (const SCEVConstant
*BECst
= dyn_cast
<SCEVConstant
>(BECount
))
870 AccessSize
= LocationSize::precise((BECst
->getValue()->getZExtValue() + 1) *
873 // TODO: For this to be really effective, we have to dive into the pointer
874 // operand in the store. Store to &A[i] of 100 will always return may alias
875 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
876 // which will then no-alias a store to &A[100].
877 MemoryLocation
StoreLoc(Ptr
, AccessSize
);
879 for (Loop::block_iterator BI
= L
->block_begin(), E
= L
->block_end(); BI
!= E
;
881 for (Instruction
&I
: **BI
)
882 if (IgnoredStores
.count(&I
) == 0 &&
884 intersectModRef(AA
.getModRefInfo(&I
, StoreLoc
), Access
)))
890 // If we have a negative stride, Start refers to the end of the memory location
891 // we're trying to memset. Therefore, we need to recompute the base pointer,
892 // which is just Start - BECount*Size.
893 static const SCEV
*getStartForNegStride(const SCEV
*Start
, const SCEV
*BECount
,
894 Type
*IntPtr
, unsigned StoreSize
,
895 ScalarEvolution
*SE
) {
896 const SCEV
*Index
= SE
->getTruncateOrZeroExtend(BECount
, IntPtr
);
898 Index
= SE
->getMulExpr(Index
, SE
->getConstant(IntPtr
, StoreSize
),
900 return SE
->getMinusSCEV(Start
, Index
);
903 /// Compute the number of bytes as a SCEV from the backedge taken count.
905 /// This also maps the SCEV into the provided type and tries to handle the
906 /// computation in a way that will fold cleanly.
907 static const SCEV
*getNumBytes(const SCEV
*BECount
, Type
*IntPtr
,
908 unsigned StoreSize
, Loop
*CurLoop
,
909 const DataLayout
*DL
, ScalarEvolution
*SE
) {
910 const SCEV
*NumBytesS
;
911 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
912 // pointer size if it isn't already.
914 // If we're going to need to zero extend the BE count, check if we can add
915 // one to it prior to zero extending without overflow. Provided this is safe,
916 // it allows better simplification of the +1.
917 if (DL
->getTypeSizeInBits(BECount
->getType()) <
918 DL
->getTypeSizeInBits(IntPtr
) &&
919 SE
->isLoopEntryGuardedByCond(
920 CurLoop
, ICmpInst::ICMP_NE
, BECount
,
921 SE
->getNegativeSCEV(SE
->getOne(BECount
->getType())))) {
922 NumBytesS
= SE
->getZeroExtendExpr(
923 SE
->getAddExpr(BECount
, SE
->getOne(BECount
->getType()), SCEV::FlagNUW
),
926 NumBytesS
= SE
->getAddExpr(SE
->getTruncateOrZeroExtend(BECount
, IntPtr
),
927 SE
->getOne(IntPtr
), SCEV::FlagNUW
);
930 // And scale it based on the store size.
931 if (StoreSize
!= 1) {
932 NumBytesS
= SE
->getMulExpr(NumBytesS
, SE
->getConstant(IntPtr
, StoreSize
),
938 /// processLoopStridedStore - We see a strided store of some value. If we can
939 /// transform this into a memset or memset_pattern in the loop preheader, do so.
940 bool LoopIdiomRecognize::processLoopStridedStore(
941 Value
*DestPtr
, unsigned StoreSize
, unsigned StoreAlignment
,
942 Value
*StoredVal
, Instruction
*TheStore
,
943 SmallPtrSetImpl
<Instruction
*> &Stores
, const SCEVAddRecExpr
*Ev
,
944 const SCEV
*BECount
, bool NegStride
, bool IsLoopMemset
) {
945 Value
*SplatValue
= isBytewiseValue(StoredVal
, *DL
);
946 Constant
*PatternValue
= nullptr;
949 PatternValue
= getMemSetPatternValue(StoredVal
, DL
);
951 assert((SplatValue
|| PatternValue
) &&
952 "Expected either splat value or pattern value.");
954 // The trip count of the loop and the base pointer of the addrec SCEV is
955 // guaranteed to be loop invariant, which means that it should dominate the
956 // header. This allows us to insert code for it in the preheader.
957 unsigned DestAS
= DestPtr
->getType()->getPointerAddressSpace();
958 BasicBlock
*Preheader
= CurLoop
->getLoopPreheader();
959 IRBuilder
<> Builder(Preheader
->getTerminator());
960 SCEVExpander
Expander(*SE
, *DL
, "loop-idiom");
962 Type
*DestInt8PtrTy
= Builder
.getInt8PtrTy(DestAS
);
963 Type
*IntPtr
= Builder
.getIntPtrTy(*DL
, DestAS
);
965 const SCEV
*Start
= Ev
->getStart();
966 // Handle negative strided loops.
968 Start
= getStartForNegStride(Start
, BECount
, IntPtr
, StoreSize
, SE
);
970 // TODO: ideally we should still be able to generate memset if SCEV expander
971 // is taught to generate the dependencies at the latest point.
972 if (!isSafeToExpand(Start
, *SE
))
975 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
976 // this into a memset in the loop preheader now if we want. However, this
977 // would be unsafe to do if there is anything else in the loop that may read
978 // or write to the aliased location. Check for any overlap by generating the
979 // base pointer and checking the region.
981 Expander
.expandCodeFor(Start
, DestInt8PtrTy
, Preheader
->getTerminator());
982 if (mayLoopAccessLocation(BasePtr
, ModRefInfo::ModRef
, CurLoop
, BECount
,
983 StoreSize
, *AA
, Stores
)) {
985 // If we generated new code for the base pointer, clean up.
986 RecursivelyDeleteTriviallyDeadInstructions(BasePtr
, TLI
);
990 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset
))
993 // Okay, everything looks good, insert the memset.
995 const SCEV
*NumBytesS
=
996 getNumBytes(BECount
, IntPtr
, StoreSize
, CurLoop
, DL
, SE
);
998 // TODO: ideally we should still be able to generate memset if SCEV expander
999 // is taught to generate the dependencies at the latest point.
1000 if (!isSafeToExpand(NumBytesS
, *SE
))
1004 Expander
.expandCodeFor(NumBytesS
, IntPtr
, Preheader
->getTerminator());
1009 Builder
.CreateMemSet(BasePtr
, SplatValue
, NumBytes
, StoreAlignment
);
1011 // Everything is emitted in default address space
1012 Type
*Int8PtrTy
= DestInt8PtrTy
;
1014 Module
*M
= TheStore
->getModule();
1015 StringRef FuncName
= "memset_pattern16";
1016 FunctionCallee MSP
= M
->getOrInsertFunction(FuncName
, Builder
.getVoidTy(),
1017 Int8PtrTy
, Int8PtrTy
, IntPtr
);
1018 inferLibFuncAttributes(M
, FuncName
, *TLI
);
1020 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1021 // an constant array of 16-bytes. Plop the value into a mergable global.
1022 GlobalVariable
*GV
= new GlobalVariable(*M
, PatternValue
->getType(), true,
1023 GlobalValue::PrivateLinkage
,
1024 PatternValue
, ".memset_pattern");
1025 GV
->setUnnamedAddr(GlobalValue::UnnamedAddr::Global
); // Ok to merge these.
1026 GV
->setAlignment(Align(16));
1027 Value
*PatternPtr
= ConstantExpr::getBitCast(GV
, Int8PtrTy
);
1028 NewCall
= Builder
.CreateCall(MSP
, {BasePtr
, PatternPtr
, NumBytes
});
1031 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall
<< "\n"
1032 << " from store to: " << *Ev
<< " at: " << *TheStore
1034 NewCall
->setDebugLoc(TheStore
->getDebugLoc());
1037 return OptimizationRemark(DEBUG_TYPE
, "ProcessLoopStridedStore",
1038 NewCall
->getDebugLoc(), Preheader
)
1039 << "Transformed loop-strided store into a call to "
1040 << ore::NV("NewFunction", NewCall
->getCalledFunction())
1044 // Okay, the memset has been formed. Zap the original store and anything that
1046 for (auto *I
: Stores
)
1047 deleteDeadInstruction(I
);
1052 /// If the stored value is a strided load in the same loop with the same stride
1053 /// this may be transformable into a memcpy. This kicks in for stuff like
1054 /// for (i) A[i] = B[i];
1055 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst
*SI
,
1056 const SCEV
*BECount
) {
1057 assert(SI
->isUnordered() && "Expected only non-volatile non-ordered stores.");
1059 Value
*StorePtr
= SI
->getPointerOperand();
1060 const SCEVAddRecExpr
*StoreEv
= cast
<SCEVAddRecExpr
>(SE
->getSCEV(StorePtr
));
1061 APInt Stride
= getStoreStride(StoreEv
);
1062 unsigned StoreSize
= DL
->getTypeStoreSize(SI
->getValueOperand()->getType());
1063 bool NegStride
= StoreSize
== -Stride
;
1065 // The store must be feeding a non-volatile load.
1066 LoadInst
*LI
= cast
<LoadInst
>(SI
->getValueOperand());
1067 assert(LI
->isUnordered() && "Expected only non-volatile non-ordered loads.");
1069 // See if the pointer expression is an AddRec like {base,+,1} on the current
1070 // loop, which indicates a strided load. If we have something else, it's a
1071 // random load we can't handle.
1072 const SCEVAddRecExpr
*LoadEv
=
1073 cast
<SCEVAddRecExpr
>(SE
->getSCEV(LI
->getPointerOperand()));
1075 // The trip count of the loop and the base pointer of the addrec SCEV is
1076 // guaranteed to be loop invariant, which means that it should dominate the
1077 // header. This allows us to insert code for it in the preheader.
1078 BasicBlock
*Preheader
= CurLoop
->getLoopPreheader();
1079 IRBuilder
<> Builder(Preheader
->getTerminator());
1080 SCEVExpander
Expander(*SE
, *DL
, "loop-idiom");
1082 const SCEV
*StrStart
= StoreEv
->getStart();
1083 unsigned StrAS
= SI
->getPointerAddressSpace();
1084 Type
*IntPtrTy
= Builder
.getIntPtrTy(*DL
, StrAS
);
1086 // Handle negative strided loops.
1088 StrStart
= getStartForNegStride(StrStart
, BECount
, IntPtrTy
, StoreSize
, SE
);
1090 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1091 // this into a memcpy in the loop preheader now if we want. However, this
1092 // would be unsafe to do if there is anything else in the loop that may read
1093 // or write the memory region we're storing to. This includes the load that
1094 // feeds the stores. Check for an alias by generating the base address and
1095 // checking everything.
1096 Value
*StoreBasePtr
= Expander
.expandCodeFor(
1097 StrStart
, Builder
.getInt8PtrTy(StrAS
), Preheader
->getTerminator());
1099 SmallPtrSet
<Instruction
*, 1> Stores
;
1101 if (mayLoopAccessLocation(StoreBasePtr
, ModRefInfo::ModRef
, CurLoop
, BECount
,
1102 StoreSize
, *AA
, Stores
)) {
1104 // If we generated new code for the base pointer, clean up.
1105 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr
, TLI
);
1109 const SCEV
*LdStart
= LoadEv
->getStart();
1110 unsigned LdAS
= LI
->getPointerAddressSpace();
1112 // Handle negative strided loops.
1114 LdStart
= getStartForNegStride(LdStart
, BECount
, IntPtrTy
, StoreSize
, SE
);
1116 // For a memcpy, we have to make sure that the input array is not being
1117 // mutated by the loop.
1118 Value
*LoadBasePtr
= Expander
.expandCodeFor(
1119 LdStart
, Builder
.getInt8PtrTy(LdAS
), Preheader
->getTerminator());
1121 if (mayLoopAccessLocation(LoadBasePtr
, ModRefInfo::Mod
, CurLoop
, BECount
,
1122 StoreSize
, *AA
, Stores
)) {
1124 // If we generated new code for the base pointer, clean up.
1125 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr
, TLI
);
1126 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr
, TLI
);
1130 if (avoidLIRForMultiBlockLoop())
1133 // Okay, everything is safe, we can transform this!
1135 const SCEV
*NumBytesS
=
1136 getNumBytes(BECount
, IntPtrTy
, StoreSize
, CurLoop
, DL
, SE
);
1139 Expander
.expandCodeFor(NumBytesS
, IntPtrTy
, Preheader
->getTerminator());
1141 CallInst
*NewCall
= nullptr;
1142 // Check whether to generate an unordered atomic memcpy:
1143 // If the load or store are atomic, then they must necessarily be unordered
1144 // by previous checks.
1145 if (!SI
->isAtomic() && !LI
->isAtomic())
1146 NewCall
= Builder
.CreateMemCpy(StoreBasePtr
, SI
->getAlignment(),
1147 LoadBasePtr
, LI
->getAlignment(), NumBytes
);
1149 // We cannot allow unaligned ops for unordered load/store, so reject
1150 // anything where the alignment isn't at least the element size.
1151 unsigned Align
= std::min(SI
->getAlignment(), LI
->getAlignment());
1152 if (Align
< StoreSize
)
1155 // If the element.atomic memcpy is not lowered into explicit
1156 // loads/stores later, then it will be lowered into an element-size
1157 // specific lib call. If the lib call doesn't exist for our store size, then
1158 // we shouldn't generate the memcpy.
1159 if (StoreSize
> TTI
->getAtomicMemIntrinsicMaxElementSize())
1163 // Note that unordered atomic loads/stores are *required* by the spec to
1164 // have an alignment but non-atomic loads/stores may not.
1165 NewCall
= Builder
.CreateElementUnorderedAtomicMemCpy(
1166 StoreBasePtr
, SI
->getAlignment(), LoadBasePtr
, LI
->getAlignment(),
1167 NumBytes
, StoreSize
);
1169 NewCall
->setDebugLoc(SI
->getDebugLoc());
1171 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall
<< "\n"
1172 << " from load ptr=" << *LoadEv
<< " at: " << *LI
<< "\n"
1173 << " from store ptr=" << *StoreEv
<< " at: " << *SI
1177 return OptimizationRemark(DEBUG_TYPE
, "ProcessLoopStoreOfLoopLoad",
1178 NewCall
->getDebugLoc(), Preheader
)
1179 << "Formed a call to "
1180 << ore::NV("NewFunction", NewCall
->getCalledFunction())
1184 // Okay, the memcpy has been formed. Zap the original store and anything that
1186 deleteDeadInstruction(SI
);
1191 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1192 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1194 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset
,
1195 bool IsLoopMemset
) {
1196 if (ApplyCodeSizeHeuristics
&& CurLoop
->getNumBlocks() > 1) {
1197 if (!CurLoop
->getParentLoop() && (!IsMemset
|| !IsLoopMemset
)) {
1198 LLVM_DEBUG(dbgs() << " " << CurLoop
->getHeader()->getParent()->getName()
1199 << " : LIR " << (IsMemset
? "Memset" : "Memcpy")
1200 << " avoided: multi-block top-level loop\n");
1208 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1209 LLVM_DEBUG(dbgs() << DEBUG_TYPE
" Scanning: F["
1210 << CurLoop
->getHeader()->getParent()->getName()
1211 << "] Noncountable Loop %"
1212 << CurLoop
->getHeader()->getName() << "\n");
1214 return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
1217 /// Check if the given conditional branch is based on the comparison between
1218 /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1219 /// true), the control yields to the loop entry. If the branch matches the
1220 /// behavior, the variable involved in the comparison is returned. This function
1221 /// will be called to see if the precondition and postcondition of the loop are
1222 /// in desirable form.
1223 static Value
*matchCondition(BranchInst
*BI
, BasicBlock
*LoopEntry
,
1224 bool JmpOnZero
= false) {
1225 if (!BI
|| !BI
->isConditional())
1228 ICmpInst
*Cond
= dyn_cast
<ICmpInst
>(BI
->getCondition());
1232 ConstantInt
*CmpZero
= dyn_cast
<ConstantInt
>(Cond
->getOperand(1));
1233 if (!CmpZero
|| !CmpZero
->isZero())
1236 BasicBlock
*TrueSucc
= BI
->getSuccessor(0);
1237 BasicBlock
*FalseSucc
= BI
->getSuccessor(1);
1239 std::swap(TrueSucc
, FalseSucc
);
1241 ICmpInst::Predicate Pred
= Cond
->getPredicate();
1242 if ((Pred
== ICmpInst::ICMP_NE
&& TrueSucc
== LoopEntry
) ||
1243 (Pred
== ICmpInst::ICMP_EQ
&& FalseSucc
== LoopEntry
))
1244 return Cond
->getOperand(0);
1249 // Check if the recurrence variable `VarX` is in the right form to create
1250 // the idiom. Returns the value coerced to a PHINode if so.
1251 static PHINode
*getRecurrenceVar(Value
*VarX
, Instruction
*DefX
,
1252 BasicBlock
*LoopEntry
) {
1253 auto *PhiX
= dyn_cast
<PHINode
>(VarX
);
1254 if (PhiX
&& PhiX
->getParent() == LoopEntry
&&
1255 (PhiX
->getOperand(0) == DefX
|| PhiX
->getOperand(1) == DefX
))
1260 /// Return true iff the idiom is detected in the loop.
1263 /// 1) \p CntInst is set to the instruction counting the population bit.
1264 /// 2) \p CntPhi is set to the corresponding phi node.
1265 /// 3) \p Var is set to the value whose population bits are being counted.
1267 /// The core idiom we are trying to detect is:
1270 /// goto loop-exit // the precondition of the loop
1271 /// cnt0 = init-val;
1273 /// x1 = phi (x0, x2);
1274 /// cnt1 = phi(cnt0, cnt2);
1276 /// cnt2 = cnt1 + 1;
1278 /// x2 = x1 & (x1 - 1);
1280 /// } while(x != 0);
1284 static bool detectPopcountIdiom(Loop
*CurLoop
, BasicBlock
*PreCondBB
,
1285 Instruction
*&CntInst
, PHINode
*&CntPhi
,
1287 // step 1: Check to see if the look-back branch match this pattern:
1288 // "if (a!=0) goto loop-entry".
1289 BasicBlock
*LoopEntry
;
1290 Instruction
*DefX2
, *CountInst
;
1291 Value
*VarX1
, *VarX0
;
1292 PHINode
*PhiX
, *CountPhi
;
1294 DefX2
= CountInst
= nullptr;
1295 VarX1
= VarX0
= nullptr;
1296 PhiX
= CountPhi
= nullptr;
1297 LoopEntry
= *(CurLoop
->block_begin());
1299 // step 1: Check if the loop-back branch is in desirable form.
1301 if (Value
*T
= matchCondition(
1302 dyn_cast
<BranchInst
>(LoopEntry
->getTerminator()), LoopEntry
))
1303 DefX2
= dyn_cast
<Instruction
>(T
);
1308 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1310 if (!DefX2
|| DefX2
->getOpcode() != Instruction::And
)
1313 BinaryOperator
*SubOneOp
;
1315 if ((SubOneOp
= dyn_cast
<BinaryOperator
>(DefX2
->getOperand(0))))
1316 VarX1
= DefX2
->getOperand(1);
1318 VarX1
= DefX2
->getOperand(0);
1319 SubOneOp
= dyn_cast
<BinaryOperator
>(DefX2
->getOperand(1));
1321 if (!SubOneOp
|| SubOneOp
->getOperand(0) != VarX1
)
1324 ConstantInt
*Dec
= dyn_cast
<ConstantInt
>(SubOneOp
->getOperand(1));
1326 !((SubOneOp
->getOpcode() == Instruction::Sub
&& Dec
->isOne()) ||
1327 (SubOneOp
->getOpcode() == Instruction::Add
&&
1328 Dec
->isMinusOne()))) {
1333 // step 3: Check the recurrence of variable X
1334 PhiX
= getRecurrenceVar(VarX1
, DefX2
, LoopEntry
);
1338 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1340 CountInst
= nullptr;
1341 for (BasicBlock::iterator Iter
= LoopEntry
->getFirstNonPHI()->getIterator(),
1342 IterE
= LoopEntry
->end();
1343 Iter
!= IterE
; Iter
++) {
1344 Instruction
*Inst
= &*Iter
;
1345 if (Inst
->getOpcode() != Instruction::Add
)
1348 ConstantInt
*Inc
= dyn_cast
<ConstantInt
>(Inst
->getOperand(1));
1349 if (!Inc
|| !Inc
->isOne())
1352 PHINode
*Phi
= getRecurrenceVar(Inst
->getOperand(0), Inst
, LoopEntry
);
1356 // Check if the result of the instruction is live of the loop.
1357 bool LiveOutLoop
= false;
1358 for (User
*U
: Inst
->users()) {
1359 if ((cast
<Instruction
>(U
))->getParent() != LoopEntry
) {
1376 // step 5: check if the precondition is in this form:
1377 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1379 auto *PreCondBr
= dyn_cast
<BranchInst
>(PreCondBB
->getTerminator());
1380 Value
*T
= matchCondition(PreCondBr
, CurLoop
->getLoopPreheader());
1381 if (T
!= PhiX
->getOperand(0) && T
!= PhiX
->getOperand(1))
1384 CntInst
= CountInst
;
1392 /// Return true if the idiom is detected in the loop.
1395 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1396 /// or nullptr if there is no such.
1397 /// 2) \p CntPhi is set to the corresponding phi node
1398 /// or nullptr if there is no such.
1399 /// 3) \p Var is set to the value whose CTLZ could be used.
1400 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1402 /// The core idiom we are trying to detect is:
1405 /// goto loop-exit // the precondition of the loop
1406 /// cnt0 = init-val;
1408 /// x = phi (x0, x.next); //PhiX
1409 /// cnt = phi(cnt0, cnt.next);
1411 /// cnt.next = cnt + 1;
1413 /// x.next = x >> 1; // DefX
1415 /// } while(x.next != 0);
1419 static bool detectShiftUntilZeroIdiom(Loop
*CurLoop
, const DataLayout
&DL
,
1420 Intrinsic::ID
&IntrinID
, Value
*&InitX
,
1421 Instruction
*&CntInst
, PHINode
*&CntPhi
,
1422 Instruction
*&DefX
) {
1423 BasicBlock
*LoopEntry
;
1424 Value
*VarX
= nullptr;
1429 LoopEntry
= *(CurLoop
->block_begin());
1431 // step 1: Check if the loop-back branch is in desirable form.
1432 if (Value
*T
= matchCondition(
1433 dyn_cast
<BranchInst
>(LoopEntry
->getTerminator()), LoopEntry
))
1434 DefX
= dyn_cast
<Instruction
>(T
);
1438 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1439 if (!DefX
|| !DefX
->isShift())
1441 IntrinID
= DefX
->getOpcode() == Instruction::Shl
? Intrinsic::cttz
:
1443 ConstantInt
*Shft
= dyn_cast
<ConstantInt
>(DefX
->getOperand(1));
1444 if (!Shft
|| !Shft
->isOne())
1446 VarX
= DefX
->getOperand(0);
1448 // step 3: Check the recurrence of variable X
1449 PHINode
*PhiX
= getRecurrenceVar(VarX
, DefX
, LoopEntry
);
1453 InitX
= PhiX
->getIncomingValueForBlock(CurLoop
->getLoopPreheader());
1455 // Make sure the initial value can't be negative otherwise the ashr in the
1456 // loop might never reach zero which would make the loop infinite.
1457 if (DefX
->getOpcode() == Instruction::AShr
&& !isKnownNonNegative(InitX
, DL
))
1460 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1461 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1462 // then all uses of "cnt.next" could be optimized to the trip count
1463 // plus "cnt0". Currently it is not optimized.
1464 // This step could be used to detect POPCNT instruction:
1465 // cnt.next = cnt + (x.next & 1)
1466 for (BasicBlock::iterator Iter
= LoopEntry
->getFirstNonPHI()->getIterator(),
1467 IterE
= LoopEntry
->end();
1468 Iter
!= IterE
; Iter
++) {
1469 Instruction
*Inst
= &*Iter
;
1470 if (Inst
->getOpcode() != Instruction::Add
)
1473 ConstantInt
*Inc
= dyn_cast
<ConstantInt
>(Inst
->getOperand(1));
1474 if (!Inc
|| !Inc
->isOne())
1477 PHINode
*Phi
= getRecurrenceVar(Inst
->getOperand(0), Inst
, LoopEntry
);
1491 /// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1492 /// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1493 /// trip count returns true; otherwise, returns false.
1494 bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1495 // Give up if the loop has multiple blocks or multiple backedges.
1496 if (CurLoop
->getNumBackEdges() != 1 || CurLoop
->getNumBlocks() != 1)
1499 Intrinsic::ID IntrinID
;
1501 Instruction
*DefX
= nullptr;
1502 PHINode
*CntPhi
= nullptr;
1503 Instruction
*CntInst
= nullptr;
1504 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1505 // this is always 6.
1506 size_t IdiomCanonicalSize
= 6;
1508 if (!detectShiftUntilZeroIdiom(CurLoop
, *DL
, IntrinID
, InitX
,
1509 CntInst
, CntPhi
, DefX
))
1512 bool IsCntPhiUsedOutsideLoop
= false;
1513 for (User
*U
: CntPhi
->users())
1514 if (!CurLoop
->contains(cast
<Instruction
>(U
))) {
1515 IsCntPhiUsedOutsideLoop
= true;
1518 bool IsCntInstUsedOutsideLoop
= false;
1519 for (User
*U
: CntInst
->users())
1520 if (!CurLoop
->contains(cast
<Instruction
>(U
))) {
1521 IsCntInstUsedOutsideLoop
= true;
1524 // If both CntInst and CntPhi are used outside the loop the profitability
1526 if (IsCntInstUsedOutsideLoop
&& IsCntPhiUsedOutsideLoop
)
1529 // For some CPUs result of CTLZ(X) intrinsic is undefined
1530 // when X is 0. If we can not guarantee X != 0, we need to check this
1532 bool ZeroCheck
= false;
1533 // It is safe to assume Preheader exist as it was checked in
1534 // parent function RunOnLoop.
1535 BasicBlock
*PH
= CurLoop
->getLoopPreheader();
1537 // If we are using the count instruction outside the loop, make sure we
1538 // have a zero check as a precondition. Without the check the loop would run
1539 // one iteration for before any check of the input value. This means 0 and 1
1540 // would have identical behavior in the original loop and thus
1541 if (!IsCntPhiUsedOutsideLoop
) {
1542 auto *PreCondBB
= PH
->getSinglePredecessor();
1545 auto *PreCondBI
= dyn_cast
<BranchInst
>(PreCondBB
->getTerminator());
1548 if (matchCondition(PreCondBI
, PH
) != InitX
)
1553 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1554 // profitable if we delete the loop.
1556 // the loop has only 6 instructions:
1557 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1558 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1559 // %shr = ashr %n.addr.0, 1
1560 // %tobool = icmp eq %shr, 0
1561 // %inc = add nsw %i.0, 1
1564 const Value
*Args
[] =
1565 {InitX
, ZeroCheck
? ConstantInt::getTrue(InitX
->getContext())
1566 : ConstantInt::getFalse(InitX
->getContext())};
1568 // @llvm.dbg doesn't count as they have no semantic effect.
1569 auto InstWithoutDebugIt
= CurLoop
->getHeader()->instructionsWithoutDebug();
1570 uint32_t HeaderSize
=
1571 std::distance(InstWithoutDebugIt
.begin(), InstWithoutDebugIt
.end());
1573 if (HeaderSize
!= IdiomCanonicalSize
&&
1574 TTI
->getIntrinsicCost(IntrinID
, InitX
->getType(), Args
) >
1575 TargetTransformInfo::TCC_Basic
)
1578 transformLoopToCountable(IntrinID
, PH
, CntInst
, CntPhi
, InitX
, DefX
,
1579 DefX
->getDebugLoc(), ZeroCheck
,
1580 IsCntPhiUsedOutsideLoop
);
1584 /// Recognizes a population count idiom in a non-countable loop.
1586 /// If detected, transforms the relevant code to issue the popcount intrinsic
1587 /// function call, and returns true; otherwise, returns false.
1588 bool LoopIdiomRecognize::recognizePopcount() {
1589 if (TTI
->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware
)
1592 // Counting population are usually conducted by few arithmetic instructions.
1593 // Such instructions can be easily "absorbed" by vacant slots in a
1594 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1595 // in a compact loop.
1597 // Give up if the loop has multiple blocks or multiple backedges.
1598 if (CurLoop
->getNumBackEdges() != 1 || CurLoop
->getNumBlocks() != 1)
1601 BasicBlock
*LoopBody
= *(CurLoop
->block_begin());
1602 if (LoopBody
->size() >= 20) {
1603 // The loop is too big, bail out.
1607 // It should have a preheader containing nothing but an unconditional branch.
1608 BasicBlock
*PH
= CurLoop
->getLoopPreheader();
1609 if (!PH
|| &PH
->front() != PH
->getTerminator())
1611 auto *EntryBI
= dyn_cast
<BranchInst
>(PH
->getTerminator());
1612 if (!EntryBI
|| EntryBI
->isConditional())
1615 // It should have a precondition block where the generated popcount intrinsic
1616 // function can be inserted.
1617 auto *PreCondBB
= PH
->getSinglePredecessor();
1620 auto *PreCondBI
= dyn_cast
<BranchInst
>(PreCondBB
->getTerminator());
1621 if (!PreCondBI
|| PreCondBI
->isUnconditional())
1624 Instruction
*CntInst
;
1627 if (!detectPopcountIdiom(CurLoop
, PreCondBB
, CntInst
, CntPhi
, Val
))
1630 transformLoopToPopcount(PreCondBB
, CntInst
, CntPhi
, Val
);
1634 static CallInst
*createPopcntIntrinsic(IRBuilder
<> &IRBuilder
, Value
*Val
,
1635 const DebugLoc
&DL
) {
1636 Value
*Ops
[] = {Val
};
1637 Type
*Tys
[] = {Val
->getType()};
1639 Module
*M
= IRBuilder
.GetInsertBlock()->getParent()->getParent();
1640 Function
*Func
= Intrinsic::getDeclaration(M
, Intrinsic::ctpop
, Tys
);
1641 CallInst
*CI
= IRBuilder
.CreateCall(Func
, Ops
);
1642 CI
->setDebugLoc(DL
);
1647 static CallInst
*createFFSIntrinsic(IRBuilder
<> &IRBuilder
, Value
*Val
,
1648 const DebugLoc
&DL
, bool ZeroCheck
,
1649 Intrinsic::ID IID
) {
1650 Value
*Ops
[] = {Val
, ZeroCheck
? IRBuilder
.getTrue() : IRBuilder
.getFalse()};
1651 Type
*Tys
[] = {Val
->getType()};
1653 Module
*M
= IRBuilder
.GetInsertBlock()->getParent()->getParent();
1654 Function
*Func
= Intrinsic::getDeclaration(M
, IID
, Tys
);
1655 CallInst
*CI
= IRBuilder
.CreateCall(Func
, Ops
);
1656 CI
->setDebugLoc(DL
);
1661 /// Transform the following loop (Using CTLZ, CTTZ is similar):
1663 /// CntPhi = PHI [Cnt0, CntInst]
1664 /// PhiX = PHI [InitX, DefX]
1665 /// CntInst = CntPhi + 1
1666 /// DefX = PhiX >> 1
1668 /// Br: loop if (DefX != 0)
1669 /// Use(CntPhi) or Use(CntInst)
1672 /// If CntPhi used outside the loop:
1673 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1674 /// Count = CountPrev + 1
1676 /// Count = BitWidth(InitX) - CTLZ(InitX)
1678 /// CntPhi = PHI [Cnt0, CntInst]
1679 /// PhiX = PHI [InitX, DefX]
1680 /// PhiCount = PHI [Count, Dec]
1681 /// CntInst = CntPhi + 1
1682 /// DefX = PhiX >> 1
1683 /// Dec = PhiCount - 1
1685 /// Br: loop if (Dec != 0)
1686 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1688 /// Use(Count + Cnt0) // Use(CntInst)
1690 /// If LOOP_BODY is empty the loop will be deleted.
1691 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1692 void LoopIdiomRecognize::transformLoopToCountable(
1693 Intrinsic::ID IntrinID
, BasicBlock
*Preheader
, Instruction
*CntInst
,
1694 PHINode
*CntPhi
, Value
*InitX
, Instruction
*DefX
, const DebugLoc
&DL
,
1695 bool ZeroCheck
, bool IsCntPhiUsedOutsideLoop
) {
1696 BranchInst
*PreheaderBr
= cast
<BranchInst
>(Preheader
->getTerminator());
1698 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1699 IRBuilder
<> Builder(PreheaderBr
);
1700 Builder
.SetCurrentDebugLocation(DL
);
1701 Value
*FFS
, *Count
, *CountPrev
, *NewCount
, *InitXNext
;
1703 // Count = BitWidth - CTLZ(InitX);
1704 // If there are uses of CntPhi create:
1705 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1706 if (IsCntPhiUsedOutsideLoop
) {
1707 if (DefX
->getOpcode() == Instruction::AShr
)
1709 Builder
.CreateAShr(InitX
, ConstantInt::get(InitX
->getType(), 1));
1710 else if (DefX
->getOpcode() == Instruction::LShr
)
1712 Builder
.CreateLShr(InitX
, ConstantInt::get(InitX
->getType(), 1));
1713 else if (DefX
->getOpcode() == Instruction::Shl
) // cttz
1715 Builder
.CreateShl(InitX
, ConstantInt::get(InitX
->getType(), 1));
1717 llvm_unreachable("Unexpected opcode!");
1720 FFS
= createFFSIntrinsic(Builder
, InitXNext
, DL
, ZeroCheck
, IntrinID
);
1721 Count
= Builder
.CreateSub(
1722 ConstantInt::get(FFS
->getType(),
1723 FFS
->getType()->getIntegerBitWidth()),
1725 if (IsCntPhiUsedOutsideLoop
) {
1727 Count
= Builder
.CreateAdd(
1729 ConstantInt::get(CountPrev
->getType(), 1));
1732 NewCount
= Builder
.CreateZExtOrTrunc(
1733 IsCntPhiUsedOutsideLoop
? CountPrev
: Count
,
1734 cast
<IntegerType
>(CntInst
->getType()));
1736 // If the counter's initial value is not zero, insert Add Inst.
1737 Value
*CntInitVal
= CntPhi
->getIncomingValueForBlock(Preheader
);
1738 ConstantInt
*InitConst
= dyn_cast
<ConstantInt
>(CntInitVal
);
1739 if (!InitConst
|| !InitConst
->isZero())
1740 NewCount
= Builder
.CreateAdd(NewCount
, CntInitVal
);
1742 // Step 2: Insert new IV and loop condition:
1745 // PhiCount = PHI [Count, Dec]
1747 // Dec = PhiCount - 1
1749 // Br: loop if (Dec != 0)
1750 BasicBlock
*Body
= *(CurLoop
->block_begin());
1751 auto *LbBr
= cast
<BranchInst
>(Body
->getTerminator());
1752 ICmpInst
*LbCond
= cast
<ICmpInst
>(LbBr
->getCondition());
1753 Type
*Ty
= Count
->getType();
1755 PHINode
*TcPhi
= PHINode::Create(Ty
, 2, "tcphi", &Body
->front());
1757 Builder
.SetInsertPoint(LbCond
);
1758 Instruction
*TcDec
= cast
<Instruction
>(
1759 Builder
.CreateSub(TcPhi
, ConstantInt::get(Ty
, 1),
1760 "tcdec", false, true));
1762 TcPhi
->addIncoming(Count
, Preheader
);
1763 TcPhi
->addIncoming(TcDec
, Body
);
1765 CmpInst::Predicate Pred
=
1766 (LbBr
->getSuccessor(0) == Body
) ? CmpInst::ICMP_NE
: CmpInst::ICMP_EQ
;
1767 LbCond
->setPredicate(Pred
);
1768 LbCond
->setOperand(0, TcDec
);
1769 LbCond
->setOperand(1, ConstantInt::get(Ty
, 0));
1771 // Step 3: All the references to the original counter outside
1772 // the loop are replaced with the NewCount
1773 if (IsCntPhiUsedOutsideLoop
)
1774 CntPhi
->replaceUsesOutsideBlock(NewCount
, Body
);
1776 CntInst
->replaceUsesOutsideBlock(NewCount
, Body
);
1778 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1779 // loop. The loop would otherwise not be deleted even if it becomes empty.
1780 SE
->forgetLoop(CurLoop
);
1783 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock
*PreCondBB
,
1784 Instruction
*CntInst
,
1785 PHINode
*CntPhi
, Value
*Var
) {
1786 BasicBlock
*PreHead
= CurLoop
->getLoopPreheader();
1787 auto *PreCondBr
= cast
<BranchInst
>(PreCondBB
->getTerminator());
1788 const DebugLoc
&DL
= CntInst
->getDebugLoc();
1790 // Assuming before transformation, the loop is following:
1791 // if (x) // the precondition
1792 // do { cnt++; x &= x - 1; } while(x);
1794 // Step 1: Insert the ctpop instruction at the end of the precondition block
1795 IRBuilder
<> Builder(PreCondBr
);
1796 Value
*PopCnt
, *PopCntZext
, *NewCount
, *TripCnt
;
1798 PopCnt
= createPopcntIntrinsic(Builder
, Var
, DL
);
1799 NewCount
= PopCntZext
=
1800 Builder
.CreateZExtOrTrunc(PopCnt
, cast
<IntegerType
>(CntPhi
->getType()));
1802 if (NewCount
!= PopCnt
)
1803 (cast
<Instruction
>(NewCount
))->setDebugLoc(DL
);
1805 // TripCnt is exactly the number of iterations the loop has
1808 // If the population counter's initial value is not zero, insert Add Inst.
1809 Value
*CntInitVal
= CntPhi
->getIncomingValueForBlock(PreHead
);
1810 ConstantInt
*InitConst
= dyn_cast
<ConstantInt
>(CntInitVal
);
1811 if (!InitConst
|| !InitConst
->isZero()) {
1812 NewCount
= Builder
.CreateAdd(NewCount
, CntInitVal
);
1813 (cast
<Instruction
>(NewCount
))->setDebugLoc(DL
);
1817 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1818 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1819 // function would be partial dead code, and downstream passes will drag
1820 // it back from the precondition block to the preheader.
1822 ICmpInst
*PreCond
= cast
<ICmpInst
>(PreCondBr
->getCondition());
1824 Value
*Opnd0
= PopCntZext
;
1825 Value
*Opnd1
= ConstantInt::get(PopCntZext
->getType(), 0);
1826 if (PreCond
->getOperand(0) != Var
)
1827 std::swap(Opnd0
, Opnd1
);
1829 ICmpInst
*NewPreCond
= cast
<ICmpInst
>(
1830 Builder
.CreateICmp(PreCond
->getPredicate(), Opnd0
, Opnd1
));
1831 PreCondBr
->setCondition(NewPreCond
);
1833 RecursivelyDeleteTriviallyDeadInstructions(PreCond
, TLI
);
1836 // Step 3: Note that the population count is exactly the trip count of the
1837 // loop in question, which enable us to convert the loop from noncountable
1838 // loop into a countable one. The benefit is twofold:
1840 // - If the loop only counts population, the entire loop becomes dead after
1841 // the transformation. It is a lot easier to prove a countable loop dead
1842 // than to prove a noncountable one. (In some C dialects, an infinite loop
1843 // isn't dead even if it computes nothing useful. In general, DCE needs
1844 // to prove a noncountable loop finite before safely delete it.)
1846 // - If the loop also performs something else, it remains alive.
1847 // Since it is transformed to countable form, it can be aggressively
1848 // optimized by some optimizations which are in general not applicable
1849 // to a noncountable loop.
1851 // After this step, this loop (conceptually) would look like following:
1852 // newcnt = __builtin_ctpop(x);
1855 // do { cnt++; x &= x-1; t--) } while (t > 0);
1856 BasicBlock
*Body
= *(CurLoop
->block_begin());
1858 auto *LbBr
= cast
<BranchInst
>(Body
->getTerminator());
1859 ICmpInst
*LbCond
= cast
<ICmpInst
>(LbBr
->getCondition());
1860 Type
*Ty
= TripCnt
->getType();
1862 PHINode
*TcPhi
= PHINode::Create(Ty
, 2, "tcphi", &Body
->front());
1864 Builder
.SetInsertPoint(LbCond
);
1865 Instruction
*TcDec
= cast
<Instruction
>(
1866 Builder
.CreateSub(TcPhi
, ConstantInt::get(Ty
, 1),
1867 "tcdec", false, true));
1869 TcPhi
->addIncoming(TripCnt
, PreHead
);
1870 TcPhi
->addIncoming(TcDec
, Body
);
1872 CmpInst::Predicate Pred
=
1873 (LbBr
->getSuccessor(0) == Body
) ? CmpInst::ICMP_UGT
: CmpInst::ICMP_SLE
;
1874 LbCond
->setPredicate(Pred
);
1875 LbCond
->setOperand(0, TcDec
);
1876 LbCond
->setOperand(1, ConstantInt::get(Ty
, 0));
1879 // Step 4: All the references to the original population counter outside
1880 // the loop are replaced with the NewCount -- the value returned from
1881 // __builtin_ctpop().
1882 CntInst
->replaceUsesOutsideBlock(NewCount
, Body
);
1884 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1885 // loop. The loop would otherwise not be deleted even if it becomes empty.
1886 SE
->forgetLoop(CurLoop
);
1889 bool LoopIdiomRecognize::matchBCmpLoopStructure(
1890 CmpLoopStructure
&CmpLoop
) const {
1891 ICmpInst::Predicate BCmpPred
;
1893 // We are looking for the following basic layout:
1894 // PreheaderBB: <preheader> ; preds = ???
1896 // br label %LoopHeaderBB
1897 // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
1899 // %BCmpValue = icmp <...>
1900 // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0
1901 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
1903 // %LatchCmpValue = <are we done, or do next iteration?>
1904 // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB
1905 // Successor0: <exit> ; preds = %LoopHeaderBB
1907 // Successor1: <exit> ; preds = %LoopLatchBB
1910 // Successor0 and Successor1 may or may not be the same basic block.
1912 // Match basic frame-work of this supposedly-comparison loop.
1913 using namespace PatternMatch
;
1914 if (!match(CurLoop
->getHeader()->getTerminator(),
1915 m_Br(m_CombineAnd(m_ICmp(BCmpPred
, m_Value(), m_Value()),
1916 m_Value(CmpLoop
.BCmpValue
)),
1917 CmpLoop
.HeaderBrEqualBB
, CmpLoop
.HeaderBrUnequalBB
)) ||
1918 !match(CurLoop
->getLoopLatch()->getTerminator(),
1919 m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop
.LatchCmpValue
)),
1920 CmpLoop
.LatchBrFinishBB
, CmpLoop
.LatchBrContinueBB
))) {
1921 LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n");
1924 LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n");
1928 bool LoopIdiomRecognize::matchBCmpOfLoads(Value
*BCmpValue
,
1929 CmpOfLoads
&CmpOfLoads
) const {
1930 using namespace PatternMatch
;
1931 LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue
1932 << " as bcmp pattern.\n");
1934 // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example:
1935 // %v0 = load <...>, <...>* %LoadSrcA
1936 // %v1 = load <...>, <...>* %LoadSrcB
1937 // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1
1938 // There won't be any no-op bitcasts between load and icmp,
1939 // they would have been transformed into a load of bitcast.
1940 // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too.
1941 if (!match(BCmpValue
,
1942 m_ICmp(CmpOfLoads
.BCmpPred
,
1943 m_CombineAnd(m_Load(m_Value(CmpOfLoads
.LoadSrcA
)),
1944 m_Value(CmpOfLoads
.LoadA
)),
1945 m_CombineAnd(m_Load(m_Value(CmpOfLoads
.LoadSrcB
)),
1946 m_Value(CmpOfLoads
.LoadB
)))) ||
1947 !ICmpInst::isEquality(CmpOfLoads
.BCmpPred
)) {
1948 LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n");
1951 LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t"
1952 << *CmpOfLoads
.LoadA
<< "\n\t" << *CmpOfLoads
.LoadB
1954 // FIXME: handle memcmp pattern?
1958 bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow(
1959 const CmpOfLoads
&CmpOfLoads
, CmpLoopStructure
&CmpLoop
) const {
1960 BasicBlock
*LoopHeaderBB
= CurLoop
->getHeader();
1961 BasicBlock
*LoopLatchBB
= CurLoop
->getLoopLatch();
1963 // Be wary, comparisons can be inverted, canonicalize order.
1964 // If this 'element' comparison passed, we expect to proceed to the next elt.
1965 if (CmpOfLoads
.BCmpPred
!= ICmpInst::Predicate::ICMP_EQ
)
1966 std::swap(CmpLoop
.HeaderBrEqualBB
, CmpLoop
.HeaderBrUnequalBB
);
1967 // The predicate on loop latch does not matter, just canonicalize some order.
1968 if (CmpLoop
.LatchBrContinueBB
!= LoopHeaderBB
)
1969 std::swap(CmpLoop
.LatchBrFinishBB
, CmpLoop
.LatchBrContinueBB
);
1971 // Check that control-flow between blocks is as expected.
1972 if (CmpLoop
.HeaderBrEqualBB
!= LoopLatchBB
||
1973 CmpLoop
.LatchBrContinueBB
!= LoopHeaderBB
) {
1974 LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
1978 SmallVector
<BasicBlock
*, 2> ExitBlocks
;
1979 CurLoop
->getUniqueExitBlocks(ExitBlocks
);
1980 assert(ExitBlocks
.size() <= 2U && "Can't have more than two exit blocks.");
1982 assert(!is_contained(ExitBlocks
, CmpLoop
.HeaderBrEqualBB
) &&
1983 is_contained(ExitBlocks
, CmpLoop
.HeaderBrUnequalBB
) &&
1984 !is_contained(ExitBlocks
, CmpLoop
.LatchBrContinueBB
) &&
1985 is_contained(ExitBlocks
, CmpLoop
.LatchBrFinishBB
) &&
1986 "Unexpected exit edges.");
1988 LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
1990 LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
1991 assert(CurLoop
->isLCSSAForm(*DT
) && "Should only get LCSSA-form loops here.");
1992 // No loop instructions must be used outside of the loop. Since we are in
1993 // LCSSA form, we only need to check successor block's PHI nodes's incoming
1994 // values for incoming blocks that are the loop basic blocks.
1995 for (const BasicBlock
*ExitBB
: ExitBlocks
) {
1996 for (const PHINode
&PHI
: ExitBB
->phis()) {
1997 for (const BasicBlock
*LoopBB
:
1998 make_filter_range(PHI
.blocks(), [this](BasicBlock
*PredecessorBB
) {
1999 return CurLoop
->contains(PredecessorBB
);
2002 dyn_cast
<Instruction
>(PHI
.getIncomingValueForBlock(LoopBB
));
2003 if (I
&& CurLoop
->contains(I
)) {
2005 << "Loop contains instruction " << *I
2006 << " which is used outside of the loop in basic block "
2007 << ExitBB
->getName() << " in phi node " << PHI
<< "\n");
2013 // Similarly, the loop should not have any other observable side-effects
2014 // other than the final comparison result.
2015 for (BasicBlock
*LoopBB
: CurLoop
->blocks()) {
2016 for (Instruction
&I
: *LoopBB
) {
2017 if (isa
<DbgInfoIntrinsic
>(I
)) // Ignore dbginfo.
2018 continue; // FIXME: anything else? lifetime info?
2019 if ((I
.mayHaveSideEffects() || I
.isAtomic() || I
.isFenceLike()) &&
2020 &I
!= CmpOfLoads
.LoadA
&& &I
!= CmpOfLoads
.LoadB
) {
2022 dbgs() << "Loop contains instruction with potential side-effects: "
2028 LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
2032 bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes
,
2033 CmpOfLoads
&CmpOfLoads
,
2036 const SCEV
*&Iterations
) const {
2037 // Try to compute SCEV of the loads, for this loop's scope.
2038 const auto *ScevForSrcA
= dyn_cast
<SCEVAddRecExpr
>(
2039 SE
->getSCEVAtScope(CmpOfLoads
.LoadSrcA
, CurLoop
));
2040 const auto *ScevForSrcB
= dyn_cast
<SCEVAddRecExpr
>(
2041 SE
->getSCEVAtScope(CmpOfLoads
.LoadSrcB
, CurLoop
));
2042 if (!ScevForSrcA
|| !ScevForSrcB
) {
2043 LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
2047 LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
2048 << *ScevForSrcA
<< "\n\t" << *ScevForSrcB
<< "\n");
2050 // Loads must have folloving SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
2051 const SCEV
*RecStepForA
= ScevForSrcA
->getStepRecurrence(*SE
);
2052 const SCEV
*RecStepForB
= ScevForSrcB
->getStepRecurrence(*SE
);
2053 if (!ScevForSrcA
->isAffine() || !ScevForSrcB
->isAffine() ||
2054 ScevForSrcA
->getLoop() != CurLoop
|| ScevForSrcB
->getLoop() != CurLoop
||
2055 RecStepForA
!= RecStepForB
|| !isa
<SCEVConstant
>(RecStepForA
) ||
2056 cast
<SCEVConstant
>(RecStepForA
)->getAPInt() != BCmpTyBytes
) {
2057 LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
2058 "affine SCEV expressions originating in the loop we "
2059 "are analysing with identical constant positive step, "
2060 "equal to the count of bytes compared. Got:\n\t"
2061 << *RecStepForA
<< "\n\t" << *RecStepForB
<< "\n");
2063 // FIXME: can support BCmpTyBytes > Step.
2064 // But will need to account for the extra bytes compared at the end.
2067 SrcA
= ScevForSrcA
->getStart();
2068 SrcB
= ScevForSrcB
->getStart();
2069 LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
2070 << "\n\t" << *SrcB
<< "\n");
2072 // The load sources must be loop-invants that dominate the loop header.
2073 if (SrcA
== SE
->getCouldNotCompute() || SrcB
== SE
->getCouldNotCompute() ||
2074 !SE
->isAvailableAtLoopEntry(SrcA
, CurLoop
) ||
2075 !SE
->isAvailableAtLoopEntry(SrcB
, CurLoop
)) {
2076 LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable "
2077 "prior to loop header.\n");
2081 LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
2083 // bcmp / memcmp take length argument as size_t, so let's conservatively
2084 // assume that the iteration count should be not wider than that.
2085 Type
*CmpFuncSizeTy
= DL
->getIntPtrType(SE
->getContext());
2087 // For how many iterations is loop guaranteed not to exit via LoopLatch?
2088 // This is one less than the maximal number of comparisons,and is: n + -1
2089 const SCEV
*LoopExitCount
=
2090 SE
->getExitCount(CurLoop
, CurLoop
->getLoopLatch());
2091 LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
2092 << *LoopExitCount
<< "\n");
2093 // Exit count, similarly, must be loop-invant that dominates the loop header.
2094 if (LoopExitCount
== SE
->getCouldNotCompute() ||
2095 !LoopExitCount
->getType()->isIntOrPtrTy() ||
2096 LoopExitCount
->getType()->getScalarSizeInBits() >
2097 CmpFuncSizeTy
->getScalarSizeInBits() ||
2098 !SE
->isAvailableAtLoopEntry(LoopExitCount
, CurLoop
)) {
2099 LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
2103 // LoopExitCount is always one less than the actual count of iterations.
2104 // Do this before cast, else we will be stuck with 1 + zext(-1 + n)
2105 Iterations
= SE
->getAddExpr(
2106 LoopExitCount
, SE
->getOne(LoopExitCount
->getType()), SCEV::FlagNUW
);
2107 assert(Iterations
!= SE
->getCouldNotCompute() &&
2108 "Shouldn't fail to increment by one.");
2110 LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations
<< "\n");
2114 /// Return true iff the bcmp idiom is detected in the loop.
2117 /// 1) \p BCmpInst is set to the root byte-comparison instruction.
2118 /// 2) \p LatchCmpInst is set to the comparison that controls the latch.
2119 /// 3) \p LoadA is set to the first LoadInst.
2120 /// 4) \p LoadB is set to the second LoadInst.
2121 /// 5) \p SrcA is set to the first source location that is being compared.
2122 /// 6) \p SrcB is set to the second source location that is being compared.
2123 /// 7) \p NBytes is set to the number of bytes to compare.
2124 bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst
*&BCmpInst
,
2125 CmpInst
*&LatchCmpInst
,
2126 LoadInst
*&LoadA
, LoadInst
*&LoadB
,
2127 const SCEV
*&SrcA
, const SCEV
*&SrcB
,
2128 const SCEV
*&NBytes
) const {
2129 LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");
2131 // Give up if the loop is not in normal form, or has more than 2 blocks.
2132 if (!CurLoop
->isLoopSimplifyForm() || CurLoop
->getNumBlocks() > 2) {
2133 LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
2136 LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");
2138 CmpLoopStructure CmpLoop
;
2139 if (!matchBCmpLoopStructure(CmpLoop
))
2142 CmpOfLoads CmpOfLoads
;
2143 if (!matchBCmpOfLoads(CmpLoop
.BCmpValue
, CmpOfLoads
))
2146 if (!recognizeBCmpLoopControlFlow(CmpOfLoads
, CmpLoop
))
2149 BCmpInst
= cast
<ICmpInst
>(CmpLoop
.BCmpValue
); // FIXME: is there no
2150 LatchCmpInst
= cast
<CmpInst
>(CmpLoop
.LatchCmpValue
); // way to combine
2151 LoadA
= cast
<LoadInst
>(CmpOfLoads
.LoadA
); // these cast with
2152 LoadB
= cast
<LoadInst
>(CmpOfLoads
.LoadB
); // m_Value() matcher?
2154 Type
*BCmpValTy
= BCmpInst
->getOperand(0)->getType();
2155 LLVMContext
&Context
= BCmpValTy
->getContext();
2156 uint64_t BCmpTyBits
= DL
->getTypeSizeInBits(BCmpValTy
);
2157 static constexpr uint64_t ByteTyBits
= 8;
2159 LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
2160 << " of size " << BCmpTyBits
2161 << " bits (while byte = " << ByteTyBits
<< " bits).\n");
2162 // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check
2163 // that we are dealing with a multiple of a byte here.
2164 if (BCmpTyBits
% ByteTyBits
!= 0) {
2165 LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
2167 // FIXME: could still be done under a run-time check that the total bit
2168 // count is a multiple of a byte i guess? Or handle remainder separately?
2171 // Each comparison is done on this many bytes.
2172 uint64_t BCmpTyBytes
= BCmpTyBits
/ ByteTyBits
;
2173 LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
2174 << " bytes, eligible for bcmp conversion.\n");
2176 const SCEV
*Iterations
;
2177 if (!recognizeBCmpLoopSCEV(BCmpTyBytes
, CmpOfLoads
, SrcA
, SrcB
, Iterations
))
2180 // bcmp / memcmp take length argument as size_t, do promotion now.
2181 Type
*CmpFuncSizeTy
= DL
->getIntPtrType(Context
);
2182 Iterations
= SE
->getNoopOrZeroExtend(Iterations
, CmpFuncSizeTy
);
2183 assert(Iterations
!= SE
->getCouldNotCompute() && "Promotion failed.");
2184 // Note that it didn't do ptrtoint cast, we will need to do it manually.
2186 // We will be comparing *bytes*, not BCmpTy, we need to recalculate size.
2187 // It's a multiplication, and it *could* overflow. But for it to overflow
2188 // we'd want to compare more bytes than could be represented by size_t, But
2189 // allocation functions also take size_t. So how'd you produce such buffer?
2190 // FIXME: we likely need to actually check that we know this won't overflow,
2191 // via llvm::computeOverflowForUnsignedMul().
2192 NBytes
= SE
->getMulExpr(
2193 Iterations
, SE
->getConstant(CmpFuncSizeTy
, BCmpTyBytes
), SCEV::FlagNUW
);
2194 assert(NBytes
!= SE
->getCouldNotCompute() &&
2195 "Shouldn't fail to increment by one.");
2197 LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes
<< "\n");
2199 if (LoadA
->getPointerAddressSpace() != LoadB
->getPointerAddressSpace() ||
2200 LoadA
->getPointerAddressSpace() != 0 || !LoadA
->isSimple() ||
2201 !LoadB
->isSimple()) {
2202 StringLiteral
L("Unsupported loads in idiom - only support identical, "
2203 "simple loads from address space 0.\n");
2204 LLVM_DEBUG(dbgs() << L
);
2206 return OptimizationRemarkMissed(DEBUG_TYPE
, "BCmpIdiomUnsupportedLoads",
2207 BCmpInst
->getDebugLoc(),
2208 CurLoop
->getHeader())
2211 return false; // FIXME: support non-simple loads.
2214 LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
2216 return OptimizationRemarkAnalysis(DEBUG_TYPE
, "RecognizedBCmpIdiom",
2217 CurLoop
->getStartLoc(),
2218 CurLoop
->getHeader())
2219 << "Loop recognized as a bcmp idiom";
2226 LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst
*ComparedEqual
) {
2227 LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
2228 SmallVector
<DominatorTree::UpdateType
, 8> DTUpdates
;
2230 BasicBlock
*PreheaderBB
= CurLoop
->getLoopPreheader();
2231 BasicBlock
*HeaderBB
= CurLoop
->getHeader();
2232 BasicBlock
*LoopLatchBB
= CurLoop
->getLoopLatch();
2233 SmallString
<32> LoopName
= CurLoop
->getName();
2234 Function
*Func
= PreheaderBB
->getParent();
2235 LLVMContext
&Context
= Func
->getContext();
2237 // Before doing anything, drop SCEV info.
2238 SE
->forgetLoop(CurLoop
);
2240 // Here we start with: (0/6)
2241 // PreheaderBB: <preheader> ; preds = ???
2243 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2244 // %ComparedEqual = icmp eq <...> %memcmp, 0
2245 // br label %LoopHeaderBB
2246 // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB
2248 // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
2249 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2251 // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
2252 // Successor0BB: <exit> ; preds = %LoopHeaderBB
2253 // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
2255 // Successor1BB: <exit> ; preds = %LoopLatchBB
2256 // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
2259 // Successor0 and Successor1 may or may not be the same basic block.
2261 // Decouple the edge between loop preheader basic block and loop header basic
2262 // block. Thus the loop has become unreachable.
2263 assert(cast
<BranchInst
>(PreheaderBB
->getTerminator())->isUnconditional() &&
2264 PreheaderBB
->getTerminator()->getSuccessor(0) == HeaderBB
&&
2265 "Preheader bb must end with an unconditional branch to header bb.");
2266 PreheaderBB
->getTerminator()->eraseFromParent();
2267 DTUpdates
.push_back({DominatorTree::Delete
, PreheaderBB
, HeaderBB
});
2269 // Create a new preheader basic block before loop header basic block.
2270 auto *PhonyPreheaderBB
= BasicBlock::Create(
2271 Context
, LoopName
+ ".phonypreheaderbb", Func
, HeaderBB
);
2272 // And insert an unconditional branch from phony preheader basic block to
2273 // loop header basic block.
2274 IRBuilder
<>(PhonyPreheaderBB
).CreateBr(HeaderBB
);
2275 DTUpdates
.push_back({DominatorTree::Insert
, PhonyPreheaderBB
, HeaderBB
});
2277 // Create a *single* new empty block that we will substitute as a
2278 // successor basic block for the loop's exits. This one is temporary.
2279 // Much like phony preheader basic block, it is not connected.
2280 auto *PhonySuccessorBB
=
2281 BasicBlock::Create(Context
, LoopName
+ ".phonysuccessorbb", Func
,
2282 LoopLatchBB
->getNextNode());
2283 // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
2284 // will mess up cleanup of dbginfo, and verifier will complain.
2285 IRBuilder
<>(PhonySuccessorBB
).CreateUnreachable();
2287 // Create two new empty blocks that we will use to preserve the original
2288 // loop exit control-flow, and preserve the incoming values in the PHI nodes
2289 // in loop's successor exit blocks. These will live one.
2290 auto *ComparedUnequalBB
=
2291 BasicBlock::Create(Context
, ComparedEqual
->getName() + ".unequalbb", Func
,
2292 PhonySuccessorBB
->getNextNode());
2293 auto *ComparedEqualBB
=
2294 BasicBlock::Create(Context
, ComparedEqual
->getName() + ".equalbb", Func
,
2295 PhonySuccessorBB
->getNextNode());
2297 // By now we have: (1/6)
2298 // PreheaderBB: ; preds = ???
2300 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2301 // %ComparedEqual = icmp eq <...> %memcmp, 0
2302 // [no terminator instruction!]
2303 // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
2304 // br label %LoopHeaderBB
2305 // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
2307 // br i1 %<...>, label %LoopLatchBB, label %Successor0BB
2308 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2310 // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
2311 // PhonySuccessorBB: ; No preds, UNREACHABLE!
2313 // EqualBB: ; No preds, UNREACHABLE!
2314 // [no terminator instruction!]
2315 // UnequalBB: ; No preds, UNREACHABLE!
2316 // [no terminator instruction!]
2317 // Successor0BB: <exit> ; preds = %LoopHeaderBB
2318 // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
2320 // Successor1BB: <exit> ; preds = %LoopLatchBB
2321 // %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
2324 // What is the mapping/replacement basic block for exiting out of the loop
2325 // from either of old's loop basic blocks?
2326 auto GetReplacementBB
= [this, ComparedEqualBB
,
2327 ComparedUnequalBB
](const BasicBlock
*OldBB
) {
2328 assert(CurLoop
->contains(OldBB
) && "Only for loop's basic blocks.");
2329 if (OldBB
== CurLoop
->getLoopLatch()) // "all elements compared equal".
2330 return ComparedEqualBB
;
2331 if (OldBB
== CurLoop
->getHeader()) // "element compared unequal".
2332 return ComparedUnequalBB
;
2333 llvm_unreachable("Only had two basic blocks in loop.");
2336 // What are the exits out of this loop?
2337 SmallVector
<Loop::Edge
, 2> LoopExitEdges
;
2338 CurLoop
->getExitEdges(LoopExitEdges
);
2339 assert(LoopExitEdges
.size() == 2 && "Should have only to two exit edges.");
2341 // Populate new basic blocks, update the exiting control-flow, PHI nodes.
2342 for (const Loop::Edge
&Edge
: LoopExitEdges
) {
2343 auto *OldLoopBB
= const_cast<BasicBlock
*>(Edge
.first
);
2344 auto *SuccessorBB
= const_cast<BasicBlock
*>(Edge
.second
);
2345 assert(CurLoop
->contains(OldLoopBB
) && !CurLoop
->contains(SuccessorBB
) &&
2346 "Unexpected edge.");
2348 // If we would exit the loop from this loop's basic block,
2349 // what semantically would that mean? Did comparison succeed or fail?
2350 BasicBlock
*NewBB
= GetReplacementBB(OldLoopBB
);
2351 assert(NewBB
->empty() && "Should not get same new basic block here twice.");
2352 IRBuilder
<> Builder(NewBB
);
2353 Builder
.SetCurrentDebugLocation(OldLoopBB
->getTerminator()->getDebugLoc());
2354 Builder
.CreateBr(SuccessorBB
);
2355 DTUpdates
.push_back({DominatorTree::Insert
, NewBB
, SuccessorBB
});
2356 // Also, be *REALLY* careful with PHI nodes in successor basic block,
2357 // update them to recieve the same input value, but not from current loop's
2358 // basic block, but from new basic block instead.
2359 SuccessorBB
->replacePhiUsesWith(OldLoopBB
, NewBB
);
2360 // Also, change loop control-flow. This loop's basic block shall no longer
2361 // exit from the loop to it's original successor basic block, but to our new
2362 // phony successor basic block. Note that new successor will be unique exit.
2363 OldLoopBB
->getTerminator()->replaceSuccessorWith(SuccessorBB
,
2365 DTUpdates
.push_back({DominatorTree::Delete
, OldLoopBB
, SuccessorBB
});
2366 DTUpdates
.push_back({DominatorTree::Insert
, OldLoopBB
, PhonySuccessorBB
});
2369 // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
2370 assert(DTUpdates
.size() == 8 && "Update count prediction failed.");
2371 DomTreeUpdater
DTU(DT
, DomTreeUpdater::UpdateStrategy::Eager
);
2372 DTU
.applyUpdates(DTUpdates
);
2375 // By now we have: (2/6)
2376 // PreheaderBB: ; preds = ???
2378 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2379 // %ComparedEqual = icmp eq <...> %memcmp, 0
2380 // [no terminator instruction!]
2381 // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE!
2382 // br label %LoopHeaderBB
2383 // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
2385 // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
2386 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB
2388 // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
2389 // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB
2391 // EqualBB: ; No preds, UNREACHABLE!
2392 // br label %Successor1BB
2393 // UnequalBB: ; No preds, UNREACHABLE!
2394 // br label %Successor0BB
2395 // Successor0BB: ; preds = %UnequalBB
2396 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2398 // Successor1BB: ; preds = %EqualBB
2399 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2402 // *Finally*, zap the original loop. Record it's parent loop though.
2403 Loop
*ParentLoop
= CurLoop
->getParentLoop();
2404 LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
2405 LoopDeleter
.markLoopAsDeleted(CurLoop
); // Mark as deleted *BEFORE* deleting!
2406 deleteDeadLoop(CurLoop
, DT
, SE
, LI
); // And actually delete the loop.
2409 // By now we have: (3/6)
2410 // PreheaderBB: ; preds = ???
2412 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2413 // %ComparedEqual = icmp eq <...> %memcmp, 0
2414 // [no terminator instruction!]
2415 // PhonyPreheaderBB: ; No preds, UNREACHABLE!
2416 // br label %PhonySuccessorBB
2417 // PhonySuccessorBB: ; preds = %PhonyPreheaderBB
2419 // EqualBB: ; No preds, UNREACHABLE!
2420 // br label %Successor1BB
2421 // UnequalBB: ; No preds, UNREACHABLE!
2422 // br label %Successor0BB
2423 // Successor0BB: ; preds = %UnequalBB
2424 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2426 // Successor1BB: ; preds = %EqualBB
2427 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2430 // Now, actually restore the CFG.
2432 // Insert an unconditional branch from an actual preheader basic block to
2433 // phony preheader basic block.
2434 IRBuilder
<>(PreheaderBB
).CreateBr(PhonyPreheaderBB
);
2435 DTUpdates
.push_back({DominatorTree::Insert
, PhonyPreheaderBB
, HeaderBB
});
2436 // Insert proper conditional branch from phony successor basic block to the
2437 // "dispatch" basic blocks, which were used to preserve incoming values in
2438 // original loop's successor basic blocks.
2439 assert(isa
<UnreachableInst
>(PhonySuccessorBB
->getTerminator()) &&
2440 "Yep, that's the one we created to keep deleteDeadLoop() happy.");
2441 PhonySuccessorBB
->getTerminator()->eraseFromParent();
2443 IRBuilder
<> Builder(PhonySuccessorBB
);
2444 Builder
.SetCurrentDebugLocation(ComparedEqual
->getDebugLoc());
2445 Builder
.CreateCondBr(ComparedEqual
, ComparedEqualBB
, ComparedUnequalBB
);
2447 DTUpdates
.push_back(
2448 {DominatorTree::Insert
, PhonySuccessorBB
, ComparedEqualBB
});
2449 DTUpdates
.push_back(
2450 {DominatorTree::Insert
, PhonySuccessorBB
, ComparedUnequalBB
});
2452 BasicBlock
*DispatchBB
= PhonySuccessorBB
;
2453 DispatchBB
->setName(LoopName
+ ".bcmpdispatchbb");
2455 assert(DTUpdates
.size() == 3 && "Update count prediction failed.");
2456 DTU
.applyUpdates(DTUpdates
);
2459 // By now we have: (4/6)
2460 // PreheaderBB: ; preds = ???
2462 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2463 // %ComparedEqual = icmp eq <...> %memcmp, 0
2464 // br label %PhonyPreheaderBB
2465 // PhonyPreheaderBB: ; preds = %PreheaderBB
2466 // br label %DispatchBB
2467 // DispatchBB: ; preds = %PhonyPreheaderBB
2468 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2469 // EqualBB: ; preds = %DispatchBB
2470 // br label %Successor1BB
2471 // UnequalBB: ; preds = %DispatchBB
2472 // br label %Successor0BB
2473 // Successor0BB: ; preds = %UnequalBB
2474 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2476 // Successor1BB: ; preds = %EqualBB
2477 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2480 // The basic CFG has been restored! Now let's merge redundant basic blocks.
2482 // Merge phony successor basic block into it's only predecessor,
2483 // phony preheader basic block. It is fully pointlessly redundant.
2484 MergeBasicBlockIntoOnlyPred(DispatchBB
, &DTU
);
2486 // By now we have: (5/6)
2487 // PreheaderBB: ; preds = ???
2489 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2490 // %ComparedEqual = icmp eq <...> %memcmp, 0
2491 // br label %DispatchBB
2492 // DispatchBB: ; preds = %PreheaderBB
2493 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2494 // EqualBB: ; preds = %DispatchBB
2495 // br label %Successor1BB
2496 // UnequalBB: ; preds = %DispatchBB
2497 // br label %Successor0BB
2498 // Successor0BB: ; preds = %UnequalBB
2499 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2501 // Successor1BB: ; preds = %EqualBB
2502 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2505 // Was this loop nested?
2507 // If the loop was *NOT* nested, then let's also merge phony successor
2508 // basic block into it's only predecessor, preheader basic block.
2509 // Also, here we need to update LoopInfo.
2510 LI
->removeBlock(PreheaderBB
);
2511 MergeBasicBlockIntoOnlyPred(DispatchBB
, &DTU
);
2513 // By now we have: (6/6)
2514 // DispatchBB: ; preds = ???
2516 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2517 // %ComparedEqual = icmp eq <...> %memcmp, 0
2518 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2519 // EqualBB: ; preds = %DispatchBB
2520 // br label %Successor1BB
2521 // UnequalBB: ; preds = %DispatchBB
2522 // br label %Successor0BB
2523 // Successor0BB: ; preds = %UnequalBB
2524 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2526 // Successor1BB: ; preds = %EqualBB
2527 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2533 // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop.
2534 // To achieve that, we shall keep the preheader basic block (mainly so that
2535 // the loop header block will be guaranteed to have a predecessor outside of
2536 // the loop), and create a phony loop with all these new three basic blocks.
2537 Loop
*PhonyLoop
= LI
->AllocateLoop();
2538 ParentLoop
->addChildLoop(PhonyLoop
);
2539 PhonyLoop
->addBasicBlockToLoop(DispatchBB
, *LI
);
2540 PhonyLoop
->addBasicBlockToLoop(ComparedEqualBB
, *LI
);
2541 PhonyLoop
->addBasicBlockToLoop(ComparedUnequalBB
, *LI
);
2543 // But we only have a preheader basic block, a header basic block block and
2544 // two exiting basic blocks. For a proper loop we also need a backedge from
2545 // non-header basic block to header bb.
2546 // Let's just add a never-taken branch from both of the exiting basic blocks.
2547 for (BasicBlock
*BB
: {ComparedEqualBB
, ComparedUnequalBB
}) {
2548 BranchInst
*OldTerminator
= cast
<BranchInst
>(BB
->getTerminator());
2549 assert(OldTerminator
->isUnconditional() && "That's the one we created.");
2550 BasicBlock
*SuccessorBB
= OldTerminator
->getSuccessor(0);
2552 IRBuilder
<> Builder(OldTerminator
);
2553 Builder
.SetCurrentDebugLocation(OldTerminator
->getDebugLoc());
2554 Builder
.CreateCondBr(ConstantInt::getTrue(Context
), SuccessorBB
,
2556 OldTerminator
->eraseFromParent();
2557 // Yes, the backedge will never be taken. The control-flow is redundant.
2558 // If it can be simplified further, other passes will take care.
2559 DTUpdates
.push_back({DominatorTree::Delete
, BB
, SuccessorBB
});
2560 DTUpdates
.push_back({DominatorTree::Insert
, BB
, SuccessorBB
});
2561 DTUpdates
.push_back({DominatorTree::Insert
, BB
, DispatchBB
});
2563 assert(DTUpdates
.size() == 6 && "Update count prediction failed.");
2564 DTU
.applyUpdates(DTUpdates
);
2567 // By now we have: (6/6)
2568 // PreheaderBB: <preheader> ; preds = ???
2570 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
2571 // %ComparedEqual = icmp eq <...> %memcmp, 0
2572 // br label %BCmpDispatchBB
2573 // BCmpDispatchBB: <header> ; preds = %PreheaderBB
2574 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
2575 // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB
2576 // br i1 %true, label %Successor1BB, label %BCmpDispatchBB
2577 // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB
2578 // br i1 %true, label %Successor0BB, label %BCmpDispatchBB
2579 // Successor0BB: ; preds = %UnequalBB
2580 // %S0PHI = phi <...> [ <...>, %UnequalBB ]
2582 // Successor1BB: ; preds = %EqualBB
2583 // %S0PHI = phi <...> [ <...>, %EqualBB ]
2586 // Finally fully DONE!
2590 void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst
*BCmpInst
,
2591 CmpInst
*LatchCmpInst
,
2592 LoadInst
*LoadA
, LoadInst
*LoadB
,
2593 const SCEV
*SrcA
, const SCEV
*SrcB
,
2594 const SCEV
*NBytes
) {
2595 // We will be inserting before the terminator instruction of preheader block.
2596 IRBuilder
<> Builder(CurLoop
->getLoopPreheader()->getTerminator());
2598 LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
2599 LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");
2601 // Expand the SCEV expressions for both sources to compare, and produce value
2602 // for the byte len (beware of Iterations potentially being a pointer, and
2603 // account for element size being BCmpTyBytes bytes, which may be not 1 byte)
2604 Value
*PtrA
, *PtrB
, *Len
;
2606 SCEVExpander
SExp(*SE
, *DL
, "LoopToBCmp");
2607 SExp
.setInsertPoint(&*Builder
.GetInsertPoint());
2609 auto HandlePtr
= [&SExp
](LoadInst
*Load
, const SCEV
*Src
) {
2610 SExp
.SetCurrentDebugLocation(DebugLoc());
2611 // If the pointer operand of original load had dbgloc - use it.
2612 if (const auto *I
= dyn_cast
<Instruction
>(Load
->getPointerOperand()))
2613 SExp
.SetCurrentDebugLocation(I
->getDebugLoc());
2614 return SExp
.expandCodeFor(Src
);
2616 PtrA
= HandlePtr(LoadA
, SrcA
);
2617 PtrB
= HandlePtr(LoadB
, SrcB
);
2619 // For len calculation let's use dbgloc for the loop's latch condition.
2620 Builder
.SetCurrentDebugLocation(LatchCmpInst
->getDebugLoc());
2621 SExp
.SetCurrentDebugLocation(LatchCmpInst
->getDebugLoc());
2622 Len
= SExp
.expandCodeFor(NBytes
);
2624 Type
*CmpFuncSizeTy
= DL
->getIntPtrType(Builder
.getContext());
2625 assert(SE
->getTypeSizeInBits(Len
->getType()) ==
2626 DL
->getTypeSizeInBits(CmpFuncSizeTy
) &&
2627 "Len should already have the correct size.");
2629 // Make sure that iteration count is a number, insert ptrtoint cast if not.
2630 if (Len
->getType()->isPointerTy())
2631 Len
= Builder
.CreatePtrToInt(Len
, CmpFuncSizeTy
);
2632 assert(Len
->getType() == CmpFuncSizeTy
&& "Should have correct type now.");
2634 Len
->setName(Len
->getName() + ".bytecount");
2636 // There is no legality check needed. We want to compare that the memory
2637 // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal.
2638 // For them to be fully equal, they must match bit-by-bit. And likewise,
2639 // for them to *NOT* be fully equal, they have to differ just by one bit.
2640 // The step of comparison (bits compared at once) simply does not matter.
2643 // For the rest of new instructions, dbgloc should point at the value cmp.
2644 Builder
.SetCurrentDebugLocation(BCmpInst
->getDebugLoc());
2646 // Emit the comparison itself.
2648 cast
<CallInst
>(HasBCmp
? emitBCmp(PtrA
, PtrB
, Len
, Builder
, *DL
, TLI
)
2649 : emitMemCmp(PtrA
, PtrB
, Len
, Builder
, *DL
, TLI
));
2650 // FIXME: add {B,Mem}CmpInst with MemoryCompareInst
2651 // (based on MemIntrinsicBase) as base?
2652 // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...)
2654 // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal.
2655 auto *ComparedEqual
= cast
<ICmpInst
>(Builder
.CreateICmpEQ(
2656 CmpCall
, ConstantInt::get(CmpCall
->getType(), 0),
2657 PtrA
->getName() + ".vs." + PtrB
->getName() + ".eqcmp"));
2659 BasicBlock
*BB
= transformBCmpControlFlow(ComparedEqual
);
2660 Builder
.ClearInsertionPoint();
2663 LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n");
2665 return OptimizationRemark(DEBUG_TYPE
, "TransformedBCmpIdiomToCall",
2666 CmpCall
->getDebugLoc(), BB
)
2667 << "Transformed bcmp idiom into a call to "
2668 << ore::NV("NewFunction", CmpCall
->getCalledFunction())
2674 /// Recognizes a bcmp idiom in a non-countable loop.
2676 /// If detected, transforms the relevant code to issue the bcmp (or memcmp)
2677 /// intrinsic function call, and returns true; otherwise, returns false.
2678 bool LoopIdiomRecognize::recognizeBCmp() {
2679 if (!HasMemCmp
&& !HasBCmp
)
2683 CmpInst
*LatchCmpInst
;
2684 LoadInst
*LoadA
, *LoadB
;
2685 const SCEV
*SrcA
, *SrcB
, *NBytes
;
2686 if (!detectBCmpIdiom(BCmpInst
, LatchCmpInst
, LoadA
, LoadB
, SrcA
, SrcB
,
2688 LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n");
2692 transformLoopToBCmp(BCmpInst
, LatchCmpInst
, LoadA
, LoadB
, SrcA
, SrcB
, NBytes
);