1 //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #define DEBUG_TYPE "hexagon-lir"
11 #include "llvm/ADT/APInt.h"
12 #include "llvm/ADT/DenseMap.h"
13 #include "llvm/ADT/SetVector.h"
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/SmallSet.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/Triple.h"
19 #include "llvm/Analysis/AliasAnalysis.h"
20 #include "llvm/Analysis/InstructionSimplify.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/LoopPass.h"
23 #include "llvm/Analysis/MemoryLocation.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/ScalarEvolutionExpander.h"
26 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Transforms/Utils/Local.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/Constant.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/DataLayout.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/IRBuilder.h"
40 #include "llvm/IR/InstrTypes.h"
41 #include "llvm/IR/Instruction.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/IntrinsicInst.h"
44 #include "llvm/IR/Intrinsics.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/PatternMatch.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/IR/User.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Pass.h"
51 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Compiler.h"
54 #include "llvm/Support/Debug.h"
55 #include "llvm/Support/ErrorHandling.h"
56 #include "llvm/Support/KnownBits.h"
57 #include "llvm/Support/raw_ostream.h"
58 #include "llvm/Transforms/Scalar.h"
59 #include "llvm/Transforms/Utils.h"
75 static cl::opt
<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
76 cl::Hidden
, cl::init(false),
77 cl::desc("Disable generation of memcpy in loop idiom recognition"));
79 static cl::opt
<bool> DisableMemmoveIdiom("disable-memmove-idiom",
80 cl::Hidden
, cl::init(false),
81 cl::desc("Disable generation of memmove in loop idiom recognition"));
83 static cl::opt
<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
84 cl::Hidden
, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
85 "check guarding the memmove."));
87 static cl::opt
<unsigned> CompileTimeMemSizeThreshold(
88 "compile-time-mem-idiom-threshold", cl::Hidden
, cl::init(64),
89 cl::desc("Threshold (in bytes) to perform the transformation, if the "
90 "runtime loop count (mem transfer size) is known at compile-time."));
92 static cl::opt
<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
93 cl::Hidden
, cl::init(true),
94 cl::desc("Only enable generating memmove in non-nested loops"));
96 static cl::opt
<bool> HexagonVolatileMemcpy(
97 "disable-hexagon-volatile-memcpy", cl::Hidden
, cl::init(false),
98 cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
100 static cl::opt
<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
101 cl::Hidden
, cl::desc("Maximum number of simplification steps in HLIR"));
103 static const char *HexagonVolatileMemcpyName
104 = "hexagon_memcpy_forward_vp4cp4n2";
109 void initializeHexagonLoopIdiomRecognizePass(PassRegistry
&);
110 Pass
*createHexagonLoopIdiomPass();
112 } // end namespace llvm
116 class HexagonLoopIdiomRecognize
: public LoopPass
{
120 explicit HexagonLoopIdiomRecognize() : LoopPass(ID
) {
121 initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
124 StringRef
getPassName() const override
{
125 return "Recognize Hexagon-specific loop idioms";
128 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
129 AU
.addRequired
<LoopInfoWrapperPass
>();
130 AU
.addRequiredID(LoopSimplifyID
);
131 AU
.addRequiredID(LCSSAID
);
132 AU
.addRequired
<AAResultsWrapperPass
>();
133 AU
.addPreserved
<AAResultsWrapperPass
>();
134 AU
.addRequired
<ScalarEvolutionWrapperPass
>();
135 AU
.addRequired
<DominatorTreeWrapperPass
>();
136 AU
.addRequired
<TargetLibraryInfoWrapperPass
>();
137 AU
.addPreserved
<TargetLibraryInfoWrapperPass
>();
140 bool runOnLoop(Loop
*L
, LPPassManager
&LPM
) override
;
143 int getSCEVStride(const SCEVAddRecExpr
*StoreEv
);
144 bool isLegalStore(Loop
*CurLoop
, StoreInst
*SI
);
145 void collectStores(Loop
*CurLoop
, BasicBlock
*BB
,
146 SmallVectorImpl
<StoreInst
*> &Stores
);
147 bool processCopyingStore(Loop
*CurLoop
, StoreInst
*SI
, const SCEV
*BECount
);
148 bool coverLoop(Loop
*L
, SmallVectorImpl
<Instruction
*> &Insts
) const;
149 bool runOnLoopBlock(Loop
*CurLoop
, BasicBlock
*BB
, const SCEV
*BECount
,
150 SmallVectorImpl
<BasicBlock
*> &ExitBlocks
);
151 bool runOnCountableLoop(Loop
*L
);
154 const DataLayout
*DL
;
157 const TargetLibraryInfo
*TLI
;
159 bool HasMemcpy
, HasMemmove
;
164 using FuncType
= std::function
<Value
* (Instruction
*, LLVMContext
&)>;
165 Rule(StringRef N
, FuncType F
) : Name(N
), Fn(F
) {}
166 StringRef Name
; // For debugging.
170 void addRule(StringRef N
, const Rule::FuncType
&F
) {
171 Rules
.push_back(Rule(N
, F
));
175 struct WorkListType
{
176 WorkListType() = default;
178 void push_back(Value
* V
) {
179 // Do not push back duplicates.
180 if (!S
.count(V
)) { Q
.push_back(V
); S
.insert(V
); }
183 Value
*pop_front_val() {
184 Value
*V
= Q
.front(); Q
.pop_front(); S
.erase(V
);
188 bool empty() const { return Q
.empty(); }
191 std::deque
<Value
*> Q
;
195 using ValueSetType
= std::set
<Value
*>;
197 std::vector
<Rule
> Rules
;
201 using ValueMapType
= DenseMap
<Value
*, Value
*>;
204 ValueSetType Used
; // The set of all cloned values used by Root.
205 ValueSetType Clones
; // The set of all cloned values.
208 Context(Instruction
*Exp
)
209 : Ctx(Exp
->getParent()->getParent()->getContext()) {
213 ~Context() { cleanup(); }
215 void print(raw_ostream
&OS
, const Value
*V
) const;
216 Value
*materialize(BasicBlock
*B
, BasicBlock::iterator At
);
219 friend struct Simplifier
;
221 void initialize(Instruction
*Exp
);
224 template <typename FuncT
> void traverse(Value
*V
, FuncT F
);
225 void record(Value
*V
);
227 void unuse(Value
*V
);
229 bool equal(const Instruction
*I
, const Instruction
*J
) const;
230 Value
*find(Value
*Tree
, Value
*Sub
) const;
231 Value
*subst(Value
*Tree
, Value
*OldV
, Value
*NewV
);
232 void replace(Value
*OldV
, Value
*NewV
);
233 void link(Instruction
*I
, BasicBlock
*B
, BasicBlock::iterator At
);
236 Value
*simplify(Context
&C
);
240 PE(const Simplifier::Context
&c
, Value
*v
= nullptr) : C(c
), V(v
) {}
242 const Simplifier::Context
&C
;
247 raw_ostream
&operator<<(raw_ostream
&OS
, const PE
&P
) {
248 P
.C
.print(OS
, P
.V
? P
.V
: P
.C
.Root
);
252 } // end anonymous namespace
254 char HexagonLoopIdiomRecognize::ID
= 0;
256 INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize
, "hexagon-loop-idiom",
257 "Recognize Hexagon-specific loop idioms", false, false)
258 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass
)
259 INITIALIZE_PASS_DEPENDENCY(LoopSimplify
)
260 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass
)
261 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass
)
262 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass
)
263 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass
)
264 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass
)
265 INITIALIZE_PASS_END(HexagonLoopIdiomRecognize
, "hexagon-loop-idiom",
266 "Recognize Hexagon-specific loop idioms", false, false)
268 template <typename FuncT
>
269 void Simplifier::Context::traverse(Value
*V
, FuncT F
) {
274 Instruction
*U
= dyn_cast
<Instruction
>(Q
.pop_front_val());
275 if (!U
|| U
->getParent())
279 for (Value
*Op
: U
->operands())
284 void Simplifier::Context::print(raw_ostream
&OS
, const Value
*V
) const {
285 const auto *U
= dyn_cast
<const Instruction
>(V
);
287 OS
<< V
<< '(' << *V
<< ')';
291 if (U
->getParent()) {
293 U
->printAsOperand(OS
, true);
298 unsigned N
= U
->getNumOperands();
301 OS
<< U
->getOpcodeName();
302 for (const Value
*Op
: U
->operands()) {
310 void Simplifier::Context::initialize(Instruction
*Exp
) {
311 // Perform a deep clone of the expression, set Root to the root
312 // of the clone, and build a map from the cloned values to the
315 BasicBlock
*Block
= Exp
->getParent();
320 Value
*V
= Q
.pop_front_val();
321 if (M
.find(V
) != M
.end())
323 if (Instruction
*U
= dyn_cast
<Instruction
>(V
)) {
324 if (isa
<PHINode
>(U
) || U
->getParent() != Block
)
326 for (Value
*Op
: U
->operands())
328 M
.insert({U
, U
->clone()});
332 for (std::pair
<Value
*,Value
*> P
: M
) {
333 Instruction
*U
= cast
<Instruction
>(P
.second
);
334 for (unsigned i
= 0, n
= U
->getNumOperands(); i
!= n
; ++i
) {
335 auto F
= M
.find(U
->getOperand(i
));
337 U
->setOperand(i
, F
->second
);
341 auto R
= M
.find(Exp
);
342 assert(R
!= M
.end());
349 void Simplifier::Context::record(Value
*V
) {
350 auto Record
= [this](Instruction
*U
) -> bool {
357 void Simplifier::Context::use(Value
*V
) {
358 auto Use
= [this](Instruction
*U
) -> bool {
365 void Simplifier::Context::unuse(Value
*V
) {
366 if (!isa
<Instruction
>(V
) || cast
<Instruction
>(V
)->getParent() != nullptr)
369 auto Unuse
= [this](Instruction
*U
) -> bool {
378 Value
*Simplifier::Context::subst(Value
*Tree
, Value
*OldV
, Value
*NewV
) {
387 Instruction
*U
= dyn_cast
<Instruction
>(Q
.pop_front_val());
388 // If U is not an instruction, or it's not a clone, skip it.
389 if (!U
|| U
->getParent())
391 for (unsigned i
= 0, n
= U
->getNumOperands(); i
!= n
; ++i
) {
392 Value
*Op
= U
->getOperand(i
);
394 U
->setOperand(i
, NewV
);
404 void Simplifier::Context::replace(Value
*OldV
, Value
*NewV
) {
411 // NewV may be a complex tree that has just been created by one of the
412 // transformation rules. We need to make sure that it is commoned with
413 // the existing Root to the maximum extent possible.
414 // Identify all subtrees of NewV (including NewV itself) that have
415 // equivalent counterparts in Root, and replace those subtrees with
416 // these counterparts.
420 Value
*V
= Q
.pop_front_val();
421 Instruction
*U
= dyn_cast
<Instruction
>(V
);
422 if (!U
|| U
->getParent())
424 if (Value
*DupV
= find(Root
, V
)) {
426 NewV
= subst(NewV
, V
, DupV
);
428 for (Value
*Op
: U
->operands())
433 // Now, simply replace OldV with NewV in Root.
434 Root
= subst(Root
, OldV
, NewV
);
438 void Simplifier::Context::cleanup() {
439 for (Value
*V
: Clones
) {
440 Instruction
*U
= cast
<Instruction
>(V
);
442 U
->dropAllReferences();
445 for (Value
*V
: Clones
) {
446 Instruction
*U
= cast
<Instruction
>(V
);
452 bool Simplifier::Context::equal(const Instruction
*I
,
453 const Instruction
*J
) const {
456 if (!I
->isSameOperationAs(J
))
459 return I
->isIdenticalTo(J
);
461 for (unsigned i
= 0, n
= I
->getNumOperands(); i
!= n
; ++i
) {
462 Value
*OpI
= I
->getOperand(i
), *OpJ
= J
->getOperand(i
);
465 auto *InI
= dyn_cast
<const Instruction
>(OpI
);
466 auto *InJ
= dyn_cast
<const Instruction
>(OpJ
);
468 if (!equal(InI
, InJ
))
470 } else if (InI
!= InJ
|| !InI
)
476 Value
*Simplifier::Context::find(Value
*Tree
, Value
*Sub
) const {
477 Instruction
*SubI
= dyn_cast
<Instruction
>(Sub
);
482 Value
*V
= Q
.pop_front_val();
485 Instruction
*U
= dyn_cast
<Instruction
>(V
);
486 if (!U
|| U
->getParent())
488 if (SubI
&& equal(SubI
, U
))
490 assert(!isa
<PHINode
>(U
));
491 for (Value
*Op
: U
->operands())
497 void Simplifier::Context::link(Instruction
*I
, BasicBlock
*B
,
498 BasicBlock::iterator At
) {
502 for (Value
*Op
: I
->operands()) {
503 if (Instruction
*OpI
= dyn_cast
<Instruction
>(Op
))
507 B
->getInstList().insert(At
, I
);
510 Value
*Simplifier::Context::materialize(BasicBlock
*B
,
511 BasicBlock::iterator At
) {
512 if (Instruction
*RootI
= dyn_cast
<Instruction
>(Root
))
517 Value
*Simplifier::simplify(Context
&C
) {
521 const unsigned Limit
= SimplifyLimit
;
524 if (Count
++ >= Limit
)
526 Instruction
*U
= dyn_cast
<Instruction
>(Q
.pop_front_val());
527 if (!U
|| U
->getParent() || !C
.Used
.count(U
))
529 bool Changed
= false;
530 for (Rule
&R
: Rules
) {
531 Value
*W
= R
.Fn(U
, C
.Ctx
);
541 for (Value
*Op
: U
->operands())
545 return Count
< Limit
? C
.Root
: nullptr;
548 //===----------------------------------------------------------------------===//
550 // Implementation of PolynomialMultiplyRecognize
552 //===----------------------------------------------------------------------===//
556 class PolynomialMultiplyRecognize
{
558 explicit PolynomialMultiplyRecognize(Loop
*loop
, const DataLayout
&dl
,
559 const DominatorTree
&dt
, const TargetLibraryInfo
&tli
,
561 : CurLoop(loop
), DL(dl
), DT(dt
), TLI(tli
), SE(se
) {}
566 using ValueSeq
= SetVector
<Value
*>;
568 IntegerType
*getPmpyType() const {
569 LLVMContext
&Ctx
= CurLoop
->getHeader()->getParent()->getContext();
570 return IntegerType::get(Ctx
, 32);
573 bool isPromotableTo(Value
*V
, IntegerType
*Ty
);
574 void promoteTo(Instruction
*In
, IntegerType
*DestTy
, BasicBlock
*LoopB
);
575 bool promoteTypes(BasicBlock
*LoopB
, BasicBlock
*ExitB
);
577 Value
*getCountIV(BasicBlock
*BB
);
578 bool findCycle(Value
*Out
, Value
*In
, ValueSeq
&Cycle
);
579 void classifyCycle(Instruction
*DivI
, ValueSeq
&Cycle
, ValueSeq
&Early
,
581 bool classifyInst(Instruction
*UseI
, ValueSeq
&Early
, ValueSeq
&Late
);
582 bool commutesWithShift(Instruction
*I
);
583 bool highBitsAreZero(Value
*V
, unsigned IterCount
);
584 bool keepsHighBitsZero(Value
*V
, unsigned IterCount
);
585 bool isOperandShifted(Instruction
*I
, Value
*Op
);
586 bool convertShiftsToLeft(BasicBlock
*LoopB
, BasicBlock
*ExitB
,
588 void cleanupLoopBody(BasicBlock
*LoopB
);
590 struct ParsedValues
{
591 ParsedValues() = default;
598 Instruction
*Res
= nullptr;
599 unsigned IterCount
= 0;
604 bool matchLeftShift(SelectInst
*SelI
, Value
*CIV
, ParsedValues
&PV
);
605 bool matchRightShift(SelectInst
*SelI
, ParsedValues
&PV
);
606 bool scanSelect(SelectInst
*SI
, BasicBlock
*LoopB
, BasicBlock
*PrehB
,
607 Value
*CIV
, ParsedValues
&PV
, bool PreScan
);
608 unsigned getInverseMxN(unsigned QP
);
609 Value
*generate(BasicBlock::iterator At
, ParsedValues
&PV
);
611 void setupPreSimplifier(Simplifier
&S
);
612 void setupPostSimplifier(Simplifier
&S
);
615 const DataLayout
&DL
;
616 const DominatorTree
&DT
;
617 const TargetLibraryInfo
&TLI
;
621 } // end anonymous namespace
623 Value
*PolynomialMultiplyRecognize::getCountIV(BasicBlock
*BB
) {
624 pred_iterator PI
= pred_begin(BB
), PE
= pred_end(BB
);
625 if (std::distance(PI
, PE
) != 2)
627 BasicBlock
*PB
= (*PI
== BB
) ? *std::next(PI
) : *PI
;
629 for (auto I
= BB
->begin(), E
= BB
->end(); I
!= E
&& isa
<PHINode
>(I
); ++I
) {
630 auto *PN
= cast
<PHINode
>(I
);
631 Value
*InitV
= PN
->getIncomingValueForBlock(PB
);
632 if (!isa
<ConstantInt
>(InitV
) || !cast
<ConstantInt
>(InitV
)->isZero())
634 Value
*IterV
= PN
->getIncomingValueForBlock(BB
);
635 if (!isa
<BinaryOperator
>(IterV
))
637 auto *BO
= dyn_cast
<BinaryOperator
>(IterV
);
638 if (BO
->getOpcode() != Instruction::Add
)
640 Value
*IncV
= nullptr;
641 if (BO
->getOperand(0) == PN
)
642 IncV
= BO
->getOperand(1);
643 else if (BO
->getOperand(1) == PN
)
644 IncV
= BO
->getOperand(0);
648 if (auto *T
= dyn_cast
<ConstantInt
>(IncV
))
649 if (T
->getZExtValue() == 1)
655 static void replaceAllUsesOfWithIn(Value
*I
, Value
*J
, BasicBlock
*BB
) {
656 for (auto UI
= I
->user_begin(), UE
= I
->user_end(); UI
!= UE
;) {
657 Use
&TheUse
= UI
.getUse();
659 if (auto *II
= dyn_cast
<Instruction
>(TheUse
.getUser()))
660 if (BB
== II
->getParent())
661 II
->replaceUsesOfWith(I
, J
);
665 bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst
*SelI
,
666 Value
*CIV
, ParsedValues
&PV
) {
667 // Match the following:
668 // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
669 // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
670 // The condition may also check for equality with the masked value, i.e
671 // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
672 // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
674 Value
*CondV
= SelI
->getCondition();
675 Value
*TrueV
= SelI
->getTrueValue();
676 Value
*FalseV
= SelI
->getFalseValue();
678 using namespace PatternMatch
;
680 CmpInst::Predicate P
;
681 Value
*A
= nullptr, *B
= nullptr, *C
= nullptr;
683 if (!match(CondV
, m_ICmp(P
, m_And(m_Value(A
), m_Value(B
)), m_Value(C
))) &&
684 !match(CondV
, m_ICmp(P
, m_Value(C
), m_And(m_Value(A
), m_Value(B
)))))
686 if (P
!= CmpInst::ICMP_EQ
&& P
!= CmpInst::ICMP_NE
)
688 // Matched: select (A & B) == C ? ... : ...
689 // select (A & B) != C ? ... : ...
691 Value
*X
= nullptr, *Sh1
= nullptr;
692 // Check (A & B) for (X & (1 << i)):
693 if (match(A
, m_Shl(m_One(), m_Specific(CIV
)))) {
696 } else if (match(B
, m_Shl(m_One(), m_Specific(CIV
)))) {
700 // TODO: Could also check for an induction variable containing single
701 // bit shifted left by 1 in each iteration.
707 // Check C against the possible values for comparison: 0 and (1 << i):
708 if (match(C
, m_Zero()))
709 TrueIfZero
= (P
== CmpInst::ICMP_EQ
);
711 TrueIfZero
= (P
== CmpInst::ICMP_NE
);
716 // select (X & (1 << i)) ? ... : ...
717 // including variations of the check against zero/non-zero value.
719 Value
*ShouldSameV
= nullptr, *ShouldXoredV
= nullptr;
722 ShouldXoredV
= FalseV
;
724 ShouldSameV
= FalseV
;
725 ShouldXoredV
= TrueV
;
728 Value
*Q
= nullptr, *R
= nullptr, *Y
= nullptr, *Z
= nullptr;
730 if (match(ShouldXoredV
, m_Xor(m_Value(Y
), m_Value(Z
)))) {
731 // Matched: select +++ ? ... : Y ^ Z
732 // select +++ ? Y ^ Z : ...
733 // where +++ denotes previously checked matches.
734 if (ShouldSameV
== Y
)
736 else if (ShouldSameV
== Z
)
741 // Matched: select +++ ? R : R ^ T
742 // select +++ ? R ^ T : R
743 // depending on TrueIfZero.
745 } else if (match(ShouldSameV
, m_Zero())) {
746 // Matched: select +++ ? 0 : ...
747 // select +++ ? ... : 0
748 if (!SelI
->hasOneUse())
751 // Matched: select +++ ? 0 : T
752 // select +++ ? T : 0
754 Value
*U
= *SelI
->user_begin();
755 if (!match(U
, m_Xor(m_Specific(SelI
), m_Value(R
))) &&
756 !match(U
, m_Xor(m_Value(R
), m_Specific(SelI
))))
758 // Matched: xor (select +++ ? 0 : T), R
759 // xor (select +++ ? T : 0), R
763 // The xor input value T is isolated into its own match so that it could
764 // be checked against an induction variable containing a shifted bit
766 // For now, check against (Q << i).
767 if (!match(T
, m_Shl(m_Value(Q
), m_Specific(CIV
))) &&
768 !match(T
, m_Shl(m_ZExt(m_Value(Q
)), m_ZExt(m_Specific(CIV
)))))
770 // Matched: select +++ ? R : R ^ (Q << i)
771 // select +++ ? R ^ (Q << i) : R
780 bool PolynomialMultiplyRecognize::matchRightShift(SelectInst
*SelI
,
782 // Match the following:
783 // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
784 // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
785 // The condition may also check for equality with the masked value, i.e
786 // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
787 // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
789 Value
*CondV
= SelI
->getCondition();
790 Value
*TrueV
= SelI
->getTrueValue();
791 Value
*FalseV
= SelI
->getFalseValue();
793 using namespace PatternMatch
;
796 CmpInst::Predicate P
;
799 if (match(CondV
, m_ICmp(P
, m_Value(C
), m_Zero())) ||
800 match(CondV
, m_ICmp(P
, m_Zero(), m_Value(C
)))) {
801 if (P
!= CmpInst::ICMP_EQ
&& P
!= CmpInst::ICMP_NE
)
803 // Matched: select C == 0 ? ... : ...
804 // select C != 0 ? ... : ...
805 TrueIfZero
= (P
== CmpInst::ICMP_EQ
);
806 } else if (match(CondV
, m_ICmp(P
, m_Value(C
), m_One())) ||
807 match(CondV
, m_ICmp(P
, m_One(), m_Value(C
)))) {
808 if (P
!= CmpInst::ICMP_EQ
&& P
!= CmpInst::ICMP_NE
)
810 // Matched: select C == 1 ? ... : ...
811 // select C != 1 ? ... : ...
812 TrueIfZero
= (P
== CmpInst::ICMP_NE
);
817 if (!match(C
, m_And(m_Value(X
), m_One())) &&
818 !match(C
, m_And(m_One(), m_Value(X
))))
820 // Matched: select (X & 1) == +++ ? ... : ...
821 // select (X & 1) != +++ ? ... : ...
823 Value
*R
= nullptr, *Q
= nullptr;
825 // The select's condition is true if the tested bit is 0.
826 // TrueV must be the shift, FalseV must be the xor.
827 if (!match(TrueV
, m_LShr(m_Value(R
), m_One())))
829 // Matched: select +++ ? (R >> 1) : ...
830 if (!match(FalseV
, m_Xor(m_Specific(TrueV
), m_Value(Q
))) &&
831 !match(FalseV
, m_Xor(m_Value(Q
), m_Specific(TrueV
))))
833 // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
836 // The select's condition is true if the tested bit is 1.
837 // TrueV must be the xor, FalseV must be the shift.
838 if (!match(FalseV
, m_LShr(m_Value(R
), m_One())))
840 // Matched: select +++ ? ... : (R >> 1)
841 if (!match(TrueV
, m_Xor(m_Specific(FalseV
), m_Value(Q
))) &&
842 !match(TrueV
, m_Xor(m_Value(Q
), m_Specific(FalseV
))))
844 // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
855 bool PolynomialMultiplyRecognize::scanSelect(SelectInst
*SelI
,
856 BasicBlock
*LoopB
, BasicBlock
*PrehB
, Value
*CIV
, ParsedValues
&PV
,
858 using namespace PatternMatch
;
860 // The basic pattern for R = P.Q is:
863 // if (P & (1 << i)) ; test-bit(P, i)
866 // Similarly, the basic pattern for R = (P/Q).Q - P
872 // There exist idioms, where instead of Q being shifted left, P is shifted
873 // right. This produces a result that is shifted right by 32 bits (the
874 // non-shifted result is 64-bit).
876 // For R = P.Q, this would be:
880 // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
881 // else ; be shifted by 1, not i.
884 // And for the inverse:
892 // The left-shifting idioms share the same pattern:
893 // select (X & (1 << i)) ? R ^ (Q << i) : R
894 // Similarly for right-shifting idioms:
895 // select (X & 1) ? (R >> 1) ^ Q
897 if (matchLeftShift(SelI
, CIV
, PV
)) {
898 // If this is a pre-scan, getting this far is sufficient.
902 // Need to make sure that the SelI goes back into R.
903 auto *RPhi
= dyn_cast
<PHINode
>(PV
.R
);
906 if (SelI
!= RPhi
->getIncomingValueForBlock(LoopB
))
910 // If X is loop invariant, it must be the input polynomial, and the
911 // idiom is the basic polynomial multiply.
912 if (CurLoop
->isLoopInvariant(PV
.X
)) {
916 // X is not loop invariant. If X == R, this is the inverse pmpy.
917 // Otherwise, check for an xor with an invariant value. If the
918 // variable argument to the xor is R, then this is still a valid
922 Value
*Var
= nullptr, *Inv
= nullptr, *X1
= nullptr, *X2
= nullptr;
923 if (!match(PV
.X
, m_Xor(m_Value(X1
), m_Value(X2
))))
925 auto *I1
= dyn_cast
<Instruction
>(X1
);
926 auto *I2
= dyn_cast
<Instruction
>(X2
);
927 if (!I1
|| I1
->getParent() != LoopB
) {
930 } else if (!I2
|| I2
->getParent() != LoopB
) {
939 // The input polynomial P still needs to be determined. It will be
940 // the entry value of R.
941 Value
*EntryP
= RPhi
->getIncomingValueForBlock(PrehB
);
948 if (matchRightShift(SelI
, PV
)) {
949 // If this is an inverse pattern, the Q polynomial must be known at
951 if (PV
.Inv
&& !isa
<ConstantInt
>(PV
.Q
))
955 // There is no exact matching of right-shift pmpy.
962 bool PolynomialMultiplyRecognize::isPromotableTo(Value
*Val
,
963 IntegerType
*DestTy
) {
964 IntegerType
*T
= dyn_cast
<IntegerType
>(Val
->getType());
965 if (!T
|| T
->getBitWidth() > DestTy
->getBitWidth())
967 if (T
->getBitWidth() == DestTy
->getBitWidth())
969 // Non-instructions are promotable. The reason why an instruction may not
970 // be promotable is that it may produce a different result if its operands
971 // and the result are promoted, for example, it may produce more non-zero
972 // bits. While it would still be possible to represent the proper result
973 // in a wider type, it may require adding additional instructions (which
974 // we don't want to do).
975 Instruction
*In
= dyn_cast
<Instruction
>(Val
);
978 // The bitwidth of the source type is smaller than the destination.
979 // Check if the individual operation can be promoted.
980 switch (In
->getOpcode()) {
981 case Instruction::PHI
:
982 case Instruction::ZExt
:
983 case Instruction::And
:
984 case Instruction::Or
:
985 case Instruction::Xor
:
986 case Instruction::LShr
: // Shift right is ok.
987 case Instruction::Select
:
988 case Instruction::Trunc
:
990 case Instruction::ICmp
:
991 if (CmpInst
*CI
= cast
<CmpInst
>(In
))
992 return CI
->isEquality() || CI
->isUnsigned();
993 llvm_unreachable("Cast failed unexpectedly");
994 case Instruction::Add
:
995 return In
->hasNoSignedWrap() && In
->hasNoUnsignedWrap();
1000 void PolynomialMultiplyRecognize::promoteTo(Instruction
*In
,
1001 IntegerType
*DestTy
, BasicBlock
*LoopB
) {
1002 Type
*OrigTy
= In
->getType();
1003 assert(!OrigTy
->isVoidTy() && "Invalid instruction to promote");
1005 // Leave boolean values alone.
1006 if (!In
->getType()->isIntegerTy(1))
1007 In
->mutateType(DestTy
);
1008 unsigned DestBW
= DestTy
->getBitWidth();
1011 if (PHINode
*P
= dyn_cast
<PHINode
>(In
)) {
1012 unsigned N
= P
->getNumIncomingValues();
1013 for (unsigned i
= 0; i
!= N
; ++i
) {
1014 BasicBlock
*InB
= P
->getIncomingBlock(i
);
1017 Value
*InV
= P
->getIncomingValue(i
);
1018 IntegerType
*Ty
= cast
<IntegerType
>(InV
->getType());
1019 // Do not promote values in PHI nodes of type i1.
1020 if (Ty
!= P
->getType()) {
1021 // If the value type does not match the PHI type, the PHI type
1022 // must have been promoted.
1023 assert(Ty
->getBitWidth() < DestBW
);
1024 InV
= IRBuilder
<>(InB
->getTerminator()).CreateZExt(InV
, DestTy
);
1025 P
->setIncomingValue(i
, InV
);
1028 } else if (ZExtInst
*Z
= dyn_cast
<ZExtInst
>(In
)) {
1029 Value
*Op
= Z
->getOperand(0);
1030 if (Op
->getType() == Z
->getType())
1031 Z
->replaceAllUsesWith(Op
);
1032 Z
->eraseFromParent();
1035 if (TruncInst
*T
= dyn_cast
<TruncInst
>(In
)) {
1036 IntegerType
*TruncTy
= cast
<IntegerType
>(OrigTy
);
1037 Value
*Mask
= ConstantInt::get(DestTy
, (1u << TruncTy
->getBitWidth()) - 1);
1038 Value
*And
= IRBuilder
<>(In
).CreateAnd(T
->getOperand(0), Mask
);
1039 T
->replaceAllUsesWith(And
);
1040 T
->eraseFromParent();
1044 // Promote immediates.
1045 for (unsigned i
= 0, n
= In
->getNumOperands(); i
!= n
; ++i
) {
1046 if (ConstantInt
*CI
= dyn_cast
<ConstantInt
>(In
->getOperand(i
)))
1047 if (CI
->getType()->getBitWidth() < DestBW
)
1048 In
->setOperand(i
, ConstantInt::get(DestTy
, CI
->getZExtValue()));
1052 bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock
*LoopB
,
1053 BasicBlock
*ExitB
) {
1055 // Skip loops where the exit block has more than one predecessor. The values
1056 // coming from the loop block will be promoted to another type, and so the
1057 // values coming into the exit block from other predecessors would also have
1059 if (!ExitB
|| (ExitB
->getSinglePredecessor() != LoopB
))
1061 IntegerType
*DestTy
= getPmpyType();
1062 // Check if the exit values have types that are no wider than the type
1063 // that we want to promote to.
1064 unsigned DestBW
= DestTy
->getBitWidth();
1065 for (PHINode
&P
: ExitB
->phis()) {
1066 if (P
.getNumIncomingValues() != 1)
1068 assert(P
.getIncomingBlock(0) == LoopB
);
1069 IntegerType
*T
= dyn_cast
<IntegerType
>(P
.getType());
1070 if (!T
|| T
->getBitWidth() > DestBW
)
1074 // Check all instructions in the loop.
1075 for (Instruction
&In
: *LoopB
)
1076 if (!In
.isTerminator() && !isPromotableTo(&In
, DestTy
))
1079 // Perform the promotion.
1080 std::vector
<Instruction
*> LoopIns
;
1081 std::transform(LoopB
->begin(), LoopB
->end(), std::back_inserter(LoopIns
),
1082 [](Instruction
&In
) { return &In
; });
1083 for (Instruction
*In
: LoopIns
)
1084 if (!In
->isTerminator())
1085 promoteTo(In
, DestTy
, LoopB
);
1087 // Fix up the PHI nodes in the exit block.
1088 Instruction
*EndI
= ExitB
->getFirstNonPHI();
1089 BasicBlock::iterator End
= EndI
? EndI
->getIterator() : ExitB
->end();
1090 for (auto I
= ExitB
->begin(); I
!= End
; ++I
) {
1091 PHINode
*P
= dyn_cast
<PHINode
>(I
);
1094 Type
*Ty0
= P
->getIncomingValue(0)->getType();
1095 Type
*PTy
= P
->getType();
1097 assert(Ty0
== DestTy
);
1098 // In order to create the trunc, P must have the promoted type.
1100 Value
*T
= IRBuilder
<>(ExitB
, End
).CreateTrunc(P
, PTy
);
1101 // In order for the RAUW to work, the types of P and T must match.
1103 P
->replaceAllUsesWith(T
);
1104 // Final update of the P's type.
1106 cast
<Instruction
>(T
)->setOperand(0, P
);
1113 bool PolynomialMultiplyRecognize::findCycle(Value
*Out
, Value
*In
,
1115 // Out = ..., In, ...
1119 auto *BB
= cast
<Instruction
>(Out
)->getParent();
1120 bool HadPhi
= false;
1122 for (auto U
: Out
->users()) {
1123 auto *I
= dyn_cast
<Instruction
>(&*U
);
1124 if (I
== nullptr || I
->getParent() != BB
)
1126 // Make sure that there are no multi-iteration cycles, e.g.
1129 // The cycle p1->p2->p1 would span two loop iterations.
1130 // Check that there is only one phi in the cycle.
1131 bool IsPhi
= isa
<PHINode
>(I
);
1132 if (IsPhi
&& HadPhi
)
1138 if (findCycle(I
, In
, Cycle
))
1142 return !Cycle
.empty();
1145 void PolynomialMultiplyRecognize::classifyCycle(Instruction
*DivI
,
1146 ValueSeq
&Cycle
, ValueSeq
&Early
, ValueSeq
&Late
) {
1147 // All the values in the cycle that are between the phi node and the
1148 // divider instruction will be classified as "early", all other values
1152 unsigned I
, N
= Cycle
.size();
1153 for (I
= 0; I
< N
; ++I
) {
1154 Value
*V
= Cycle
[I
];
1157 else if (!isa
<PHINode
>(V
))
1159 // Stop if found either.
1162 // "I" is the index of either DivI or the phi node, whichever was first.
1163 // "E" is "false" or "true" respectively.
1164 ValueSeq
&First
= !IsE
? Early
: Late
;
1165 for (unsigned J
= 0; J
< I
; ++J
)
1166 First
.insert(Cycle
[J
]);
1168 ValueSeq
&Second
= IsE
? Early
: Late
;
1169 Second
.insert(Cycle
[I
]);
1170 for (++I
; I
< N
; ++I
) {
1171 Value
*V
= Cycle
[I
];
1172 if (DivI
== V
|| isa
<PHINode
>(V
))
1178 First
.insert(Cycle
[I
]);
1181 bool PolynomialMultiplyRecognize::classifyInst(Instruction
*UseI
,
1182 ValueSeq
&Early
, ValueSeq
&Late
) {
1183 // Select is an exception, since the condition value does not have to be
1184 // classified in the same way as the true/false values. The true/false
1185 // values do have to be both early or both late.
1186 if (UseI
->getOpcode() == Instruction::Select
) {
1187 Value
*TV
= UseI
->getOperand(1), *FV
= UseI
->getOperand(2);
1188 if (Early
.count(TV
) || Early
.count(FV
)) {
1189 if (Late
.count(TV
) || Late
.count(FV
))
1192 } else if (Late
.count(TV
) || Late
.count(FV
)) {
1193 if (Early
.count(TV
) || Early
.count(FV
))
1200 // Not sure what would be the example of this, but the code below relies
1201 // on having at least one operand.
1202 if (UseI
->getNumOperands() == 0)
1205 bool AE
= true, AL
= true;
1206 for (auto &I
: UseI
->operands()) {
1207 if (Early
.count(&*I
))
1209 else if (Late
.count(&*I
))
1212 // If the operands appear "all early" and "all late" at the same time,
1213 // then it means that none of them are actually classified as either.
1214 // This is harmless.
1217 // Conversely, if they are neither "all early" nor "all late", then
1218 // we have a mixture of early and late operands that is not a known
1223 // Check that we have covered the two special cases.
1233 bool PolynomialMultiplyRecognize::commutesWithShift(Instruction
*I
) {
1234 switch (I
->getOpcode()) {
1235 case Instruction::And
:
1236 case Instruction::Or
:
1237 case Instruction::Xor
:
1238 case Instruction::LShr
:
1239 case Instruction::Shl
:
1240 case Instruction::Select
:
1241 case Instruction::ICmp
:
1242 case Instruction::PHI
:
1250 bool PolynomialMultiplyRecognize::highBitsAreZero(Value
*V
,
1251 unsigned IterCount
) {
1252 auto *T
= dyn_cast
<IntegerType
>(V
->getType());
1256 KnownBits
Known(T
->getBitWidth());
1257 computeKnownBits(V
, Known
, DL
);
1258 return Known
.countMinLeadingZeros() >= IterCount
;
1261 bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value
*V
,
1262 unsigned IterCount
) {
1263 // Assume that all inputs to the value have the high bits zero.
1264 // Check if the value itself preserves the zeros in the high bits.
1265 if (auto *C
= dyn_cast
<ConstantInt
>(V
))
1266 return C
->getValue().countLeadingZeros() >= IterCount
;
1268 if (auto *I
= dyn_cast
<Instruction
>(V
)) {
1269 switch (I
->getOpcode()) {
1270 case Instruction::And
:
1271 case Instruction::Or
:
1272 case Instruction::Xor
:
1273 case Instruction::LShr
:
1274 case Instruction::Select
:
1275 case Instruction::ICmp
:
1276 case Instruction::PHI
:
1277 case Instruction::ZExt
:
1285 bool PolynomialMultiplyRecognize::isOperandShifted(Instruction
*I
, Value
*Op
) {
1286 unsigned Opc
= I
->getOpcode();
1287 if (Opc
== Instruction::Shl
|| Opc
== Instruction::LShr
)
1288 return Op
!= I
->getOperand(1);
1292 bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock
*LoopB
,
1293 BasicBlock
*ExitB
, unsigned IterCount
) {
1294 Value
*CIV
= getCountIV(LoopB
);
1297 auto *CIVTy
= dyn_cast
<IntegerType
>(CIV
->getType());
1298 if (CIVTy
== nullptr)
1302 ValueSeq Early
, Late
, Cycled
;
1304 // Find all value cycles that contain logical right shifts by 1.
1305 for (Instruction
&I
: *LoopB
) {
1306 using namespace PatternMatch
;
1309 if (!match(&I
, m_LShr(m_Value(V
), m_One())))
1312 if (!findCycle(&I
, V
, C
))
1317 classifyCycle(&I
, C
, Early
, Late
);
1318 Cycled
.insert(C
.begin(), C
.end());
1322 // Find the set of all values affected by the shift cycles, i.e. all
1323 // cycled values, and (recursively) all their users.
1324 ValueSeq
Users(Cycled
.begin(), Cycled
.end());
1325 for (unsigned i
= 0; i
< Users
.size(); ++i
) {
1326 Value
*V
= Users
[i
];
1327 if (!isa
<IntegerType
>(V
->getType()))
1329 auto *R
= cast
<Instruction
>(V
);
1330 // If the instruction does not commute with shifts, the loop cannot
1332 if (!commutesWithShift(R
))
1334 for (auto I
= R
->user_begin(), E
= R
->user_end(); I
!= E
; ++I
) {
1335 auto *T
= cast
<Instruction
>(*I
);
1336 // Skip users from outside of the loop. They will be handled later.
1337 // Also, skip the right-shifts and phi nodes, since they mix early
1339 if (T
->getParent() != LoopB
|| RShifts
.count(T
) || isa
<PHINode
>(T
))
1343 if (!classifyInst(T
, Early
, Late
))
1351 // Verify that high bits remain zero.
1352 ValueSeq
Internal(Users
.begin(), Users
.end());
1354 for (unsigned i
= 0; i
< Internal
.size(); ++i
) {
1355 auto *R
= dyn_cast
<Instruction
>(Internal
[i
]);
1358 for (Value
*Op
: R
->operands()) {
1359 auto *T
= dyn_cast
<Instruction
>(Op
);
1360 if (T
&& T
->getParent() != LoopB
)
1363 Internal
.insert(Op
);
1366 for (Value
*V
: Inputs
)
1367 if (!highBitsAreZero(V
, IterCount
))
1369 for (Value
*V
: Internal
)
1370 if (!keepsHighBitsZero(V
, IterCount
))
1373 // Finally, the work can be done. Unshift each user.
1374 IRBuilder
<> IRB(LoopB
);
1375 std::map
<Value
*,Value
*> ShiftMap
;
1377 using CastMapType
= std::map
<std::pair
<Value
*, Type
*>, Value
*>;
1379 CastMapType CastMap
;
1381 auto upcast
= [] (CastMapType
&CM
, IRBuilder
<> &IRB
, Value
*V
,
1382 IntegerType
*Ty
) -> Value
* {
1383 auto H
= CM
.find(std::make_pair(V
, Ty
));
1386 Value
*CV
= IRB
.CreateIntCast(V
, Ty
, false);
1387 CM
.insert(std::make_pair(std::make_pair(V
, Ty
), CV
));
1391 for (auto I
= LoopB
->begin(), E
= LoopB
->end(); I
!= E
; ++I
) {
1392 using namespace PatternMatch
;
1394 if (isa
<PHINode
>(I
) || !Users
.count(&*I
))
1399 if (match(&*I
, m_LShr(m_Value(V
), m_One()))) {
1400 replaceAllUsesOfWithIn(&*I
, V
, LoopB
);
1403 // For each non-cycled operand, replace it with the corresponding
1404 // value shifted left.
1405 for (auto &J
: I
->operands()) {
1406 Value
*Op
= J
.get();
1407 if (!isOperandShifted(&*I
, Op
))
1409 if (Users
.count(Op
))
1411 // Skip shifting zeros.
1412 if (isa
<ConstantInt
>(Op
) && cast
<ConstantInt
>(Op
)->isZero())
1414 // Check if we have already generated a shift for this value.
1415 auto F
= ShiftMap
.find(Op
);
1416 Value
*W
= (F
!= ShiftMap
.end()) ? F
->second
: nullptr;
1418 IRB
.SetInsertPoint(&*I
);
1419 // First, the shift amount will be CIV or CIV+1, depending on
1420 // whether the value is early or late. Instead of creating CIV+1,
1421 // do a single shift of the value.
1422 Value
*ShAmt
= CIV
, *ShVal
= Op
;
1423 auto *VTy
= cast
<IntegerType
>(ShVal
->getType());
1424 auto *ATy
= cast
<IntegerType
>(ShAmt
->getType());
1425 if (Late
.count(&*I
))
1426 ShVal
= IRB
.CreateShl(Op
, ConstantInt::get(VTy
, 1));
1427 // Second, the types of the shifted value and the shift amount
1430 if (VTy
->getBitWidth() < ATy
->getBitWidth())
1431 ShVal
= upcast(CastMap
, IRB
, ShVal
, ATy
);
1433 ShAmt
= upcast(CastMap
, IRB
, ShAmt
, VTy
);
1435 // Ready to generate the shift and memoize it.
1436 W
= IRB
.CreateShl(ShVal
, ShAmt
);
1437 ShiftMap
.insert(std::make_pair(Op
, W
));
1439 I
->replaceUsesOfWith(Op
, W
);
1443 // Update the users outside of the loop to account for having left
1444 // shifts. They would normally be shifted right in the loop, so shift
1445 // them right after the loop exit.
1446 // Take advantage of the loop-closed SSA form, which has all the post-
1447 // loop values in phi nodes.
1448 IRB
.SetInsertPoint(ExitB
, ExitB
->getFirstInsertionPt());
1449 for (auto P
= ExitB
->begin(), Q
= ExitB
->end(); P
!= Q
; ++P
) {
1450 if (!isa
<PHINode
>(P
))
1452 auto *PN
= cast
<PHINode
>(P
);
1453 Value
*U
= PN
->getIncomingValueForBlock(LoopB
);
1454 if (!Users
.count(U
))
1456 Value
*S
= IRB
.CreateLShr(PN
, ConstantInt::get(PN
->getType(), IterCount
));
1457 PN
->replaceAllUsesWith(S
);
1458 // The above RAUW will create
1459 // S = lshr S, IterCount
1460 // so we need to fix it back into
1461 // S = lshr PN, IterCount
1462 cast
<User
>(S
)->replaceUsesOfWith(S
, PN
);
1468 void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock
*LoopB
) {
1469 for (auto &I
: *LoopB
)
1470 if (Value
*SV
= SimplifyInstruction(&I
, {DL
, &TLI
, &DT
}))
1471 I
.replaceAllUsesWith(SV
);
1473 for (auto I
= LoopB
->begin(), N
= I
; I
!= LoopB
->end(); I
= N
) {
1475 RecursivelyDeleteTriviallyDeadInstructions(&*I
, &TLI
);
1479 unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP
) {
1480 // Arrays of coefficients of Q and the inverse, C.
1481 // Q[i] = coefficient at x^i.
1482 std::array
<char,32> Q
, C
;
1484 for (unsigned i
= 0; i
< 32; ++i
) {
1490 // Find C, such that
1491 // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1493 // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1494 // operations * and + are & and ^ respectively.
1496 // Find C[i] recursively, by comparing i-th coefficient in the product
1497 // with 0 (or 1 for i=0).
1499 // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1501 for (unsigned i
= 1; i
< 32; ++i
) {
1502 // Solve for C[i] in:
1503 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1504 // This is equivalent to
1505 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1507 // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1509 for (unsigned j
= 0; j
< i
; ++j
)
1510 T
= T
^ (C
[j
] & Q
[i
-j
]);
1515 for (unsigned i
= 0; i
< 32; ++i
)
1522 Value
*PolynomialMultiplyRecognize::generate(BasicBlock::iterator At
,
1524 IRBuilder
<> B(&*At
);
1525 Module
*M
= At
->getParent()->getParent()->getParent();
1526 Function
*PMF
= Intrinsic::getDeclaration(M
, Intrinsic::hexagon_M4_pmpyw
);
1528 Value
*P
= PV
.P
, *Q
= PV
.Q
, *P0
= P
;
1529 unsigned IC
= PV
.IterCount
;
1531 if (PV
.M
!= nullptr)
1532 P0
= P
= B
.CreateXor(P
, PV
.M
);
1534 // Create a bit mask to clear the high bits beyond IterCount.
1535 auto *BMI
= ConstantInt::get(P
->getType(), APInt::getLowBitsSet(32, IC
));
1537 if (PV
.IterCount
!= 32)
1538 P
= B
.CreateAnd(P
, BMI
);
1541 auto *QI
= dyn_cast
<ConstantInt
>(PV
.Q
);
1542 assert(QI
&& QI
->getBitWidth() <= 32);
1544 // Again, clearing bits beyond IterCount.
1545 unsigned M
= (1 << PV
.IterCount
) - 1;
1546 unsigned Tmp
= (QI
->getZExtValue() | 1) & M
;
1547 unsigned QV
= getInverseMxN(Tmp
) & M
;
1548 auto *QVI
= ConstantInt::get(QI
->getType(), QV
);
1549 P
= B
.CreateCall(PMF
, {P
, QVI
});
1550 P
= B
.CreateTrunc(P
, QI
->getType());
1552 P
= B
.CreateAnd(P
, BMI
);
1555 Value
*R
= B
.CreateCall(PMF
, {P
, Q
});
1557 if (PV
.M
!= nullptr)
1558 R
= B
.CreateXor(R
, B
.CreateIntCast(P0
, R
->getType(), false));
1563 static bool hasZeroSignBit(const Value
*V
) {
1564 if (const auto *CI
= dyn_cast
<const ConstantInt
>(V
))
1565 return (CI
->getType()->getSignBit() & CI
->getSExtValue()) == 0;
1566 const Instruction
*I
= dyn_cast
<const Instruction
>(V
);
1569 switch (I
->getOpcode()) {
1570 case Instruction::LShr
:
1571 if (const auto SI
= dyn_cast
<const ConstantInt
>(I
->getOperand(1)))
1572 return SI
->getZExtValue() > 0;
1574 case Instruction::Or
:
1575 case Instruction::Xor
:
1576 return hasZeroSignBit(I
->getOperand(0)) &&
1577 hasZeroSignBit(I
->getOperand(1));
1578 case Instruction::And
:
1579 return hasZeroSignBit(I
->getOperand(0)) ||
1580 hasZeroSignBit(I
->getOperand(1));
1585 void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier
&S
) {
1586 S
.addRule("sink-zext",
1587 // Sink zext past bitwise operations.
1588 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1589 if (I
->getOpcode() != Instruction::ZExt
)
1591 Instruction
*T
= dyn_cast
<Instruction
>(I
->getOperand(0));
1594 switch (T
->getOpcode()) {
1595 case Instruction::And
:
1596 case Instruction::Or
:
1597 case Instruction::Xor
:
1603 return B
.CreateBinOp(cast
<BinaryOperator
>(T
)->getOpcode(),
1604 B
.CreateZExt(T
->getOperand(0), I
->getType()),
1605 B
.CreateZExt(T
->getOperand(1), I
->getType()));
1607 S
.addRule("xor/and -> and/xor",
1608 // (xor (and x a) (and y a)) -> (and (xor x y) a)
1609 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1610 if (I
->getOpcode() != Instruction::Xor
)
1612 Instruction
*And0
= dyn_cast
<Instruction
>(I
->getOperand(0));
1613 Instruction
*And1
= dyn_cast
<Instruction
>(I
->getOperand(1));
1616 if (And0
->getOpcode() != Instruction::And
||
1617 And1
->getOpcode() != Instruction::And
)
1619 if (And0
->getOperand(1) != And1
->getOperand(1))
1622 return B
.CreateAnd(B
.CreateXor(And0
->getOperand(0), And1
->getOperand(0)),
1623 And0
->getOperand(1));
1625 S
.addRule("sink binop into select",
1626 // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1627 // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1628 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1629 BinaryOperator
*BO
= dyn_cast
<BinaryOperator
>(I
);
1632 Instruction::BinaryOps Op
= BO
->getOpcode();
1633 if (SelectInst
*Sel
= dyn_cast
<SelectInst
>(BO
->getOperand(0))) {
1635 Value
*X
= Sel
->getTrueValue(), *Y
= Sel
->getFalseValue();
1636 Value
*Z
= BO
->getOperand(1);
1637 return B
.CreateSelect(Sel
->getCondition(),
1638 B
.CreateBinOp(Op
, X
, Z
),
1639 B
.CreateBinOp(Op
, Y
, Z
));
1641 if (SelectInst
*Sel
= dyn_cast
<SelectInst
>(BO
->getOperand(1))) {
1643 Value
*X
= BO
->getOperand(0);
1644 Value
*Y
= Sel
->getTrueValue(), *Z
= Sel
->getFalseValue();
1645 return B
.CreateSelect(Sel
->getCondition(),
1646 B
.CreateBinOp(Op
, X
, Y
),
1647 B
.CreateBinOp(Op
, X
, Z
));
1651 S
.addRule("fold select-select",
1652 // (select c (select c x y) z) -> (select c x z)
1653 // (select c x (select c y z)) -> (select c x z)
1654 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1655 SelectInst
*Sel
= dyn_cast
<SelectInst
>(I
);
1659 Value
*C
= Sel
->getCondition();
1660 if (SelectInst
*Sel0
= dyn_cast
<SelectInst
>(Sel
->getTrueValue())) {
1661 if (Sel0
->getCondition() == C
)
1662 return B
.CreateSelect(C
, Sel0
->getTrueValue(), Sel
->getFalseValue());
1664 if (SelectInst
*Sel1
= dyn_cast
<SelectInst
>(Sel
->getFalseValue())) {
1665 if (Sel1
->getCondition() == C
)
1666 return B
.CreateSelect(C
, Sel
->getTrueValue(), Sel1
->getFalseValue());
1670 S
.addRule("or-signbit -> xor-signbit",
1671 // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1672 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1673 if (I
->getOpcode() != Instruction::Or
)
1675 ConstantInt
*Msb
= dyn_cast
<ConstantInt
>(I
->getOperand(1));
1676 if (!Msb
|| Msb
->getZExtValue() != Msb
->getType()->getSignBit())
1678 if (!hasZeroSignBit(I
->getOperand(0)))
1680 return IRBuilder
<>(Ctx
).CreateXor(I
->getOperand(0), Msb
);
1682 S
.addRule("sink lshr into binop",
1683 // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1684 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1685 if (I
->getOpcode() != Instruction::LShr
)
1687 BinaryOperator
*BitOp
= dyn_cast
<BinaryOperator
>(I
->getOperand(0));
1690 switch (BitOp
->getOpcode()) {
1691 case Instruction::And
:
1692 case Instruction::Or
:
1693 case Instruction::Xor
:
1699 Value
*S
= I
->getOperand(1);
1700 return B
.CreateBinOp(BitOp
->getOpcode(),
1701 B
.CreateLShr(BitOp
->getOperand(0), S
),
1702 B
.CreateLShr(BitOp
->getOperand(1), S
));
1704 S
.addRule("expose bitop-const",
1705 // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1706 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1707 auto IsBitOp
= [](unsigned Op
) -> bool {
1709 case Instruction::And
:
1710 case Instruction::Or
:
1711 case Instruction::Xor
:
1716 BinaryOperator
*BitOp1
= dyn_cast
<BinaryOperator
>(I
);
1717 if (!BitOp1
|| !IsBitOp(BitOp1
->getOpcode()))
1719 BinaryOperator
*BitOp2
= dyn_cast
<BinaryOperator
>(BitOp1
->getOperand(0));
1720 if (!BitOp2
|| !IsBitOp(BitOp2
->getOpcode()))
1722 ConstantInt
*CA
= dyn_cast
<ConstantInt
>(BitOp2
->getOperand(1));
1723 ConstantInt
*CB
= dyn_cast
<ConstantInt
>(BitOp1
->getOperand(1));
1727 Value
*X
= BitOp2
->getOperand(0);
1728 return B
.CreateBinOp(BitOp2
->getOpcode(), X
,
1729 B
.CreateBinOp(BitOp1
->getOpcode(), CA
, CB
));
1733 void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier
&S
) {
1734 S
.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
1735 [](Instruction
*I
, LLVMContext
&Ctx
) -> Value
* {
1736 if (I
->getOpcode() != Instruction::And
)
1738 Instruction
*Xor
= dyn_cast
<Instruction
>(I
->getOperand(0));
1739 ConstantInt
*C0
= dyn_cast
<ConstantInt
>(I
->getOperand(1));
1742 if (Xor
->getOpcode() != Instruction::Xor
)
1744 Instruction
*And0
= dyn_cast
<Instruction
>(Xor
->getOperand(0));
1745 Instruction
*And1
= dyn_cast
<Instruction
>(Xor
->getOperand(1));
1746 // Pick the first non-null and.
1747 if (!And0
|| And0
->getOpcode() != Instruction::And
)
1748 std::swap(And0
, And1
);
1749 ConstantInt
*C1
= dyn_cast
<ConstantInt
>(And0
->getOperand(1));
1752 uint32_t V0
= C0
->getZExtValue();
1753 uint32_t V1
= C1
->getZExtValue();
1754 if (V0
!= (V0
& V1
))
1757 return B
.CreateAnd(B
.CreateXor(And0
->getOperand(0), And1
), C0
);
1761 bool PolynomialMultiplyRecognize::recognize() {
1762 LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1763 << *CurLoop
<< '\n');
1765 // - The loop must consist of a single block.
1766 // - The iteration count must be known at compile-time.
1767 // - The loop must have an induction variable starting from 0, and
1768 // incremented in each iteration of the loop.
1769 BasicBlock
*LoopB
= CurLoop
->getHeader();
1770 LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB
);
1772 if (LoopB
!= CurLoop
->getLoopLatch())
1774 BasicBlock
*ExitB
= CurLoop
->getExitBlock();
1775 if (ExitB
== nullptr)
1777 BasicBlock
*EntryB
= CurLoop
->getLoopPreheader();
1778 if (EntryB
== nullptr)
1781 unsigned IterCount
= 0;
1782 const SCEV
*CT
= SE
.getBackedgeTakenCount(CurLoop
);
1783 if (isa
<SCEVCouldNotCompute
>(CT
))
1785 if (auto *CV
= dyn_cast
<SCEVConstant
>(CT
))
1786 IterCount
= CV
->getValue()->getZExtValue() + 1;
1788 Value
*CIV
= getCountIV(LoopB
);
1791 PV
.IterCount
= IterCount
;
1792 LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV
<< "\nIterCount: " << IterCount
1795 setupPreSimplifier(PreSimp
);
1797 // Perform a preliminary scan of select instructions to see if any of them
1798 // looks like a generator of the polynomial multiply steps. Assume that a
1799 // loop can only contain a single transformable operation, so stop the
1800 // traversal after the first reasonable candidate was found.
1801 // XXX: Currently this approach can modify the loop before being 100% sure
1802 // that the transformation can be carried out.
1803 bool FoundPreScan
= false;
1804 auto FeedsPHI
= [LoopB
](const Value
*V
) -> bool {
1805 for (const Value
*U
: V
->users()) {
1806 if (const auto *P
= dyn_cast
<const PHINode
>(U
))
1807 if (P
->getParent() == LoopB
)
1812 for (Instruction
&In
: *LoopB
) {
1813 SelectInst
*SI
= dyn_cast
<SelectInst
>(&In
);
1814 if (!SI
|| !FeedsPHI(SI
))
1817 Simplifier::Context
C(SI
);
1818 Value
*T
= PreSimp
.simplify(C
);
1819 SelectInst
*SelI
= (T
&& isa
<SelectInst
>(T
)) ? cast
<SelectInst
>(T
) : SI
;
1820 LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C
, SelI
) << '\n');
1821 if (scanSelect(SelI
, LoopB
, EntryB
, CIV
, PV
, true)) {
1822 FoundPreScan
= true;
1824 Value
*NewSel
= C
.materialize(LoopB
, SI
->getIterator());
1825 SI
->replaceAllUsesWith(NewSel
);
1826 RecursivelyDeleteTriviallyDeadInstructions(SI
, &TLI
);
1832 if (!FoundPreScan
) {
1833 LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1838 // The right shift version actually only returns the higher bits of
1839 // the result (each iteration discards the LSB). If we want to convert it
1840 // to a left-shifting loop, the working data type must be at least as
1841 // wide as the target's pmpy instruction.
1842 if (!promoteTypes(LoopB
, ExitB
))
1844 // Run post-promotion simplifications.
1845 Simplifier PostSimp
;
1846 setupPostSimplifier(PostSimp
);
1847 for (Instruction
&In
: *LoopB
) {
1848 SelectInst
*SI
= dyn_cast
<SelectInst
>(&In
);
1849 if (!SI
|| !FeedsPHI(SI
))
1851 Simplifier::Context
C(SI
);
1852 Value
*T
= PostSimp
.simplify(C
);
1853 SelectInst
*SelI
= dyn_cast_or_null
<SelectInst
>(T
);
1855 Value
*NewSel
= C
.materialize(LoopB
, SI
->getIterator());
1856 SI
->replaceAllUsesWith(NewSel
);
1857 RecursivelyDeleteTriviallyDeadInstructions(SI
, &TLI
);
1862 if (!convertShiftsToLeft(LoopB
, ExitB
, IterCount
))
1864 cleanupLoopBody(LoopB
);
1867 // Scan the loop again, find the generating select instruction.
1868 bool FoundScan
= false;
1869 for (Instruction
&In
: *LoopB
) {
1870 SelectInst
*SelI
= dyn_cast
<SelectInst
>(&In
);
1873 LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI
<< '\n');
1874 FoundScan
= scanSelect(SelI
, LoopB
, EntryB
, CIV
, PV
, false);
1881 StringRef PP
= (PV
.M
? "(P+M)" : "P");
1883 dbgs() << "Found pmpy idiom: R = " << PP
<< ".Q\n";
1885 dbgs() << "Found inverse pmpy idiom: R = (" << PP
<< "/Q).Q) + "
1887 dbgs() << " Res:" << *PV
.Res
<< "\n P:" << *PV
.P
<< "\n";
1889 dbgs() << " M:" << *PV
.M
<< "\n";
1890 dbgs() << " Q:" << *PV
.Q
<< "\n";
1891 dbgs() << " Iteration count:" << PV
.IterCount
<< "\n";
1894 BasicBlock::iterator
At(EntryB
->getTerminator());
1895 Value
*PM
= generate(At
, PV
);
1899 if (PM
->getType() != PV
.Res
->getType())
1900 PM
= IRBuilder
<>(&*At
).CreateIntCast(PM
, PV
.Res
->getType(), false);
1902 PV
.Res
->replaceAllUsesWith(PM
);
1903 PV
.Res
->eraseFromParent();
1907 int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr
*S
) {
1908 if (const SCEVConstant
*SC
= dyn_cast
<SCEVConstant
>(S
->getOperand(1)))
1909 return SC
->getAPInt().getSExtValue();
1913 bool HexagonLoopIdiomRecognize::isLegalStore(Loop
*CurLoop
, StoreInst
*SI
) {
1914 // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1915 if (!(SI
->isVolatile() && HexagonVolatileMemcpy
) && !SI
->isSimple())
1918 Value
*StoredVal
= SI
->getValueOperand();
1919 Value
*StorePtr
= SI
->getPointerOperand();
1921 // Reject stores that are so large that they overflow an unsigned.
1922 uint64_t SizeInBits
= DL
->getTypeSizeInBits(StoredVal
->getType());
1923 if ((SizeInBits
& 7) || (SizeInBits
>> 32) != 0)
1926 // See if the pointer expression is an AddRec like {base,+,1} on the current
1927 // loop, which indicates a strided store. If we have something else, it's a
1928 // random store we can't handle.
1929 auto *StoreEv
= dyn_cast
<SCEVAddRecExpr
>(SE
->getSCEV(StorePtr
));
1930 if (!StoreEv
|| StoreEv
->getLoop() != CurLoop
|| !StoreEv
->isAffine())
1933 // Check to see if the stride matches the size of the store. If so, then we
1934 // know that every byte is touched in the loop.
1935 int Stride
= getSCEVStride(StoreEv
);
1938 unsigned StoreSize
= DL
->getTypeStoreSize(SI
->getValueOperand()->getType());
1939 if (StoreSize
!= unsigned(std::abs(Stride
)))
1942 // The store must be feeding a non-volatile load.
1943 LoadInst
*LI
= dyn_cast
<LoadInst
>(SI
->getValueOperand());
1944 if (!LI
|| !LI
->isSimple())
1947 // See if the pointer expression is an AddRec like {base,+,1} on the current
1948 // loop, which indicates a strided load. If we have something else, it's a
1949 // random load we can't handle.
1950 Value
*LoadPtr
= LI
->getPointerOperand();
1951 auto *LoadEv
= dyn_cast
<SCEVAddRecExpr
>(SE
->getSCEV(LoadPtr
));
1952 if (!LoadEv
|| LoadEv
->getLoop() != CurLoop
|| !LoadEv
->isAffine())
1955 // The store and load must share the same stride.
1956 if (StoreEv
->getOperand(1) != LoadEv
->getOperand(1))
1959 // Success. This store can be converted into a memcpy.
1963 /// mayLoopAccessLocation - Return true if the specified loop might access the
1964 /// specified pointer location, which is a loop-strided access. The 'Access'
1965 /// argument specifies what the verboten forms of access are (read or write).
1967 mayLoopAccessLocation(Value
*Ptr
, ModRefInfo Access
, Loop
*L
,
1968 const SCEV
*BECount
, unsigned StoreSize
,
1970 SmallPtrSetImpl
<Instruction
*> &Ignored
) {
1971 // Get the location that may be stored across the loop. Since the access
1972 // is strided positively through memory, we say that the modified location
1973 // starts at the pointer and has infinite size.
1974 LocationSize AccessSize
= LocationSize::unknown();
1976 // If the loop iterates a fixed number of times, we can refine the access
1977 // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1978 if (const SCEVConstant
*BECst
= dyn_cast
<SCEVConstant
>(BECount
))
1979 AccessSize
= LocationSize::precise((BECst
->getValue()->getZExtValue() + 1) *
1982 // TODO: For this to be really effective, we have to dive into the pointer
1983 // operand in the store. Store to &A[i] of 100 will always return may alias
1984 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1985 // which will then no-alias a store to &A[100].
1986 MemoryLocation
StoreLoc(Ptr
, AccessSize
);
1988 for (auto *B
: L
->blocks())
1990 if (Ignored
.count(&I
) == 0 &&
1992 intersectModRef(AA
.getModRefInfo(&I
, StoreLoc
), Access
)))
1998 void HexagonLoopIdiomRecognize::collectStores(Loop
*CurLoop
, BasicBlock
*BB
,
1999 SmallVectorImpl
<StoreInst
*> &Stores
) {
2001 for (Instruction
&I
: *BB
)
2002 if (StoreInst
*SI
= dyn_cast
<StoreInst
>(&I
))
2003 if (isLegalStore(CurLoop
, SI
))
2004 Stores
.push_back(SI
);
2007 bool HexagonLoopIdiomRecognize::processCopyingStore(Loop
*CurLoop
,
2008 StoreInst
*SI
, const SCEV
*BECount
) {
2009 assert((SI
->isSimple() || (SI
->isVolatile() && HexagonVolatileMemcpy
)) &&
2010 "Expected only non-volatile stores, or Hexagon-specific memcpy"
2011 "to volatile destination.");
2013 Value
*StorePtr
= SI
->getPointerOperand();
2014 auto *StoreEv
= cast
<SCEVAddRecExpr
>(SE
->getSCEV(StorePtr
));
2015 unsigned Stride
= getSCEVStride(StoreEv
);
2016 unsigned StoreSize
= DL
->getTypeStoreSize(SI
->getValueOperand()->getType());
2017 if (Stride
!= StoreSize
)
2020 // See if the pointer expression is an AddRec like {base,+,1} on the current
2021 // loop, which indicates a strided load. If we have something else, it's a
2022 // random load we can't handle.
2023 LoadInst
*LI
= dyn_cast
<LoadInst
>(SI
->getValueOperand());
2024 auto *LoadEv
= cast
<SCEVAddRecExpr
>(SE
->getSCEV(LI
->getPointerOperand()));
2026 // The trip count of the loop and the base pointer of the addrec SCEV is
2027 // guaranteed to be loop invariant, which means that it should dominate the
2028 // header. This allows us to insert code for it in the preheader.
2029 BasicBlock
*Preheader
= CurLoop
->getLoopPreheader();
2030 Instruction
*ExpPt
= Preheader
->getTerminator();
2031 IRBuilder
<> Builder(ExpPt
);
2032 SCEVExpander
Expander(*SE
, *DL
, "hexagon-loop-idiom");
2034 Type
*IntPtrTy
= Builder
.getIntPtrTy(*DL
, SI
->getPointerAddressSpace());
2036 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
2037 // this into a memcpy/memmove in the loop preheader now if we want. However,
2038 // this would be unsafe to do if there is anything else in the loop that may
2039 // read or write the memory region we're storing to. For memcpy, this
2040 // includes the load that feeds the stores. Check for an alias by generating
2041 // the base address and checking everything.
2042 Value
*StoreBasePtr
= Expander
.expandCodeFor(StoreEv
->getStart(),
2043 Builder
.getInt8PtrTy(SI
->getPointerAddressSpace()), ExpPt
);
2044 Value
*LoadBasePtr
= nullptr;
2046 bool Overlap
= false;
2047 bool DestVolatile
= SI
->isVolatile();
2048 Type
*BECountTy
= BECount
->getType();
2051 // The trip count must fit in i32, since it is the type of the "num_words"
2052 // argument to hexagon_memcpy_forward_vp4cp4n2.
2053 if (StoreSize
!= 4 || DL
->getTypeSizeInBits(BECountTy
) > 32) {
2055 // If we generated new code for the base pointer, clean up.
2057 if (StoreBasePtr
&& (LoadBasePtr
!= StoreBasePtr
)) {
2058 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr
, TLI
);
2059 StoreBasePtr
= nullptr;
2062 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr
, TLI
);
2063 LoadBasePtr
= nullptr;
2069 SmallPtrSet
<Instruction
*, 2> Ignore1
;
2071 if (mayLoopAccessLocation(StoreBasePtr
, ModRefInfo::ModRef
, CurLoop
, BECount
,
2072 StoreSize
, *AA
, Ignore1
)) {
2073 // Check if the load is the offending instruction.
2075 if (mayLoopAccessLocation(StoreBasePtr
, ModRefInfo::ModRef
, CurLoop
,
2076 BECount
, StoreSize
, *AA
, Ignore1
)) {
2077 // Still bad. Nothing we can do.
2078 goto CleanupAndExit
;
2080 // It worked with the load ignored.
2085 if (DisableMemcpyIdiom
|| !HasMemcpy
)
2086 goto CleanupAndExit
;
2088 // Don't generate memmove if this function will be inlined. This is
2089 // because the caller will undergo this transformation after inlining.
2090 Function
*Func
= CurLoop
->getHeader()->getParent();
2091 if (Func
->hasFnAttribute(Attribute::AlwaysInline
))
2092 goto CleanupAndExit
;
2094 // In case of a memmove, the call to memmove will be executed instead
2095 // of the loop, so we need to make sure that there is nothing else in
2096 // the loop than the load, store and instructions that these two depend
2098 SmallVector
<Instruction
*,2> Insts
;
2099 Insts
.push_back(SI
);
2100 Insts
.push_back(LI
);
2101 if (!coverLoop(CurLoop
, Insts
))
2102 goto CleanupAndExit
;
2104 if (DisableMemmoveIdiom
|| !HasMemmove
)
2105 goto CleanupAndExit
;
2106 bool IsNested
= CurLoop
->getParentLoop() != nullptr;
2107 if (IsNested
&& OnlyNonNestedMemmove
)
2108 goto CleanupAndExit
;
2111 // For a memcpy, we have to make sure that the input array is not being
2112 // mutated by the loop.
2113 LoadBasePtr
= Expander
.expandCodeFor(LoadEv
->getStart(),
2114 Builder
.getInt8PtrTy(LI
->getPointerAddressSpace()), ExpPt
);
2116 SmallPtrSet
<Instruction
*, 2> Ignore2
;
2118 if (mayLoopAccessLocation(LoadBasePtr
, ModRefInfo::Mod
, CurLoop
, BECount
,
2119 StoreSize
, *AA
, Ignore2
))
2120 goto CleanupAndExit
;
2122 // Check the stride.
2123 bool StridePos
= getSCEVStride(LoadEv
) >= 0;
2125 // Currently, the volatile memcpy only emulates traversing memory forward.
2126 if (!StridePos
&& DestVolatile
)
2127 goto CleanupAndExit
;
2129 bool RuntimeCheck
= (Overlap
|| DestVolatile
);
2133 // The runtime check needs a single exit block.
2134 SmallVector
<BasicBlock
*, 8> ExitBlocks
;
2135 CurLoop
->getUniqueExitBlocks(ExitBlocks
);
2136 if (ExitBlocks
.size() != 1)
2137 goto CleanupAndExit
;
2138 ExitB
= ExitBlocks
[0];
2141 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2142 // pointer size if it isn't already.
2143 LLVMContext
&Ctx
= SI
->getContext();
2144 BECount
= SE
->getTruncateOrZeroExtend(BECount
, IntPtrTy
);
2145 DebugLoc DLoc
= SI
->getDebugLoc();
2147 const SCEV
*NumBytesS
=
2148 SE
->getAddExpr(BECount
, SE
->getOne(IntPtrTy
), SCEV::FlagNUW
);
2150 NumBytesS
= SE
->getMulExpr(NumBytesS
, SE
->getConstant(IntPtrTy
, StoreSize
),
2152 Value
*NumBytes
= Expander
.expandCodeFor(NumBytesS
, IntPtrTy
, ExpPt
);
2153 if (Instruction
*In
= dyn_cast
<Instruction
>(NumBytes
))
2154 if (Value
*Simp
= SimplifyInstruction(In
, {*DL
, TLI
, DT
}))
2160 unsigned Threshold
= RuntimeMemSizeThreshold
;
2161 if (ConstantInt
*CI
= dyn_cast
<ConstantInt
>(NumBytes
)) {
2162 uint64_t C
= CI
->getZExtValue();
2163 if (Threshold
!= 0 && C
< Threshold
)
2164 goto CleanupAndExit
;
2165 if (C
< CompileTimeMemSizeThreshold
)
2166 goto CleanupAndExit
;
2169 BasicBlock
*Header
= CurLoop
->getHeader();
2170 Function
*Func
= Header
->getParent();
2171 Loop
*ParentL
= LF
->getLoopFor(Preheader
);
2172 StringRef HeaderName
= Header
->getName();
2174 // Create a new (empty) preheader, and update the PHI nodes in the
2175 // header to use the new preheader.
2176 BasicBlock
*NewPreheader
= BasicBlock::Create(Ctx
, HeaderName
+".rtli.ph",
2179 ParentL
->addBasicBlockToLoop(NewPreheader
, *LF
);
2180 IRBuilder
<>(NewPreheader
).CreateBr(Header
);
2181 for (auto &In
: *Header
) {
2182 PHINode
*PN
= dyn_cast
<PHINode
>(&In
);
2185 int bx
= PN
->getBasicBlockIndex(Preheader
);
2187 PN
->setIncomingBlock(bx
, NewPreheader
);
2189 DT
->addNewBlock(NewPreheader
, Preheader
);
2190 DT
->changeImmediateDominator(Header
, NewPreheader
);
2192 // Check for safe conditions to execute memmove.
2193 // If stride is positive, copying things from higher to lower addresses
2194 // is equivalent to memmove. For negative stride, it's the other way
2195 // around. Copying forward in memory with positive stride may not be
2196 // same as memmove since we may be copying values that we just stored
2197 // in some previous iteration.
2198 Value
*LA
= Builder
.CreatePtrToInt(LoadBasePtr
, IntPtrTy
);
2199 Value
*SA
= Builder
.CreatePtrToInt(StoreBasePtr
, IntPtrTy
);
2200 Value
*LowA
= StridePos
? SA
: LA
;
2201 Value
*HighA
= StridePos
? LA
: SA
;
2202 Value
*CmpA
= Builder
.CreateICmpULT(LowA
, HighA
);
2205 // Check for distance between pointers. Since the case LowA < HighA
2206 // is checked for above, assume LowA >= HighA.
2207 Value
*Dist
= Builder
.CreateSub(LowA
, HighA
);
2208 Value
*CmpD
= Builder
.CreateICmpSLE(NumBytes
, Dist
);
2209 Value
*CmpEither
= Builder
.CreateOr(Cond
, CmpD
);
2212 if (Threshold
!= 0) {
2213 Type
*Ty
= NumBytes
->getType();
2214 Value
*Thr
= ConstantInt::get(Ty
, Threshold
);
2215 Value
*CmpB
= Builder
.CreateICmpULT(Thr
, NumBytes
);
2216 Value
*CmpBoth
= Builder
.CreateAnd(Cond
, CmpB
);
2219 BasicBlock
*MemmoveB
= BasicBlock::Create(Ctx
, Header
->getName()+".rtli",
2220 Func
, NewPreheader
);
2222 ParentL
->addBasicBlockToLoop(MemmoveB
, *LF
);
2223 Instruction
*OldT
= Preheader
->getTerminator();
2224 Builder
.CreateCondBr(Cond
, MemmoveB
, NewPreheader
);
2225 OldT
->eraseFromParent();
2226 Preheader
->setName(Preheader
->getName()+".old");
2227 DT
->addNewBlock(MemmoveB
, Preheader
);
2228 // Find the new immediate dominator of the exit block.
2229 BasicBlock
*ExitD
= Preheader
;
2230 for (auto PI
= pred_begin(ExitB
), PE
= pred_end(ExitB
); PI
!= PE
; ++PI
) {
2231 BasicBlock
*PB
= *PI
;
2232 ExitD
= DT
->findNearestCommonDominator(ExitD
, PB
);
2236 // If the prior immediate dominator of ExitB was dominated by the
2237 // old preheader, then the old preheader becomes the new immediate
2238 // dominator. Otherwise don't change anything (because the newly
2239 // added blocks are dominated by the old preheader).
2240 if (ExitD
&& DT
->dominates(Preheader
, ExitD
)) {
2241 DomTreeNode
*BN
= DT
->getNode(ExitB
);
2242 DomTreeNode
*DN
= DT
->getNode(ExitD
);
2246 // Add a call to memmove to the conditional block.
2247 IRBuilder
<> CondBuilder(MemmoveB
);
2248 CondBuilder
.CreateBr(ExitB
);
2249 CondBuilder
.SetInsertPoint(MemmoveB
->getTerminator());
2252 Type
*Int32Ty
= Type::getInt32Ty(Ctx
);
2253 Type
*Int32PtrTy
= Type::getInt32PtrTy(Ctx
);
2254 Type
*VoidTy
= Type::getVoidTy(Ctx
);
2255 Module
*M
= Func
->getParent();
2256 FunctionCallee Fn
= M
->getOrInsertFunction(
2257 HexagonVolatileMemcpyName
, VoidTy
, Int32PtrTy
, Int32PtrTy
, Int32Ty
);
2259 const SCEV
*OneS
= SE
->getConstant(Int32Ty
, 1);
2260 const SCEV
*BECount32
= SE
->getTruncateOrZeroExtend(BECount
, Int32Ty
);
2261 const SCEV
*NumWordsS
= SE
->getAddExpr(BECount32
, OneS
, SCEV::FlagNUW
);
2262 Value
*NumWords
= Expander
.expandCodeFor(NumWordsS
, Int32Ty
,
2263 MemmoveB
->getTerminator());
2264 if (Instruction
*In
= dyn_cast
<Instruction
>(NumWords
))
2265 if (Value
*Simp
= SimplifyInstruction(In
, {*DL
, TLI
, DT
}))
2268 Value
*Op0
= (StoreBasePtr
->getType() == Int32PtrTy
)
2270 : CondBuilder
.CreateBitCast(StoreBasePtr
, Int32PtrTy
);
2271 Value
*Op1
= (LoadBasePtr
->getType() == Int32PtrTy
)
2273 : CondBuilder
.CreateBitCast(LoadBasePtr
, Int32PtrTy
);
2274 NewCall
= CondBuilder
.CreateCall(Fn
, {Op0
, Op1
, NumWords
});
2276 NewCall
= CondBuilder
.CreateMemMove(StoreBasePtr
, SI
->getAlignment(),
2277 LoadBasePtr
, LI
->getAlignment(),
2281 NewCall
= Builder
.CreateMemCpy(StoreBasePtr
, SI
->getAlignment(),
2282 LoadBasePtr
, LI
->getAlignment(),
2284 // Okay, the memcpy has been formed. Zap the original store and
2285 // anything that feeds into it.
2286 RecursivelyDeleteTriviallyDeadInstructions(SI
, TLI
);
2289 NewCall
->setDebugLoc(DLoc
);
2291 LLVM_DEBUG(dbgs() << " Formed " << (Overlap
? "memmove: " : "memcpy: ")
2293 << " from load ptr=" << *LoadEv
<< " at: " << *LI
<< "\n"
2294 << " from store ptr=" << *StoreEv
<< " at: " << *SI
2300 // Check if the instructions in Insts, together with their dependencies
2301 // cover the loop in the sense that the loop could be safely eliminated once
2302 // the instructions in Insts are removed.
2303 bool HexagonLoopIdiomRecognize::coverLoop(Loop
*L
,
2304 SmallVectorImpl
<Instruction
*> &Insts
) const {
2305 SmallSet
<BasicBlock
*,8> LoopBlocks
;
2306 for (auto *B
: L
->blocks())
2307 LoopBlocks
.insert(B
);
2309 SetVector
<Instruction
*> Worklist(Insts
.begin(), Insts
.end());
2311 // Collect all instructions from the loop that the instructions in Insts
2312 // depend on (plus their dependencies, etc.). These instructions will
2313 // constitute the expression trees that feed those in Insts, but the trees
2314 // will be limited only to instructions contained in the loop.
2315 for (unsigned i
= 0; i
< Worklist
.size(); ++i
) {
2316 Instruction
*In
= Worklist
[i
];
2317 for (auto I
= In
->op_begin(), E
= In
->op_end(); I
!= E
; ++I
) {
2318 Instruction
*OpI
= dyn_cast
<Instruction
>(I
);
2321 BasicBlock
*PB
= OpI
->getParent();
2322 if (!LoopBlocks
.count(PB
))
2324 Worklist
.insert(OpI
);
2328 // Scan all instructions in the loop, if any of them have a user outside
2329 // of the loop, or outside of the expressions collected above, then either
2330 // the loop has a side-effect visible outside of it, or there are
2331 // instructions in it that are not involved in the original set Insts.
2332 for (auto *B
: L
->blocks()) {
2333 for (auto &In
: *B
) {
2334 if (isa
<BranchInst
>(In
) || isa
<DbgInfoIntrinsic
>(In
))
2336 if (!Worklist
.count(&In
) && In
.mayHaveSideEffects())
2338 for (const auto &K
: In
.users()) {
2339 Instruction
*UseI
= dyn_cast
<Instruction
>(K
);
2342 BasicBlock
*UseB
= UseI
->getParent();
2343 if (LF
->getLoopFor(UseB
) != L
)
2352 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
2353 /// with the specified backedge count. This block is known to be in the current
2354 /// loop and not in any subloops.
2355 bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop
*CurLoop
, BasicBlock
*BB
,
2356 const SCEV
*BECount
, SmallVectorImpl
<BasicBlock
*> &ExitBlocks
) {
2357 // We can only promote stores in this block if they are unconditionally
2358 // executed in the loop. For a block to be unconditionally executed, it has
2359 // to dominate all the exit blocks of the loop. Verify this now.
2360 auto DominatedByBB
= [this,BB
] (BasicBlock
*EB
) -> bool {
2361 return DT
->dominates(BB
, EB
);
2363 if (!all_of(ExitBlocks
, DominatedByBB
))
2366 bool MadeChange
= false;
2367 // Look for store instructions, which may be optimized to memset/memcpy.
2368 SmallVector
<StoreInst
*,8> Stores
;
2369 collectStores(CurLoop
, BB
, Stores
);
2371 // Optimize the store into a memcpy, if it feeds an similarly strided load.
2372 for (auto &SI
: Stores
)
2373 MadeChange
|= processCopyingStore(CurLoop
, SI
, BECount
);
2378 bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop
*L
) {
2379 PolynomialMultiplyRecognize
PMR(L
, *DL
, *DT
, *TLI
, *SE
);
2380 if (PMR
.recognize())
2383 if (!HasMemcpy
&& !HasMemmove
)
2386 const SCEV
*BECount
= SE
->getBackedgeTakenCount(L
);
2387 assert(!isa
<SCEVCouldNotCompute
>(BECount
) &&
2388 "runOnCountableLoop() called on a loop without a predictable"
2389 "backedge-taken count");
2391 SmallVector
<BasicBlock
*, 8> ExitBlocks
;
2392 L
->getUniqueExitBlocks(ExitBlocks
);
2394 bool Changed
= false;
2396 // Scan all the blocks in the loop that are not in subloops.
2397 for (auto *BB
: L
->getBlocks()) {
2398 // Ignore blocks in subloops.
2399 if (LF
->getLoopFor(BB
) != L
)
2401 Changed
|= runOnLoopBlock(L
, BB
, BECount
, ExitBlocks
);
2407 bool HexagonLoopIdiomRecognize::runOnLoop(Loop
*L
, LPPassManager
&LPM
) {
2408 const Module
&M
= *L
->getHeader()->getParent()->getParent();
2409 if (Triple(M
.getTargetTriple()).getArch() != Triple::hexagon
)
2415 // If the loop could not be converted to canonical form, it must have an
2416 // indirectbr in it, just give up.
2417 if (!L
->getLoopPreheader())
2420 // Disable loop idiom recognition if the function's name is a common idiom.
2421 StringRef Name
= L
->getHeader()->getParent()->getName();
2422 if (Name
== "memset" || Name
== "memcpy" || Name
== "memmove")
2425 AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
2426 DL
= &L
->getHeader()->getModule()->getDataLayout();
2427 DT
= &getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
2428 LF
= &getAnalysis
<LoopInfoWrapperPass
>().getLoopInfo();
2429 TLI
= &getAnalysis
<TargetLibraryInfoWrapperPass
>().getTLI(
2430 *L
->getHeader()->getParent());
2431 SE
= &getAnalysis
<ScalarEvolutionWrapperPass
>().getSE();
2433 HasMemcpy
= TLI
->has(LibFunc_memcpy
);
2434 HasMemmove
= TLI
->has(LibFunc_memmove
);
2436 if (SE
->hasLoopInvariantBackedgeTakenCount(L
))
2437 return runOnCountableLoop(L
);
2441 Pass
*llvm::createHexagonLoopIdiomPass() {
2442 return new HexagonLoopIdiomRecognize();