1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file contains an optimization for div and rem on architectures that
10 // execute short instructions significantly faster than longer instructions.
11 // For example, on Intel Atom 32-bit divides are slow enough that during
12 // runtime it is profitable to check the value of the operands, and if they are
13 // positive and less than 256 use an unsigned 8-bit divide.
15 //===----------------------------------------------------------------------===//
17 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include "llvm/ADT/None.h"
20 #include "llvm/ADT/Optional.h"
21 #include "llvm/ADT/STLExtras.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Transforms/Utils/Local.h"
24 #include "llvm/Analysis/ValueTracking.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/Instruction.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/Module.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/Value.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/KnownBits.h"
42 #define DEBUG_TYPE "bypass-slow-division"
50 QuotRemPair(Value
*InQuotient
, Value
*InRemainder
)
51 : Quotient(InQuotient
), Remainder(InRemainder
) {}
54 /// A quotient and remainder, plus a BB from which they logically "originate".
55 /// If you use Quotient or Remainder in a Phi node, you should use BB as its
56 /// corresponding predecessor.
57 struct QuotRemWithBB
{
58 BasicBlock
*BB
= nullptr;
59 Value
*Quotient
= nullptr;
60 Value
*Remainder
= nullptr;
63 using DivCacheTy
= DenseMap
<DivRemMapKey
, QuotRemPair
>;
64 using BypassWidthsTy
= DenseMap
<unsigned, unsigned>;
65 using VisitedSetTy
= SmallPtrSet
<Instruction
*, 4>;
68 /// Operand definitely fits into BypassType. No runtime checks are needed.
70 /// A runtime check is required, as value range is unknown.
72 /// Operand is unlikely to fit into BypassType. The bypassing should be
77 class FastDivInsertionTask
{
78 bool IsValidTask
= false;
79 Instruction
*SlowDivOrRem
= nullptr;
80 IntegerType
*BypassType
= nullptr;
81 BasicBlock
*MainBB
= nullptr;
83 bool isHashLikeValue(Value
*V
, VisitedSetTy
&Visited
);
84 ValueRange
getValueRange(Value
*Op
, VisitedSetTy
&Visited
);
85 QuotRemWithBB
createSlowBB(BasicBlock
*Successor
);
86 QuotRemWithBB
createFastBB(BasicBlock
*Successor
);
87 QuotRemPair
createDivRemPhiNodes(QuotRemWithBB
&LHS
, QuotRemWithBB
&RHS
,
89 Value
*insertOperandRuntimeCheck(Value
*Op1
, Value
*Op2
);
90 Optional
<QuotRemPair
> insertFastDivAndRem();
93 return SlowDivOrRem
->getOpcode() == Instruction::SDiv
||
94 SlowDivOrRem
->getOpcode() == Instruction::SRem
;
98 return SlowDivOrRem
->getOpcode() == Instruction::SDiv
||
99 SlowDivOrRem
->getOpcode() == Instruction::UDiv
;
102 Type
*getSlowType() { return SlowDivOrRem
->getType(); }
105 FastDivInsertionTask(Instruction
*I
, const BypassWidthsTy
&BypassWidths
);
107 Value
*getReplacement(DivCacheTy
&Cache
);
110 } // end anonymous namespace
112 FastDivInsertionTask::FastDivInsertionTask(Instruction
*I
,
113 const BypassWidthsTy
&BypassWidths
) {
114 switch (I
->getOpcode()) {
115 case Instruction::UDiv
:
116 case Instruction::SDiv
:
117 case Instruction::URem
:
118 case Instruction::SRem
:
122 // I is not a div/rem operation.
126 // Skip division on vector types. Only optimize integer instructions.
127 IntegerType
*SlowType
= dyn_cast
<IntegerType
>(SlowDivOrRem
->getType());
131 // Skip if this bitwidth is not bypassed.
132 auto BI
= BypassWidths
.find(SlowType
->getBitWidth());
133 if (BI
== BypassWidths
.end())
136 // Get type for div/rem instruction with bypass bitwidth.
137 IntegerType
*BT
= IntegerType::get(I
->getContext(), BI
->second
);
140 // The original basic block.
141 MainBB
= I
->getParent();
143 // The instruction is indeed a slow div or rem operation.
147 /// Reuses previously-computed dividend or remainder from the current BB if
148 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
149 /// perform the optimization and caches the resulting dividend and remainder.
150 /// If no replacement can be generated, nullptr is returned.
151 Value
*FastDivInsertionTask::getReplacement(DivCacheTy
&Cache
) {
152 // First, make sure that the task is valid.
156 // Then, look for a value in Cache.
157 Value
*Dividend
= SlowDivOrRem
->getOperand(0);
158 Value
*Divisor
= SlowDivOrRem
->getOperand(1);
159 DivRemMapKey
Key(isSignedOp(), Dividend
, Divisor
);
160 auto CacheI
= Cache
.find(Key
);
162 if (CacheI
== Cache
.end()) {
163 // If previous instance does not exist, try to insert fast div.
164 Optional
<QuotRemPair
> OptResult
= insertFastDivAndRem();
165 // Bail out if insertFastDivAndRem has failed.
168 CacheI
= Cache
.insert({Key
, *OptResult
}).first
;
171 QuotRemPair
&Value
= CacheI
->second
;
172 return isDivisionOp() ? Value
.Quotient
: Value
.Remainder
;
175 /// Check if a value looks like a hash.
177 /// The routine is expected to detect values computed using the most common hash
178 /// algorithms. Typically, hash computations end with one of the following
181 /// 1) MUL with a constant wider than BypassType
182 /// 2) XOR instruction
184 /// And even if we are wrong and the value is not a hash, it is still quite
185 /// unlikely that such values will fit into BypassType.
187 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
188 /// It is implemented as a depth-first search for values that look neither long
190 bool FastDivInsertionTask::isHashLikeValue(Value
*V
, VisitedSetTy
&Visited
) {
191 Instruction
*I
= dyn_cast
<Instruction
>(V
);
195 switch (I
->getOpcode()) {
196 case Instruction::Xor
:
198 case Instruction::Mul
: {
199 // After Constant Hoisting pass, long constants may be represented as
200 // bitcast instructions. As a result, some constants may look like an
201 // instruction at first, and an additional check is necessary to find out if
202 // an operand is actually a constant.
203 Value
*Op1
= I
->getOperand(1);
204 ConstantInt
*C
= dyn_cast
<ConstantInt
>(Op1
);
205 if (!C
&& isa
<BitCastInst
>(Op1
))
206 C
= dyn_cast
<ConstantInt
>(cast
<BitCastInst
>(Op1
)->getOperand(0));
207 return C
&& C
->getValue().getMinSignedBits() > BypassType
->getBitWidth();
209 case Instruction::PHI
:
210 // Stop IR traversal in case of a crazy input code. This limits recursion
212 if (Visited
.size() >= 16)
214 // Do not visit nodes that have been visited already. We return true because
215 // it means that we couldn't find any value that doesn't look hash-like.
216 if (!Visited
.insert(I
).second
)
218 return llvm::all_of(cast
<PHINode
>(I
)->incoming_values(), [&](Value
*V
) {
219 // Ignore undef values as they probably don't affect the division
221 return getValueRange(V
, Visited
) == VALRNG_LIKELY_LONG
||
229 /// Check if an integer value fits into our bypass type.
230 ValueRange
FastDivInsertionTask::getValueRange(Value
*V
,
231 VisitedSetTy
&Visited
) {
232 unsigned ShortLen
= BypassType
->getBitWidth();
233 unsigned LongLen
= V
->getType()->getIntegerBitWidth();
235 assert(LongLen
> ShortLen
&& "Value type must be wider than BypassType");
236 unsigned HiBits
= LongLen
- ShortLen
;
238 const DataLayout
&DL
= SlowDivOrRem
->getModule()->getDataLayout();
239 KnownBits
Known(LongLen
);
241 computeKnownBits(V
, Known
, DL
);
243 if (Known
.countMinLeadingZeros() >= HiBits
)
244 return VALRNG_KNOWN_SHORT
;
246 if (Known
.countMaxLeadingZeros() < HiBits
)
247 return VALRNG_LIKELY_LONG
;
249 // Long integer divisions are often used in hashtable implementations. It's
250 // not worth bypassing such divisions because hash values are extremely
251 // unlikely to have enough leading zeros. The call below tries to detect
252 // values that are unlikely to fit BypassType (including hashes).
253 if (isHashLikeValue(V
, Visited
))
254 return VALRNG_LIKELY_LONG
;
256 return VALRNG_UNKNOWN
;
259 /// Add new basic block for slow div and rem operations and put it before
261 QuotRemWithBB
FastDivInsertionTask::createSlowBB(BasicBlock
*SuccessorBB
) {
262 QuotRemWithBB DivRemPair
;
263 DivRemPair
.BB
= BasicBlock::Create(MainBB
->getParent()->getContext(), "",
264 MainBB
->getParent(), SuccessorBB
);
265 IRBuilder
<> Builder(DivRemPair
.BB
, DivRemPair
.BB
->begin());
266 Builder
.SetCurrentDebugLocation(SlowDivOrRem
->getDebugLoc());
268 Value
*Dividend
= SlowDivOrRem
->getOperand(0);
269 Value
*Divisor
= SlowDivOrRem
->getOperand(1);
272 DivRemPair
.Quotient
= Builder
.CreateSDiv(Dividend
, Divisor
);
273 DivRemPair
.Remainder
= Builder
.CreateSRem(Dividend
, Divisor
);
275 DivRemPair
.Quotient
= Builder
.CreateUDiv(Dividend
, Divisor
);
276 DivRemPair
.Remainder
= Builder
.CreateURem(Dividend
, Divisor
);
279 Builder
.CreateBr(SuccessorBB
);
283 /// Add new basic block for fast div and rem operations and put it before
285 QuotRemWithBB
FastDivInsertionTask::createFastBB(BasicBlock
*SuccessorBB
) {
286 QuotRemWithBB DivRemPair
;
287 DivRemPair
.BB
= BasicBlock::Create(MainBB
->getParent()->getContext(), "",
288 MainBB
->getParent(), SuccessorBB
);
289 IRBuilder
<> Builder(DivRemPair
.BB
, DivRemPair
.BB
->begin());
290 Builder
.SetCurrentDebugLocation(SlowDivOrRem
->getDebugLoc());
292 Value
*Dividend
= SlowDivOrRem
->getOperand(0);
293 Value
*Divisor
= SlowDivOrRem
->getOperand(1);
294 Value
*ShortDivisorV
=
295 Builder
.CreateCast(Instruction::Trunc
, Divisor
, BypassType
);
296 Value
*ShortDividendV
=
297 Builder
.CreateCast(Instruction::Trunc
, Dividend
, BypassType
);
299 // udiv/urem because this optimization only handles positive numbers.
300 Value
*ShortQV
= Builder
.CreateUDiv(ShortDividendV
, ShortDivisorV
);
301 Value
*ShortRV
= Builder
.CreateURem(ShortDividendV
, ShortDivisorV
);
302 DivRemPair
.Quotient
=
303 Builder
.CreateCast(Instruction::ZExt
, ShortQV
, getSlowType());
304 DivRemPair
.Remainder
=
305 Builder
.CreateCast(Instruction::ZExt
, ShortRV
, getSlowType());
306 Builder
.CreateBr(SuccessorBB
);
311 /// Creates Phi nodes for result of Div and Rem.
312 QuotRemPair
FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB
&LHS
,
315 IRBuilder
<> Builder(PhiBB
, PhiBB
->begin());
316 Builder
.SetCurrentDebugLocation(SlowDivOrRem
->getDebugLoc());
317 PHINode
*QuoPhi
= Builder
.CreatePHI(getSlowType(), 2);
318 QuoPhi
->addIncoming(LHS
.Quotient
, LHS
.BB
);
319 QuoPhi
->addIncoming(RHS
.Quotient
, RHS
.BB
);
320 PHINode
*RemPhi
= Builder
.CreatePHI(getSlowType(), 2);
321 RemPhi
->addIncoming(LHS
.Remainder
, LHS
.BB
);
322 RemPhi
->addIncoming(RHS
.Remainder
, RHS
.BB
);
323 return QuotRemPair(QuoPhi
, RemPhi
);
326 /// Creates a runtime check to test whether both the divisor and dividend fit
327 /// into BypassType. The check is inserted at the end of MainBB. True return
328 /// value means that the operands fit. Either of the operands may be NULL if it
329 /// doesn't need a runtime check.
330 Value
*FastDivInsertionTask::insertOperandRuntimeCheck(Value
*Op1
, Value
*Op2
) {
331 assert((Op1
|| Op2
) && "Nothing to check");
332 IRBuilder
<> Builder(MainBB
, MainBB
->end());
333 Builder
.SetCurrentDebugLocation(SlowDivOrRem
->getDebugLoc());
337 OrV
= Builder
.CreateOr(Op1
, Op2
);
339 OrV
= Op1
? Op1
: Op2
;
341 // BitMask is inverted to check if the operands are
342 // larger than the bypass type
343 uint64_t BitMask
= ~BypassType
->getBitMask();
344 Value
*AndV
= Builder
.CreateAnd(OrV
, BitMask
);
346 // Compare operand values
347 Value
*ZeroV
= ConstantInt::getSigned(getSlowType(), 0);
348 return Builder
.CreateICmpEQ(AndV
, ZeroV
);
351 /// Substitutes the div/rem instruction with code that checks the value of the
352 /// operands and uses a shorter-faster div/rem instruction when possible.
353 Optional
<QuotRemPair
> FastDivInsertionTask::insertFastDivAndRem() {
354 Value
*Dividend
= SlowDivOrRem
->getOperand(0);
355 Value
*Divisor
= SlowDivOrRem
->getOperand(1);
358 ValueRange DividendRange
= getValueRange(Dividend
, SetL
);
359 if (DividendRange
== VALRNG_LIKELY_LONG
)
363 ValueRange DivisorRange
= getValueRange(Divisor
, SetR
);
364 if (DivisorRange
== VALRNG_LIKELY_LONG
)
367 bool DividendShort
= (DividendRange
== VALRNG_KNOWN_SHORT
);
368 bool DivisorShort
= (DivisorRange
== VALRNG_KNOWN_SHORT
);
370 if (DividendShort
&& DivisorShort
) {
371 // If both operands are known to be short then just replace the long
372 // division with a short one in-place. Since we're not introducing control
373 // flow in this case, narrowing the division is always a win, even if the
374 // divisor is a constant (and will later get replaced by a multiplication).
376 IRBuilder
<> Builder(SlowDivOrRem
);
377 Value
*TruncDividend
= Builder
.CreateTrunc(Dividend
, BypassType
);
378 Value
*TruncDivisor
= Builder
.CreateTrunc(Divisor
, BypassType
);
379 Value
*TruncDiv
= Builder
.CreateUDiv(TruncDividend
, TruncDivisor
);
380 Value
*TruncRem
= Builder
.CreateURem(TruncDividend
, TruncDivisor
);
381 Value
*ExtDiv
= Builder
.CreateZExt(TruncDiv
, getSlowType());
382 Value
*ExtRem
= Builder
.CreateZExt(TruncRem
, getSlowType());
383 return QuotRemPair(ExtDiv
, ExtRem
);
386 if (isa
<ConstantInt
>(Divisor
)) {
387 // If the divisor is not a constant, DAGCombiner will convert it to a
388 // multiplication by a magic constant. It isn't clear if it is worth
389 // introducing control flow to get a narrower multiply.
393 // After Constant Hoisting pass, long constants may be represented as
394 // bitcast instructions. As a result, some constants may look like an
395 // instruction at first, and an additional check is necessary to find out if
396 // an operand is actually a constant.
397 if (auto *BCI
= dyn_cast
<BitCastInst
>(Divisor
))
398 if (BCI
->getParent() == SlowDivOrRem
->getParent() &&
399 isa
<ConstantInt
>(BCI
->getOperand(0)))
402 IRBuilder
<> Builder(MainBB
, MainBB
->end());
403 Builder
.SetCurrentDebugLocation(SlowDivOrRem
->getDebugLoc());
405 if (DividendShort
&& !isSignedOp()) {
406 // If the division is unsigned and Dividend is known to be short, then
408 // 1) Divisor is less or equal to Dividend, and the result can be computed
409 // with a short division.
410 // 2) Divisor is greater than Dividend. In this case, no division is needed
411 // at all: The quotient is 0 and the remainder is equal to Dividend.
413 // So instead of checking at runtime whether Divisor fits into BypassType,
414 // we emit a runtime check to differentiate between these two cases. This
415 // lets us entirely avoid a long div.
417 // Split the basic block before the div/rem.
418 BasicBlock
*SuccessorBB
= MainBB
->splitBasicBlock(SlowDivOrRem
);
419 // Remove the unconditional branch from MainBB to SuccessorBB.
420 MainBB
->getInstList().back().eraseFromParent();
423 Long
.Quotient
= ConstantInt::get(getSlowType(), 0);
424 Long
.Remainder
= Dividend
;
425 QuotRemWithBB Fast
= createFastBB(SuccessorBB
);
426 QuotRemPair Result
= createDivRemPhiNodes(Fast
, Long
, SuccessorBB
);
427 Value
*CmpV
= Builder
.CreateICmpUGE(Dividend
, Divisor
);
428 Builder
.CreateCondBr(CmpV
, Fast
.BB
, SuccessorBB
);
431 // General case. Create both slow and fast div/rem pairs and choose one of
434 // Split the basic block before the div/rem.
435 BasicBlock
*SuccessorBB
= MainBB
->splitBasicBlock(SlowDivOrRem
);
436 // Remove the unconditional branch from MainBB to SuccessorBB.
437 MainBB
->getInstList().back().eraseFromParent();
438 QuotRemWithBB Fast
= createFastBB(SuccessorBB
);
439 QuotRemWithBB Slow
= createSlowBB(SuccessorBB
);
440 QuotRemPair Result
= createDivRemPhiNodes(Fast
, Slow
, SuccessorBB
);
441 Value
*CmpV
= insertOperandRuntimeCheck(DividendShort
? nullptr : Dividend
,
442 DivisorShort
? nullptr : Divisor
);
443 Builder
.CreateCondBr(CmpV
, Fast
.BB
, Slow
.BB
);
448 /// This optimization identifies DIV/REM instructions in a BB that can be
449 /// profitably bypassed and carried out with a shorter, faster divide.
450 bool llvm::bypassSlowDivision(BasicBlock
*BB
,
451 const BypassWidthsTy
&BypassWidths
) {
452 DivCacheTy PerBBDivCache
;
454 bool MadeChange
= false;
455 Instruction
*Next
= &*BB
->begin();
456 while (Next
!= nullptr) {
457 // We may add instructions immediately after I, but we want to skip over
459 Instruction
*I
= Next
;
460 Next
= Next
->getNextNode();
462 // Ignore dead code to save time and avoid bugs.
466 FastDivInsertionTask
Task(I
, BypassWidths
);
467 if (Value
*Replacement
= Task
.getReplacement(PerBBDivCache
)) {
468 I
->replaceAllUsesWith(Replacement
);
469 I
->eraseFromParent();
474 // Above we eagerly create divs and rems, as pairs, so that we can efficiently
475 // create divrem machine instructions. Now erase any unused divs / rems so we
476 // don't leave extra instructions sitting around.
477 for (auto &KV
: PerBBDivCache
)
478 for (Value
*V
: {KV
.second
.Quotient
, KV
.second
.Remainder
})
479 RecursivelyDeleteTriviallyDeadInstructions(V
);