//===----- ARMCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This pass inserts intrinsics to handle small types that would otherwise be
/// promoted during legalization. Here we can manually promote types or insert
/// intrinsics which can handle narrow types that aren't supported by the
/// register classes or selection patterns.
//
//===----------------------------------------------------------------------===//
#include "ARM.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <set>
#define DEBUG_TYPE "arm-codegenprepare"

using namespace llvm;

static cl::opt<bool>
DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true),
           cl::desc("Disable ARM specific CodeGenPrepare pass"));

static cl::opt<bool>
EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false),
          cl::desc("Use DSP instructions for scalar operations"));

static cl::opt<bool>
EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false),
                  cl::desc("Use DSP instructions for scalar operations "
                           "with immediate operands"));
// The goal of this pass is to enable more efficient code generation for
// operations on narrow types (i.e. types with < 32-bits) and this is a
// motivating IR code example:
//
//   define hidden i32 @cmp(i8 zeroext) {
//     %2 = add i8 %0, -49
//     %3 = icmp ult i8 %2, 3
//     ...
//   }
//
// The issue here is that i8 is type-legalized to i32 because i8 is not a
// legal type. Thus, arithmetic is done in integer-precision, but then the
// byte value is masked out as follows:
//
//   t19: i32 = add t4, Constant:i32<-49>
//   t24: i32 = and t19, Constant:i32<255>
//
// Consequently, we generate code like this:
//
//   cmp:
//     sub     r0, r0, #49
//     uxtb    r0, r0
//     cmp     r0, #3
//
// This shows that masking out the byte value results in generation of
// the UXTB instruction. This is not optimal as r0 already contains the byte
// value we need, and so instead we can just generate:
//
//   cmp:
//     sub     r0, r0, #49
//     cmp     r0, #3
//
// We achieve this by type promoting the IR to i32 like so for this example:
//
//   define i32 @cmp(i8 zeroext %c) {
//     %0 = zext i8 %c to i32
//     %c.off = add i32 %0, -49
//     %1 = icmp ult i32 %c.off, 3
//     ...
//   }
//
// For this to be valid and legal, we need to prove that the i32 add is
// producing the same value as the i8 addition, and that e.g. no overflow
// happens.
//
// A brief sketch of the algorithm and some terminology.
// We pattern match interesting IR patterns:
// - which have "sources": instructions producing narrow values (i8, i16), and
// - they have "sinks": instructions consuming these narrow values.
//
// We collect all instructions connecting sources and sinks in a worklist, so
// that we can mutate these instructions and perform type promotion when it is
// legal to do so.
namespace {
class IRPromoter {
  SmallPtrSet<Value*, 8> NewInsts;
  SmallPtrSet<Instruction*, 4> InstsToRemove;
  DenseMap<Value*, SmallVector<Type*, 4>> TruncTysMap;
  SmallPtrSet<Value*, 8> Promoted;
  Module *M = nullptr;
  LLVMContext &Ctx;
  // The type we promote to: always i32.
  IntegerType *ExtTy = nullptr;
  // The type of the value that the search began from, either i8 or i16.
  // This defines the max range of the values that we allow in the promoted
  // tree.
  IntegerType *OrigTy = nullptr;
  SmallPtrSetImpl<Value*> *Visited;
  SmallPtrSetImpl<Value*> *Sources;
  SmallPtrSetImpl<Instruction*> *Sinks;
  SmallPtrSetImpl<Instruction*> *SafeToPromote;

  void ReplaceAllUsersOfWith(Value *From, Value *To);
  void PrepareConstants(void);
  void ExtendSources(void);
  void ConvertTruncs(void);
  void PromoteTree(void);
  void TruncateSinks(void);
  void Cleanup(void);

public:
  IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
                          ExtTy(Type::getInt32Ty(Ctx)) { }

  void Mutate(Type *OrigTy,
              SmallPtrSetImpl<Value*> &Visited,
              SmallPtrSetImpl<Value*> &Sources,
              SmallPtrSetImpl<Instruction*> &Sinks,
              SmallPtrSetImpl<Instruction*> &SafeToPromote);
};
class ARMCodeGenPrepare : public FunctionPass {
  const ARMSubtarget *ST = nullptr;
  IRPromoter *Promoter = nullptr;
  std::set<Value*> AllVisited;
  SmallPtrSet<Instruction*, 8> SafeToPromote;

  bool isSafeOverflow(Instruction *I);
  bool isSupportedValue(Value *V);
  bool isLegalToPromote(Value *V);
  bool TryToPromote(Value *V);

public:
  static char ID;
  static unsigned TypeSize;
  Type *OrigTy = nullptr;

  ARMCodeGenPrepare() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
  }

  StringRef getPassName() const override { return "ARM IR optimizations"; }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  bool doFinalization(Module &M) override;
};
} // end anonymous namespace
static bool generateSignBits(Value *V) {
  if (!isa<Instruction>(V))
    return false;

  unsigned Opc = cast<Instruction>(V)->getOpcode();
  return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
         Opc == Instruction::SRem;
}
static bool EqualTypeSize(Value *V) {
  return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize;
}

static bool LessOrEqualTypeSize(Value *V) {
  return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize;
}

static bool GreaterThanTypeSize(Value *V) {
  return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize;
}

static bool LessThanTypeSize(Value *V) {
  return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize;
}
/// Some instructions can use 8- and 16-bit operands, and we don't need to
/// promote anything larger. We disallow booleans to make life easier when
/// dealing with icmps but allow any other integer that is <= 16 bits. Void
/// types are accepted so we can handle switches.
static bool isSupportedType(Value *V) {
  Type *Ty = V->getType();

  // Allow voids and pointers, these won't be promoted.
  if (Ty->isVoidTy() || Ty->isPointerTy())
    return true;

  if (auto *Ld = dyn_cast<LoadInst>(V))
    Ty = cast<PointerType>(Ld->getPointerOperandType())->getElementType();

  if (!isa<IntegerType>(Ty) ||
      cast<IntegerType>(V->getType())->getBitWidth() == 1)
    return false;

  return LessOrEqualTypeSize(V);
}
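
// Illustrative examples for isSupportedType above (assuming TypeSize == 16):
//
//   %a = add i16 %x, %y      ; supported: 16 <= TypeSize
//   %b = xor i1 %p, %q       ; not supported: booleans are disallowed
//   %c = add i32 %u, %v      ; not supported: 32 > TypeSize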
/// Return true if the given value is a source in the use-def chain, producing
/// a narrow 'TypeSize' value. These values will be zext to start the promotion
/// of the tree to i32. We guarantee that these won't populate the upper bits
/// of the register. ZExt on the loads will be free, and the same for call
/// return values because we only accept ones that guarantee a zeroext ret val.
/// Many arguments will have the zeroext attribute too, so those would be free
/// too.
static bool isSource(Value *V) {
  if (!isa<IntegerType>(V->getType()))
    return false;

  // TODO Allow zext to be sources.
  if (isa<Argument>(V))
    return true;
  else if (isa<LoadInst>(V))
    return true;
  else if (isa<BitCastInst>(V))
    return true;
  else if (auto *Call = dyn_cast<CallInst>(V))
    return Call->hasRetAttr(Attribute::AttrKind::ZExt);
  else if (auto *Trunc = dyn_cast<TruncInst>(V))
    return EqualTypeSize(Trunc);
  return false;
}
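
// Illustrative examples of values that isSource above accepts (assuming
// TypeSize == 8); each starts life with zeroed upper bits in its register:
//
//   %narrow = load i8, i8* %ptr          ; loads of narrow types
//   %ret = call zeroext i8 @get()        ; calls with zeroext return values
//   %t = trunc i16 %wide to i8           ; truncs producing a TypeSize value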
/// Return true if V will require any promoted values to be truncated for the
/// IR to remain valid. We can't mutate the value type of these
/// instructions.
static bool isSink(Value *V) {
  // TODO The truncate also isn't actually necessary because we would already
  // have proved that the data value is kept within the range of the original
  // data type.

  // Sinks are:
  // - points where the value in the register is being observed, such as an
  //   icmp, switch or store.
  // - points where value types have to match, such as calls and returns.
  // - zext are included to ease the transformation and are generally removed
  //   later on.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return LessOrEqualTypeSize(Store->getValueOperand());
  if (auto *Return = dyn_cast<ReturnInst>(V))
    return LessOrEqualTypeSize(Return->getReturnValue());
  if (auto *ZExt = dyn_cast<ZExtInst>(V))
    return GreaterThanTypeSize(ZExt);
  if (auto *Switch = dyn_cast<SwitchInst>(V))
    return LessThanTypeSize(Switch->getCondition());
  if (auto *ICmp = dyn_cast<ICmpInst>(V))
    return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0));

  return isa<CallInst>(V);
}
/// Return whether the instruction can be promoted without any modifications
/// to its operands or result.
bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
  // FIXME Do we need NSW too?
  if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
    return true;

  // We can support a, potentially, overflowing instruction (I) if:
  // - It is only used by an unsigned icmp.
  // - The icmp uses a constant.
  // - The overflowing value (I) is decreasing, i.e would underflow - wrapping
  //   around zero to become a larger number than before.
  // - The underflowing instruction (I) also uses a constant.
  //
  // We can then use the two constants to calculate whether the result would
  // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
  // just underflows the range, the icmp would give the same result whether the
  // result has been truncated or not. We calculate this by:
  // - Zero extending both constants, if needed, to 32-bits.
  // - Take the absolute value of I's constant, adding this to the icmp const.
  // - Check that this value is not out of range for small type. If it is, it
  //   means that it has underflowed enough to wrap around the icmp constant.
  //
  // For example:
  //
  //   %sub = sub i8 %a, 2
  //   %cmp = icmp ule i8 %sub, 254
  //
  // If %a = 0, %sub = -2 == FE == 254
  // But if this is evaluated as an i32:
  //   %sub = -2 == FF FF FF FE == 4294967294
  // So the unsigned compares (i8 and i32) would not yield the same result.
  //
  // Another way to look at it is:
  //
  //   abs(-2) + 254 = 256
  //
  // And we can't represent 256 in the i8 format, so we don't support it.
  //
  // Whereas we can support:
  //
  //   %sub = sub i8 %a, 1
  //   %cmp = icmp ule i8 %sub, 254
  //
  // If %a = 0, %sub = -1 == FF == 255
  // But if this is evaluated as an i32:
  //   %sub = -1 == FF FF FF FF == 4294967295
  //
  // In this case, the unsigned compare results would be the same and this
  // would also be true for ult, uge and ugt:
  // - (255 < 254) == (0xFFFFFFFF < 254) == false
  // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
  // - (255 > 254) == (0xFFFFFFFF > 254) == true
  // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
  //
  // To demonstrate why we can't handle increasing values:
  //
  //   %add = add i8 %a, 2
  //   %cmp = icmp ult i8 %add, 127
  //
  // If %a = 254, %add = 256 == (i8 1), whereas as an i32 %add = 256, so:
  //
  //   (1 < 127) != (256 < 127)

  unsigned Opc = I->getOpcode();
  if (Opc != Instruction::Add && Opc != Instruction::Sub)
    return false;

  if (!I->hasOneUse() ||
      !isa<ICmpInst>(*I->user_begin()) ||
      !isa<ConstantInt>(I->getOperand(1)))
    return false;

  ConstantInt *OverflowConst = cast<ConstantInt>(I->getOperand(1));
  bool NegImm = OverflowConst->isNegative();
  bool IsDecreasing = ((Opc == Instruction::Sub) && !NegImm) ||
                      ((Opc == Instruction::Add) && NegImm);
  if (!IsDecreasing)
    return false;

  // Don't support an icmp that deals with sign bits.
  auto *CI = cast<ICmpInst>(*I->user_begin());
  if (CI->isSigned() || CI->isEquality())
    return false;

  ConstantInt *ICmpConst = nullptr;
  if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
    ICmpConst = Const;
  else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
    ICmpConst = Const;
  else
    return false;

  // Now check that the result can't wrap on itself.
  APInt Total = ICmpConst->getValue().getBitWidth() < 32 ?
    ICmpConst->getValue().zext(32) : ICmpConst->getValue();

  Total += OverflowConst->getValue().getBitWidth() < 32 ?
    OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs();

  APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize);

  if (Total.getBitWidth() > Max.getBitWidth()) {
    if (Total.ugt(Max.zext(Total.getBitWidth())))
      return false;
  } else if (Max.getBitWidth() > Total.getBitWidth()) {
    if (Total.zext(Max.getBitWidth()).ugt(Max))
      return false;
  } else if (Total.ugt(Max))
    return false;

  LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n");
  return true;
}
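
// Worked instance of the wrap check above: for the supported example in the
// comment, OverflowConst = -1 and ICmpConst = 254, so Total = |-1| + 254 = 255
// and Max = 0xFF for i8. Total does not exceed Max, so the decreasing value
// cannot wrap around the icmp constant and the overflow is allowed.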
static bool shouldPromote(Value *V) {
  if (!isa<IntegerType>(V->getType()) || isSink(V))
    return false;

  if (isSource(V))
    return true;

  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  if (isa<ICmpInst>(I))
    return false;

  return true;
}
/// Return whether we can safely mutate V's type to ExtTy without having to be
/// concerned with zero extending or truncation.
static bool isPromotedResultSafe(Value *V) {
  if (!isa<Instruction>(V))
    return true;

  if (generateSignBits(V))
    return false;

  return !isa<OverflowingBinaryOperator>(V);
}
/// Return the intrinsic for the instruction that can perform the same
/// operation but on a narrow type. This is using the parallel dsp intrinsics
/// on scalar values.
static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
  // Whether we use the signed or unsigned versions of these intrinsics
  // doesn't matter because we're not using the GE bits that they set in
  // the APSR.
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Add:
    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 :
      Intrinsic::arm_uadd8;
  case Instruction::Sub:
    return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_usub16 :
      Intrinsic::arm_usub8;
  }
  llvm_unreachable("unhandled opcode for narrow intrinsic");
}
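
// For example, an i8 add that cannot be proven safe to promote would instead
// be rewritten as a call to the parallel add/subtract intrinsic (a sketch;
// only the bottom byte of the result is subsequently observed):
//
//   %res = call i32 @llvm.arm.uadd8(i32 %a, i32 %b)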
void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
  SmallVector<Instruction*, 4> Users;
  Instruction *InstTo = dyn_cast<Instruction>(To);
  bool ReplacedAll = true;

  LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To
             << "\n");

  for (Use &U : From->uses()) {
    auto *User = cast<Instruction>(U.getUser());
    if (InstTo && User->isIdenticalTo(InstTo)) {
      ReplacedAll = false;
      continue;
    }
    Users.push_back(User);
  }

  for (auto *U : Users)
    U->replaceUsesOfWith(From, To);

  if (ReplacedAll)
    if (auto *I = dyn_cast<Instruction>(From))
      InstsToRemove.insert(I);
}
void IRPromoter::PrepareConstants() {
  IRBuilder<> Builder{Ctx};
  // First step is to prepare the instructions for mutation. Most constants
  // just need to be zero extended into their new type, but complications arise
  // because:
  // - For nuw binary operators, negative immediates would need sign extending;
  //   however, instead we'll change them to positive and zext them. We can do
  //   this because:
  //   > The operators that can wrap are: add, sub, mul and shl.
  //   > shl interprets its second operand as unsigned and if the first operand
  //     is an immediate, it will need zext to be nuw.
  //   > I'm assuming mul has to interpret immediates as unsigned for nuw.
  //   > Which leaves the nuw add and sub to be handled; as with shl, if an
  //     immediate is used as operand 0, it will need zext to be nuw.
  // - We also allow add and sub to safely overflow in certain circumstances
  //   and only when the value (operand 0) is being decreased.
  //
  // For adds and subs, that are either nuw or safely wrap and use a negative
  // immediate as operand 1, we create an equivalent instruction using a
  // positive immediate. That positive immediate can then be zext along with
  // all the other immediates later.
  for (auto *V : *Visited) {
    if (!isa<Instruction>(V))
      continue;

    auto *I = cast<Instruction>(V);
    if (SafeToPromote->count(I)) {

      if (!isa<OverflowingBinaryOperator>(I))
        continue;

      if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
        if (!Const->isNegative())
          continue;

        unsigned Opc = I->getOpcode();
        if (Opc != Instruction::Add && Opc != Instruction::Sub)
          continue;

        LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
        auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
        Builder.SetInsertPoint(I);
        Value *NewVal = Opc == Instruction::Sub ?
          Builder.CreateAdd(I->getOperand(0), NewConst) :
          Builder.CreateSub(I->getOperand(0), NewConst);
        LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");

        if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
          NewInst->copyIRFlags(I);
          NewInsts.insert(NewInst);
        }
        InstsToRemove.insert(I);
        I->replaceAllUsesWith(NewVal);
      }
    }
  }
  for (auto *I : NewInsts)
    Visited->insert(I);
}
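
// For example (illustrative), a safely-wrapping sub using a negative
// immediate:
//
//   %r = sub i8 %a, -10
//
// is replaced here with the equivalent:
//
//   %r = add i8 %a, 10
//
// so that its constant can later be zero extended like all the others.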
void IRPromoter::ExtendSources() {
  IRBuilder<> Builder{Ctx};

  auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
    assert(V->getType() != ExtTy && "zext already extends to i32");
    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
    Builder.SetInsertPoint(InsertPt);
    if (auto *I = dyn_cast<Instruction>(V))
      Builder.SetCurrentDebugLocation(I->getDebugLoc());

    Value *ZExt = Builder.CreateZExt(V, ExtTy);
    if (auto *I = dyn_cast<Instruction>(ZExt)) {
      if (isa<Argument>(V))
        I->moveBefore(InsertPt);
      else
        I->moveAfter(InsertPt);
      NewInsts.insert(I);
    }

    ReplaceAllUsersOfWith(V, ZExt);
  };

  // Now, insert extending instructions between the sources and their users.
  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
  for (auto V : *Sources) {
    LLVM_DEBUG(dbgs() << " - " << *V << "\n");
    if (auto *I = dyn_cast<Instruction>(V))
      InsertZExt(I, I);
    else if (auto *Arg = dyn_cast<Argument>(V)) {
      BasicBlock &BB = Arg->getParent()->front();
      InsertZExt(Arg, &*BB.getFirstInsertionPt());
    } else {
      llvm_unreachable("unhandled source that needs extending");
    }
    Promoted.insert(V);
  }
}
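
// For example (illustrative), given a source load:
//
//   %narrow = load i8, i8* %ptr
//
// ExtendSources inserts:
//
//   %wide = zext i8 %narrow to i32
//
// immediately after it and redirects all of the load's users to %wide.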
void IRPromoter::PromoteTree() {
  LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");

  IRBuilder<> Builder{Ctx};

  // Mutate the types of the instructions within the tree. Here we handle
  // constant operands.
  for (auto *V : *Visited) {
    if (Sources->count(V))
      continue;

    auto *I = cast<Instruction>(V);
    if (Sinks->count(I))
      continue;

    for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
      Value *Op = I->getOperand(i);
      if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
        continue;

      if (auto *Const = dyn_cast<ConstantInt>(Op)) {
        Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
        I->setOperand(i, NewConst);
      } else if (isa<UndefValue>(Op))
        I->setOperand(i, UndefValue::get(ExtTy));
    }

    if (shouldPromote(I)) {
      I->mutateType(ExtTy);
      Promoted.insert(I);
    }
  }

  // Finally, any instructions that should be promoted but haven't yet been,
  // need to be handled using intrinsics.
  for (auto *V : *Visited) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      continue;

    if (Sources->count(I) || Sinks->count(I))
      continue;

    if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I))
      continue;

    assert(EnableDSP && "DSP intrinsic insertion not enabled!");

    // Replace unsafe instructions with appropriate intrinsic calls.
    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
               << *I << "\n");
    Function *DSPInst =
      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
    Builder.SetInsertPoint(I);
    Builder.SetCurrentDebugLocation(I->getDebugLoc());
    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
    CallInst *Call = Builder.CreateCall(DSPInst, Args);
    NewInsts.insert(Call);
    ReplaceAllUsersOfWith(I, Call);
  }
}
void IRPromoter::TruncateSinks() {
  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");

  IRBuilder<> Builder{Ctx};

  auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction* {
    if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
      return nullptr;

    if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V))
      return nullptr;

    LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
               << *V << "\n");
    Builder.SetInsertPoint(cast<Instruction>(V));
    auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
    if (Trunc)
      NewInsts.insert(Trunc);
    return Trunc;
  };

  // Fix up any stores or returns that use the results of the promoted
  // instructions.
  for (auto I : *Sinks) {
    LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n");

    // Handle calls separately as we need to iterate over arg operands.
    if (auto *Call = dyn_cast<CallInst>(I)) {
      for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
        Value *Arg = Call->getArgOperand(i);
        Type *Ty = TruncTysMap[Call][i];
        if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
          Trunc->moveBefore(Call);
          Call->setArgOperand(i, Trunc);
        }
      }
      continue;
    }

    // Special case switches because we need to truncate the condition.
    if (auto *Switch = dyn_cast<SwitchInst>(I)) {
      Type *Ty = TruncTysMap[Switch][0];
      if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) {
        Trunc->moveBefore(Switch);
        Switch->setCondition(Trunc);
      }
      continue;
    }

    // Now handle the others.
    for (unsigned i = 0; i < I->getNumOperands(); ++i) {
      Type *Ty = TruncTysMap[I][i];
      if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) {
        Trunc->moveBefore(I);
        I->setOperand(i, Trunc);
      }
    }
  }
}
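
// For example (illustrative), if a promoted value %v feeds a sink such as:
//
//   store i8 %v, i8* %p
//
// a trunc is created and moved before the store:
//
//   %t = trunc i32 %v to i8
//   store i8 %t, i8* %p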
void IRPromoter::Cleanup() {
  LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n");
  // Some zexts will now have become redundant, along with their trunc
  // operands, so remove them.
  for (auto V : *Visited) {
    if (!isa<ZExtInst>(V))
      continue;

    auto ZExt = cast<ZExtInst>(V);
    if (ZExt->getDestTy() != ExtTy)
      continue;

    Value *Src = ZExt->getOperand(0);
    if (ZExt->getSrcTy() == ZExt->getDestTy()) {
      LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt
                 << "\n");
      ReplaceAllUsersOfWith(ZExt, Src);
      continue;
    }

    // Unless they produce a value that is narrower than ExtTy, we can
    // replace the result of the zext with the input of a newly inserted
    // trunc.
    if (NewInsts.count(Src) && isa<TruncInst>(Src) &&
        Src->getType() == OrigTy) {
      auto *Trunc = cast<TruncInst>(Src);
      assert(Trunc->getOperand(0)->getType() == ExtTy &&
             "expected inserted trunc to be operating on i32");
      ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
    }
  }

  for (auto *I : InstsToRemove) {
    LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
    I->dropAllReferences();
    I->eraseFromParent();
  }

  InstsToRemove.clear();
  NewInsts.clear();
  TruncTysMap.clear();
  Promoted.clear();
}
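
// For example (illustrative), a pre-existing zext whose operand is one of our
// newly inserted truncs of the original type:
//
//   %t = trunc i32 %promoted to i8
//   %z = zext i8 %t to i32
//
// is redundant after promotion, so users of %z are rewritten to use %promoted
// directly and both casts can be removed.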
void IRPromoter::ConvertTruncs() {
  LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n");
  IRBuilder<> Builder{Ctx};

  for (auto *V : *Visited) {
    if (!isa<TruncInst>(V) || Sources->count(V))
      continue;

    auto *Trunc = cast<TruncInst>(V);
    Builder.SetInsertPoint(Trunc);
    IntegerType *SrcTy = cast<IntegerType>(Trunc->getOperand(0)->getType());
    IntegerType *DestTy = cast<IntegerType>(TruncTysMap[Trunc][0]);

    unsigned NumBits = DestTy->getScalarSizeInBits();
    ConstantInt *Mask =
      ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue());
    Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);

    if (auto *I = dyn_cast<Instruction>(Masked))
      NewInsts.insert(I);

    ReplaceAllUsersOfWith(Trunc, Masked);
  }
}
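
// For example (illustrative), a trunc that isn't a source:
//
//   %t = trunc i32 %x to i8
//
// becomes an AND with the max value of the narrow destination type:
//
//   %t.masked = and i32 %x, 255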
void IRPromoter::Mutate(Type *OrigTy,
                        SmallPtrSetImpl<Value*> &Visited,
                        SmallPtrSetImpl<Value*> &Sources,
                        SmallPtrSetImpl<Instruction*> &Sinks,
                        SmallPtrSetImpl<Instruction*> &SafeToPromote) {
  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains from "
             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");

  assert(isa<IntegerType>(OrigTy) && "expected integer type");
  this->OrigTy = cast<IntegerType>(OrigTy);
  assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
         "original type not smaller than extended type");

  this->Visited = &Visited;
  this->Sources = &Sources;
  this->Sinks = &Sinks;
  this->SafeToPromote = &SafeToPromote;

  // Cache original types of the values that will likely need truncating.
  for (auto *I : Sinks) {
    if (auto *Call = dyn_cast<CallInst>(I)) {
      for (unsigned i = 0; i < Call->getNumArgOperands(); ++i) {
        Value *Arg = Call->getArgOperand(i);
        TruncTysMap[Call].push_back(Arg->getType());
      }
    } else if (auto *Switch = dyn_cast<SwitchInst>(I))
      TruncTysMap[I].push_back(Switch->getCondition()->getType());
    else {
      for (unsigned i = 0; i < I->getNumOperands(); ++i)
        TruncTysMap[I].push_back(I->getOperand(i)->getType());
    }
  }
  for (auto *V : Visited) {
    if (!isa<TruncInst>(V) || Sources.count(V))
      continue;
    auto *Trunc = cast<TruncInst>(V);
    TruncTysMap[Trunc].push_back(Trunc->getDestTy());
  }

  // Convert adds and subs using negative immediates to equivalent instructions
  // that use positive constants.
  PrepareConstants();

  // Insert zext instructions between sources and their users.
  ExtendSources();

  // Promote visited instructions, mutating their types in place. Also insert
  // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
  // promote.
  PromoteTree();

  // Convert any truncs, that aren't sources, into AND masks.
  ConvertTruncs();

  // Insert trunc instructions for use by calls, stores etc...
  TruncateSinks();

  // Finally, remove unnecessary zexts and truncs, delete old instructions and
  // clear the data structures.
  Cleanup();

  LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n");
}
/// We accept most instructions, as well as Arguments and ConstantInts. We
/// disallow casts other than zext and truncs and only allow calls if their
/// return value is zeroext. We don't allow opcodes that can introduce sign
/// bits.
bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
  if (auto *I = dyn_cast<ICmpInst>(V)) {
    // Now that we allow smaller types than TypeSize, only allow icmp of
    // TypeSize because they will require a trunc to be legalised.
    // TODO: Allow icmp of smaller types, and calculate at the end
    // whether the transform would be beneficial.
    if (isa<PointerType>(I->getOperand(0)->getType()))
      return true;
    return EqualTypeSize(I->getOperand(0));
  }

  // Memory instructions
  if (isa<StoreInst>(V) || isa<GetElementPtrInst>(V))
    return true;

  // Branches and targets.
  if (isa<BranchInst>(V) || isa<SwitchInst>(V) || isa<BasicBlock>(V))
    return true;

  // Non-instruction values that we can handle.
  if ((isa<Constant>(V) && !isa<ConstantExpr>(V)) || isa<Argument>(V))
    return isSupportedType(V);

  if (isa<PHINode>(V) || isa<SelectInst>(V) || isa<ReturnInst>(V) ||
      isa<LoadInst>(V))
    return isSupportedType(V);

  if (isa<SExtInst>(V))
    return false;

  if (auto *Cast = dyn_cast<CastInst>(V))
    return isSupportedType(Cast) || isSupportedType(Cast->getOperand(0));

  // Special cases for calls as we need to check for zeroext.
  // TODO We should accept calls even if they don't have zeroext, as they can
  // still be sinks.
  if (auto *Call = dyn_cast<CallInst>(V))
    return isSupportedType(Call) &&
           Call->hasRetAttr(Attribute::AttrKind::ZExt);

  if (!isa<BinaryOperator>(V))
    return false;

  if (!isSupportedType(V))
    return false;

  if (generateSignBits(V)) {
    LLVM_DEBUG(dbgs() << "ARM CGP: No, instruction can generate sign bits.\n");
    return false;
  }
  return true;
}
/// Check that the type of V would be promoted and that the original type is
/// smaller than the targeted promoted type. Check that we're not trying to
/// promote something larger than our base 'TypeSize' type.
bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {

  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return true;

  if (SafeToPromote.count(I))
    return true;

  if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
    SafeToPromote.insert(I);
    return true;
  }

  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
    return false;

  // If promotion is not safe, can we use a DSP instruction to natively
  // handle the narrow type?
  if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I))
    return false;

  if (ST->isThumb() && !ST->hasThumb2())
    return false;

  // Would it be profitable? For Thumb code, these parallel DSP instructions
  // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
  // Cortex-A, specifically Cortex-A72, the latency is double and throughput is
  // halved. They also do not take immediates as operands.
  for (auto &Op : I->operands()) {
    if (isa<Constant>(Op)) {
      if (!EnableDSPWithImms)
        return false;
    }
  }
  LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
  return true;
}
bool ARMCodeGenPrepare::TryToPromote(Value *V) {
  OrigTy = V->getType();
  TypeSize = OrigTy->getPrimitiveSizeInBits();
  if (TypeSize > 16 || TypeSize < 8)
    return false;

  SafeToPromote.clear();

  if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
    return false;

  LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = "
             << TypeSize << "\n");

  SetVector<Value*> WorkList;
  SmallPtrSet<Value*, 8> Sources;
  SmallPtrSet<Instruction*, 4> Sinks;
  SmallPtrSet<Value*, 16> CurrentVisited;
  WorkList.insert(V);

  // Return true if V was added to the worklist as a supported instruction,
  // if it was already visited, or if we don't need to explore it (e.g.
  // pointer values and GEPs), and false otherwise.
  auto AddLegalInst = [&](Value *V) {
    if (CurrentVisited.count(V))
      return true;

    // Ignore GEPs because they don't need promoting and the constant indices
    // will prevent the transformation.
    if (isa<GetElementPtrInst>(V))
      return true;

    if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
      LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n");
      return false;
    }

    WorkList.insert(V);
    return true;
  };

  // Iterate through, and add to, a tree of operands and users in the use-def.
  while (!WorkList.empty()) {
    Value *V = WorkList.back();
    WorkList.pop_back();
    if (CurrentVisited.count(V))
      continue;

    // Ignore non-instructions, other than arguments.
    if (!isa<Instruction>(V) && !isSource(V))
      continue;

    // If we've already visited this value from somewhere, bail now because
    // the tree has already been explored.
    // TODO: This could limit the transform, ie if we try to promote something
    // from an i8 and fail first, before trying an i16.
    if (AllVisited.count(V))
      return false;

    CurrentVisited.insert(V);
    AllVisited.insert(V);

    // Calls can be both sources and sinks.
    if (isSink(V))
      Sinks.insert(cast<Instruction>(V));

    if (isSource(V))
      Sources.insert(V);

    if (!isSink(V) && !isSource(V)) {
      if (auto *I = dyn_cast<Instruction>(V)) {
        // Visit operands of any instruction visited.
        for (auto &U : I->operands()) {
          if (!AddLegalInst(U))
            return false;
        }
      }
    }

    // Don't visit users of a node which isn't going to be mutated unless it's
    // a source.
    if (isSource(V) || shouldPromote(V)) {
      for (Use &U : V->uses()) {
        if (!AddLegalInst(U.getUser()))
          return false;
      }
    }
  }

  LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n";
             for (auto *I : CurrentVisited)
               I->dump();
             );

  unsigned ToPromote = 0;
  for (auto *V : CurrentVisited) {
    if (Sources.count(V))
      continue;
    if (Sinks.count(cast<Instruction>(V)))
      continue;
    ++ToPromote;
  }

  if (ToPromote < 2)
    return false;

  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
  return true;
}
bool ARMCodeGenPrepare::doInitialization(Module &M) {
  Promoter = new IRPromoter(&M);
  return false;
}
bool ARMCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F) || DisableCGP)
    return false;

  auto *TPC = &getAnalysis<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<ARMSubtarget>(F);
  bool MadeChange = false;
  LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n");

  // Search up from icmps to try to promote their operands.
  for (BasicBlock &BB : F) {
    auto &Insts = BB.getInstList();
    for (auto &I : Insts) {
      if (AllVisited.count(&I))
        continue;

      if (isa<ICmpInst>(I)) {
        auto &CI = cast<ICmpInst>(I);

        // Skip signed or pointer compares
        if (CI.isSigned() || !isa<IntegerType>(CI.getOperand(0)->getType()))
          continue;

        LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n");

        for (auto &Op : CI.operands()) {
          if (auto *I = dyn_cast<Instruction>(Op))
            MadeChange |= TryToPromote(I);
        }
      }
    }
    LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
                 dbgs() << F;
                 report_fatal_error("Broken function after type promotion");
               });
  }
  LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n");

  return MadeChange;
}
bool ARMCodeGenPrepare::doFinalization(Module &M) {
  delete Promoter;
  return false;
}

INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE,
                      "ARM IR optimizations", false, false)
INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations",
                    false, false)

char ARMCodeGenPrepare::ID = 0;
unsigned ARMCodeGenPrepare::TypeSize = 0;

FunctionPass *llvm::createARMCodeGenPreparePass() {
  return new ARMCodeGenPrepare();
}