1 //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a TargetTransformInfo analysis pass specific to the
10 // SystemZ target machine. It uses the target's detailed information to provide
11 // more precise answers to certain TTI queries, while letting the target
12 // independent and default TTI implementations handle the rest.
14 //===----------------------------------------------------------------------===//
16 #include "SystemZTargetTransformInfo.h"
17 #include "llvm/Analysis/TargetTransformInfo.h"
18 #include "llvm/CodeGen/BasicTTIImpl.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/IR/DerivedTypes.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/IR/Intrinsics.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/MathExtras.h"
26 using namespace llvm;
28 #define DEBUG_TYPE "systemztti"
30 //===----------------------------------------------------------------------===//
32 // SystemZ cost model.
34 //===----------------------------------------------------------------------===//
36 static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
37 bool UsedAsMemCpySource = false;
38 for (const User *U : V->users())
39 if (const Instruction *User = dyn_cast<Instruction>(U)) {
40 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
41 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
42 continue;
44 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
45 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
46 UsedAsMemCpySource = true;
47 continue;
50 OtherUse = true;
52 return UsedAsMemCpySource;
55 static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
56 unsigned &NumLoads, const Function *F) {
57 if (!isa<PointerType>(Ptr->getType()))
58 return;
59 for (const User *U : Ptr->users())
60 if (const Instruction *User = dyn_cast<Instruction>(U)) {
61 if (User->getParent()->getParent() == F) {
62 if (const auto *SI = dyn_cast<StoreInst>(User)) {
63 if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
64 NumStores++;
65 } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
66 if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
67 NumLoads++;
68 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
69 if (GEP->getPointerOperand() == Ptr)
70 countNumMemAccesses(GEP, NumStores, NumLoads, F);
76 unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
77 unsigned Bonus = 0;
78 const Function *Caller = CB->getParent()->getParent();
79 const Function *Callee = CB->getCalledFunction();
80 if (!Callee)
81 return 0;
82 const Module *M = Caller->getParent();
84 // Increase the threshold if an incoming argument is used only as a memcpy
85 // source.
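// Hypothetical IR illustrating the pattern checked here: an argument %src
// whose only (transitive) use is the source operand of
//   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %len, i1 false)
// makes isUsedAsMemCpySource() return true with OtherUse left false.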
86 for (const Argument &Arg : Callee->args()) {
87 bool OtherUse = false;
88 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
89 Bonus = 1000;
90 break;
94 // Give a bonus for globals used heavily in both caller and callee.
95 std::set<const GlobalVariable *> CalleeGlobals;
96 std::set<const GlobalVariable *> CallerGlobals;
97 for (const GlobalVariable &Global : M->globals())
98 for (const User *U : Global.users())
99 if (const Instruction *User = dyn_cast<Instruction>(U)) {
100 if (User->getParent()->getParent() == Callee)
101 CalleeGlobals.insert(&Global);
102 if (User->getParent()->getParent() == Caller)
103 CallerGlobals.insert(&Global);
105 for (auto *GV : CalleeGlobals)
106 if (CallerGlobals.count(GV)) {
107 unsigned CalleeStores = 0, CalleeLoads = 0;
108 unsigned CallerStores = 0, CallerLoads = 0;
109 countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
110 countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
111 if ((CalleeStores + CalleeLoads) > 10 &&
112 (CallerStores + CallerLoads) > 10) {
113 Bonus = 1000;
114 break;
118 // Give a bonus when Callee heavily accesses an Alloca of Caller.
119 unsigned NumStores = 0;
120 unsigned NumLoads = 0;
121 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
122 Value *CallerArg = CB->getArgOperand(OpIdx);
123 Argument *CalleeArg = Callee->getArg(OpIdx);
124 if (isa<AllocaInst>(CallerArg))
125 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
127 if (NumLoads > 10)
128 Bonus += NumLoads * 50;
129 if (NumStores > 10)
130 Bonus += NumStores * 50;
131 Bonus = std::min(Bonus, unsigned(1000));
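// Worked example: a callee performing 15 loads and 12 stores through a
// caller Alloca adds 15*50 + 12*50 = 1350, which the clamp limits to 1000.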
133 LLVM_DEBUG(if (Bonus)
134 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
135 return Bonus;
138 InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
139 TTI::TargetCostKind CostKind) {
140 assert(Ty->isIntegerTy());
142 unsigned BitSize = Ty->getPrimitiveSizeInBits();
143 // There is no cost model for constants with a bit size of 0. Return TCC_Free
144 // here, so that constant hoisting will ignore this constant.
145 if (BitSize == 0)
146 return TTI::TCC_Free;
147 // No cost model implemented yet for operations on integers wider than 128 bits.
148 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
149 return TTI::TCC_Free;
151 if (Imm == 0)
152 return TTI::TCC_Free;
154 if (Imm.getBitWidth() <= 64) {
155 // Constants loaded via lgfi.
156 if (isInt<32>(Imm.getSExtValue()))
157 return TTI::TCC_Basic;
158 // Constants loaded via llilf.
159 if (isUInt<32>(Imm.getZExtValue()))
160 return TTI::TCC_Basic;
161 // Constants loaded via llihf:
162 if ((Imm.getZExtValue() & 0xffffffff) == 0)
163 return TTI::TCC_Basic;
165 return 2 * TTI::TCC_Basic;
168 // i128 immediates are loaded from the constant pool.
169 return 2 * TTI::TCC_Basic;
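// Examples of the classification above for a 64-bit immediate:
// 0x7fffffff -> lgfi (TCC_Basic), 0xffffffff -> llilf (TCC_Basic),
// 0xffffffff00000000 -> llihf (TCC_Basic), 0x123456789abcdef0 -> 2 * TCC_Basic.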
172 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
173 const APInt &Imm, Type *Ty,
174 TTI::TargetCostKind CostKind,
175 Instruction *Inst) {
176 assert(Ty->isIntegerTy());
178 unsigned BitSize = Ty->getPrimitiveSizeInBits();
179 // There is no cost model for constants with a bit size of 0. Return TCC_Free
180 // here, so that constant hoisting will ignore this constant.
181 if (BitSize == 0)
182 return TTI::TCC_Free;
183 // No cost model implemented yet for operations on integers wider than 64 bits.
184 if (BitSize > 64)
185 return TTI::TCC_Free;
187 switch (Opcode) {
188 default:
189 return TTI::TCC_Free;
190 case Instruction::GetElementPtr:
191 // Always hoist the base address of a GetElementPtr. This prevents the
192 // creation of new constants for every base constant that gets constant
193 // folded with the offset.
194 if (Idx == 0)
195 return 2 * TTI::TCC_Basic;
196 return TTI::TCC_Free;
197 case Instruction::Store:
198 if (Idx == 0 && Imm.getBitWidth() <= 64) {
199 // Any 8-bit immediate store can be implemented via mvi.
200 if (BitSize == 8)
201 return TTI::TCC_Free;
202 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
203 if (isInt<16>(Imm.getSExtValue()))
204 return TTI::TCC_Free;
206 break;
207 case Instruction::ICmp:
208 if (Idx == 1 && Imm.getBitWidth() <= 64) {
209 // Comparisons against signed 32-bit immediates implemented via cgfi.
210 if (isInt<32>(Imm.getSExtValue()))
211 return TTI::TCC_Free;
212 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
213 if (isUInt<32>(Imm.getZExtValue()))
214 return TTI::TCC_Free;
216 break;
217 case Instruction::Add:
218 case Instruction::Sub:
219 if (Idx == 1 && Imm.getBitWidth() <= 64) {
220 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
221 if (isUInt<32>(Imm.getZExtValue()))
222 return TTI::TCC_Free;
223 // Or their negation, by swapping addition vs. subtraction.
224 if (isUInt<32>(-Imm.getSExtValue()))
225 return TTI::TCC_Free;
227 break;
228 case Instruction::Mul:
229 if (Idx == 1 && Imm.getBitWidth() <= 64) {
230 // We use msgfi to multiply by 32-bit signed immediates.
231 if (isInt<32>(Imm.getSExtValue()))
232 return TTI::TCC_Free;
234 break;
235 case Instruction::Or:
236 case Instruction::Xor:
237 if (Idx == 1 && Imm.getBitWidth() <= 64) {
238 // Masks supported by oilf/xilf.
239 if (isUInt<32>(Imm.getZExtValue()))
240 return TTI::TCC_Free;
241 // Masks supported by oihf/xihf.
242 if ((Imm.getZExtValue() & 0xffffffff) == 0)
243 return TTI::TCC_Free;
245 break;
246 case Instruction::And:
247 if (Idx == 1 && Imm.getBitWidth() <= 64) {
248 // Any 32-bit AND operation can be implemented via nilf.
249 if (BitSize <= 32)
250 return TTI::TCC_Free;
251 // 64-bit masks supported by nilf.
252 if (isUInt<32>(~Imm.getZExtValue()))
253 return TTI::TCC_Free;
254 // 64-bit masks supported by nilh.
255 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
256 return TTI::TCC_Free;
257 // Some 64-bit AND operations can be implemented via risbg.
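// (For example, a single contiguous run of ones such as 0x0000ffffffff0000
// is such a mask.)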
258 const SystemZInstrInfo *TII = ST->getInstrInfo();
259 unsigned Start, End;
260 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
261 return TTI::TCC_Free;
263 break;
264 case Instruction::Shl:
265 case Instruction::LShr:
266 case Instruction::AShr:
267 // Always return TCC_Free for the shift value of a shift instruction.
268 if (Idx == 1)
269 return TTI::TCC_Free;
270 break;
271 case Instruction::UDiv:
272 case Instruction::SDiv:
273 case Instruction::URem:
274 case Instruction::SRem:
275 case Instruction::Trunc:
276 case Instruction::ZExt:
277 case Instruction::SExt:
278 case Instruction::IntToPtr:
279 case Instruction::PtrToInt:
280 case Instruction::BitCast:
281 case Instruction::PHI:
282 case Instruction::Call:
283 case Instruction::Select:
284 case Instruction::Ret:
285 case Instruction::Load:
286 break;
289 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
292 InstructionCost
293 SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
294 const APInt &Imm, Type *Ty,
295 TTI::TargetCostKind CostKind) {
296 assert(Ty->isIntegerTy());
298 unsigned BitSize = Ty->getPrimitiveSizeInBits();
299 // There is no cost model for constants with a bit size of 0. Return TCC_Free
300 // here, so that constant hoisting will ignore this constant.
301 if (BitSize == 0)
302 return TTI::TCC_Free;
303 // No cost model implemented yet for operations on integers wider than 64 bits.
304 if (BitSize > 64)
305 return TTI::TCC_Free;
307 switch (IID) {
308 default:
309 return TTI::TCC_Free;
310 case Intrinsic::sadd_with_overflow:
311 case Intrinsic::uadd_with_overflow:
312 case Intrinsic::ssub_with_overflow:
313 case Intrinsic::usub_with_overflow:
314 // These get expanded to include a normal addition/subtraction.
315 if (Idx == 1 && Imm.getBitWidth() <= 64) {
316 if (isUInt<32>(Imm.getZExtValue()))
317 return TTI::TCC_Free;
318 if (isUInt<32>(-Imm.getSExtValue()))
319 return TTI::TCC_Free;
321 break;
322 case Intrinsic::smul_with_overflow:
323 case Intrinsic::umul_with_overflow:
324 // These get expanded to include a normal multiplication.
325 if (Idx == 1 && Imm.getBitWidth() <= 64) {
326 if (isInt<32>(Imm.getSExtValue()))
327 return TTI::TCC_Free;
329 break;
330 case Intrinsic::experimental_stackmap:
331 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
332 return TTI::TCC_Free;
333 break;
334 case Intrinsic::experimental_patchpoint_void:
335 case Intrinsic::experimental_patchpoint:
336 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
337 return TTI::TCC_Free;
338 break;
340 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
343 TargetTransformInfo::PopcntSupportKind
344 SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
345 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
346 if (ST->hasPopulationCount() && TyWidth <= 64)
347 return TTI::PSK_FastHardware;
348 return TTI::PSK_Software;
351 void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
352 TTI::UnrollingPreferences &UP,
353 OptimizationRemarkEmitter *ORE) {
354 // Find out if L contains a call, what the machine instruction count
355 // estimate is, and how many stores there are.
356 bool HasCall = false;
357 InstructionCost NumStores = 0;
358 for (auto &BB : L->blocks())
359 for (auto &I : *BB) {
360 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
361 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
362 if (isLoweredToCall(F))
363 HasCall = true;
364 if (F->getIntrinsicID() == Intrinsic::memcpy ||
365 F->getIntrinsicID() == Intrinsic::memset)
366 NumStores++;
367 } else { // indirect call.
368 HasCall = true;
371 if (isa<StoreInst>(&I)) {
372 Type *MemAccessTy = I.getOperand(0)->getType();
373 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
374 std::nullopt, 0, TTI::TCK_RecipThroughput);
378 // The z13 processor will run out of store tags if too many stores
379 // are fed into it too quickly. Therefore make sure there are not
380 // too many stores in the resulting unrolled loop.
381 unsigned const NumStoresVal = *NumStores.getValue();
382 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
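// e.g. a loop body with 3 stores limits the unroll count to 12 / 3 = 4.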
384 if (HasCall) {
385 // If the loop has any calls, only allow full unrolling (no partial unrolling).
386 UP.FullUnrollMaxCount = Max;
387 UP.MaxCount = 1;
388 return;
391 UP.MaxCount = Max;
392 if (UP.MaxCount <= 1)
393 return;
395 // Allow partial and runtime trip count unrolling.
396 UP.Partial = UP.Runtime = true;
398 UP.PartialThreshold = 75;
399 UP.DefaultUnrollRuntimeCount = 4;
401 // Allow expensive instructions in the pre-header of the loop.
402 UP.AllowExpensiveTripCount = true;
404 UP.Force = true;
407 void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
408 TTI::PeelingPreferences &PP) {
409 BaseT::getPeelingPreferences(L, SE, PP);
412 bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
413 const TargetTransformInfo::LSRCost &C2) {
414 // SystemZ specific: check instruction count (first), and don't care about
415 // ImmCost, since offsets are checked explicitly.
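// The std::tie comparison below is lexicographic, so e.g. a solution needing
// fewer instructions always wins, even if it uses more registers.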
416 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
417 C1.NumIVMuls, C1.NumBaseAdds,
418 C1.ScaleCost, C1.SetupCost) <
419 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
420 C2.NumIVMuls, C2.NumBaseAdds,
421 C2.ScaleCost, C2.SetupCost);
424 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
425 bool Vector = (ClassID == 1);
426 if (!Vector)
427 // Discount the stack pointer. Also leave out %r0, since it can't
428 // be used in an address.
429 return 14;
430 if (ST->hasVector())
431 return 32;
432 return 0;
435 TypeSize
436 SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
437 switch (K) {
438 case TargetTransformInfo::RGK_Scalar:
439 return TypeSize::getFixed(64);
440 case TargetTransformInfo::RGK_FixedWidthVector:
441 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
442 case TargetTransformInfo::RGK_ScalableVector:
443 return TypeSize::getScalable(0);
446 llvm_unreachable("Unsupported register kind");
449 unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
450 unsigned NumStridedMemAccesses,
451 unsigned NumPrefetches,
452 bool HasCall) const {
453 // Don't prefetch in a loop with many far-apart accesses.
454 if (NumPrefetches > 16)
455 return UINT_MAX;
457 // Emit prefetch instructions for smaller strides in cases where we think
458 // the hardware prefetcher might not be able to keep up.
459 if (NumStridedMemAccesses > 32 && !HasCall &&
460 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
461 return 1;
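// The condition above only holds when non-strided accesses make up at most
// 1/32 (roughly 3%) of the strided ones; returning 1 then allows prefetching
// regardless of the stride.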
463 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
466 bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
467 EVT VT = TLI->getValueType(DL, DataType);
468 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
471 static bool isFreeEltLoad(Value *Op) {
472 if (isa<LoadInst>(Op) && Op->hasOneUse()) {
473 const Instruction *UserI = cast<Instruction>(*Op->user_begin());
474 return !isa<StoreInst>(UserI); // Prefer MVC
476 return false;
479 InstructionCost SystemZTTIImpl::getScalarizationOverhead(
480 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
481 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
482 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
483 InstructionCost Cost = 0;
485 if (Insert && Ty->isIntOrIntVectorTy(64)) {
486 // VLVGP will insert two GPRs with one instruction, while VLE will load
487 // an element directly with no extra cost
488 assert((VL.empty() || VL.size() == NumElts) &&
489 "Type does not match the number of values.");
490 InstructionCost CurrVectorCost = 0;
491 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
492 if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
493 ++CurrVectorCost;
494 if (Idx % 2 == 1) {
495 Cost += std::min(InstructionCost(1), CurrVectorCost);
496 CurrVectorCost = 0;
499 Insert = false;
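// Example for the pairing loop above: inserting four demanded i64 elements
// that are not free element loads costs 2 (two vlvgp), while a pair in which
// one element is a free load still costs 1.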
502 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
503 CostKind, VL);
504 return Cost;
507 // Return the bit size for the scalar type or vector element
508 // type. getScalarSizeInBits() returns 0 for a pointer type.
509 static unsigned getScalarSizeInBits(Type *Ty) {
510 unsigned Size =
511 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
512 assert(Size > 0 && "Element must have non-zero size.");
513 return Size;
516 // getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
517 // type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
518 // 3.
519 static unsigned getNumVectorRegs(Type *Ty) {
520 auto *VTy = cast<FixedVectorType>(Ty);
521 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
522 assert(WideBits > 0 && "Could not compute size of vector");
523 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
526 InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
527 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
528 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
529 ArrayRef<const Value *> Args,
530 const Instruction *CxtI) {
532 // TODO: Handle more cost kinds.
533 if (CostKind != TTI::TCK_RecipThroughput)
534 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
535 Op2Info, Args, CxtI);
537 // TODO: return a good value for BB-VECTORIZER that includes the
538 // immediate loads, which we do not want to count for the loop
539 // vectorizer, since they are hopefully hoisted out of the loop. This
540 // would require a new parameter 'InLoop', but not sure if constant
541 // args are common enough to motivate this.
543 unsigned ScalarBits = Ty->getScalarSizeInBits();
545 // There are three cases of division and remainder: Dividing with a register
546 // needs a divide instruction. A divisor which is a power of two constant
547 // can be implemented with a sequence of shifts. Any other constant needs a
548 // multiply and shifts.
549 const unsigned DivInstrCost = 20;
550 const unsigned DivMulSeqCost = 10;
551 const unsigned SDivPow2Cost = 4;
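// For the scalar case below this means: dividing by a register costs 20, by a
// non-power-of-2 constant 10, and by a power-of-2 constant 4 (signed) or 1
// (unsigned).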
553 bool SignedDivRem =
554 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
555 bool UnsignedDivRem =
556 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
558 // Check for a constant divisor.
559 bool DivRemConst = false;
560 bool DivRemConstPow2 = false;
561 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
562 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
563 const ConstantInt *CVal =
564 (C->getType()->isVectorTy()
565 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
566 : dyn_cast<const ConstantInt>(C));
567 if (CVal && (CVal->getValue().isPowerOf2() ||
568 CVal->getValue().isNegatedPowerOf2()))
569 DivRemConstPow2 = true;
570 else
571 DivRemConst = true;
575 if (!Ty->isVectorTy()) {
576 // These FP operations are supported with a dedicated instruction for
577 // float, double and fp128 (base implementation assumes float generally
578 // costs 2).
579 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
580 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
581 return 1;
583 // There is no native support for FRem.
584 if (Opcode == Instruction::FRem)
585 return LIBCALL_COST;
587 // Give discount for some combined logical operations if supported.
588 if (Args.size() == 2) {
589 if (Opcode == Instruction::Xor) {
590 for (const Value *A : Args) {
591 if (const Instruction *I = dyn_cast<Instruction>(A))
592 if (I->hasOneUse() &&
593 (I->getOpcode() == Instruction::Or ||
594 I->getOpcode() == Instruction::And ||
595 I->getOpcode() == Instruction::Xor))
596 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
597 (isInt128InVR(Ty) &&
598 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
599 return 0;
602 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
603 for (const Value *A : Args) {
604 if (const Instruction *I = dyn_cast<Instruction>(A))
605 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
606 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
607 (isInt128InVR(Ty) &&
608 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
609 return 0;
614 // Or requires one instruction, although it has custom handling for i64.
615 if (Opcode == Instruction::Or)
616 return 1;
618 if (Opcode == Instruction::Xor && ScalarBits == 1) {
619 if (ST->hasLoadStoreOnCond2())
620 return 5; // 2 * (li 0; loc 1); xor
621 return 7; // 2 * ipm sequences ; xor ; shift ; compare
624 if (DivRemConstPow2)
625 return (SignedDivRem ? SDivPow2Cost : 1);
626 if (DivRemConst)
627 return DivMulSeqCost;
628 if (SignedDivRem || UnsignedDivRem)
629 return DivInstrCost;
631 else if (ST->hasVector()) {
632 auto *VTy = cast<FixedVectorType>(Ty);
633 unsigned VF = VTy->getNumElements();
634 unsigned NumVectors = getNumVectorRegs(Ty);
636 // These vector operations are custom handled, but are still supported
637 // with one instruction per vector, regardless of element size.
638 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
639 Opcode == Instruction::AShr) {
640 return NumVectors;
643 if (DivRemConstPow2)
644 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
645 if (DivRemConst) {
646 SmallVector<Type *> Tys(Args.size(), Ty);
647 return VF * DivMulSeqCost +
648 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
650 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
651 // Temporary hack: disable high vectorization factors with integer
652 // division/remainder, which will get scalarized and handled with
653 // GR128 registers. The mischeduler is not clever enough to avoid
654 // spilling yet.
655 return 1000;
657 // These FP operations are supported with a single vector instruction for
658 // double (base implementation assumes float generally costs 2). For
659 // FP128, the scalar cost is 1, and there is no overhead since the values
660 // are already in scalar registers.
661 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
662 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
663 switch (ScalarBits) {
664 case 32: {
665 // The vector enhancements facility 1 provides v4f32 instructions.
666 if (ST->hasVectorEnhancements1())
667 return NumVectors;
668 // Return the cost of multiple scalar invocations plus the cost of
669 // inserting and extracting the values.
670 InstructionCost ScalarCost =
671 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
672 SmallVector<Type *> Tys(Args.size(), Ty);
673 InstructionCost Cost =
674 (VF * ScalarCost) +
675 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
676 // FIXME: VF 2 for these FP operations is currently just as
677 // expensive as for VF 4.
678 if (VF == 2)
679 Cost *= 2;
680 return Cost;
682 case 64:
683 case 128:
684 return NumVectors;
685 default:
686 break;
690 // There is no native support for FRem.
691 if (Opcode == Instruction::FRem) {
692 SmallVector<Type *> Tys(Args.size(), Ty);
693 InstructionCost Cost =
694 (VF * LIBCALL_COST) +
695 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
696 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
697 if (VF == 2 && ScalarBits == 32)
698 Cost *= 2;
699 return Cost;
703 // Fallback to the default implementation.
704 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
705 Args, CxtI);
708 InstructionCost SystemZTTIImpl::getShuffleCost(
709 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
710 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
711 ArrayRef<const Value *> Args, const Instruction *CxtI) {
712 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
713 if (ST->hasVector()) {
714 unsigned NumVectors = getNumVectorRegs(Tp);
716 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
718 // FP128 values are always in scalar registers, so there is no work
719 // involved with a shuffle, except for broadcast. In that case register
720 // moves are done with a single instruction per element.
721 if (Tp->getScalarType()->isFP128Ty())
722 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
724 switch (Kind) {
725 case TargetTransformInfo::SK_ExtractSubvector:
726 // ExtractSubvector Index indicates start offset.
728 // Extracting a subvector from first index is a noop.
729 return (Index == 0 ? 0 : NumVectors);
731 case TargetTransformInfo::SK_Broadcast:
732 // Loop vectorizer calls here to figure out the extra cost of
733 // broadcasting a loaded value to all elements of a vector. Since vlrep
734 // loads and replicates with a single instruction, adjust the returned
735 // value.
736 return NumVectors - 1;
738 default:
740 // SystemZ supports single instruction permutation / replication.
741 return NumVectors;
745 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
748 // Return the log2 difference of the element sizes of the two vector types.
749 static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
750 unsigned Bits0 = Ty0->getScalarSizeInBits();
751 unsigned Bits1 = Ty1->getScalarSizeInBits();
753 if (Bits1 > Bits0)
754 return (Log2_32(Bits1) - Log2_32(Bits0));
756 return (Log2_32(Bits0) - Log2_32(Bits1));
759 // Return the number of instructions needed to truncate SrcTy to DstTy.
760 unsigned SystemZTTIImpl::
761 getVectorTruncCost(Type *SrcTy, Type *DstTy) {
762 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
763 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
764 DstTy->getPrimitiveSizeInBits().getFixedValue() &&
765 "Packing must reduce size of vector type.");
766 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
767 cast<FixedVectorType>(DstTy)->getNumElements() &&
768 "Packing should not change number of elements.");
770 // TODO: Since fp32 is expanded, the extract cost should always be 0.
772 unsigned NumParts = getNumVectorRegs(SrcTy);
773 if (NumParts <= 2)
774 // Up to 2 vector registers can be truncated efficiently with pack or
775 // permute. The latter requires an immediate mask to be loaded, which
776 // typically gets hoisted out of a loop. TODO: return a good value for
777 // BB-VECTORIZER that includes the immediate loads, which we do not want
778 // to count for the loop vectorizer.
779 return 1;
781 unsigned Cost = 0;
782 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
783 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
784 for (unsigned P = 0; P < Log2Diff; ++P) {
785 if (NumParts > 1)
786 NumParts /= 2;
787 Cost += NumParts;
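// e.g. truncating <8 x i64> to <8 x i8>: NumParts starts at 4 and Log2Diff
// is 3, giving 2 + 1 + 1 = 4 (reduced to 3 by the special case below).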
790 // Currently, a general mix of permutes and pack instructions is output by
791 // isel, which follows the cost computation above except for this case, which
792 // is one instruction less:
793 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
794 DstTy->getScalarSizeInBits() == 8)
795 Cost--;
797 return Cost;
800 // Return the cost of converting a vector bitmask produced by a compare
801 // (SrcTy), to the type of the select or extend instruction (DstTy).
802 unsigned SystemZTTIImpl::
803 getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
804 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
805 "Should only be called with vector types.");
807 unsigned PackCost = 0;
808 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
809 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
810 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
811 if (SrcScalarBits > DstScalarBits)
812 // The bitmask will be truncated.
813 PackCost = getVectorTruncCost(SrcTy, DstTy);
814 else if (SrcScalarBits < DstScalarBits) {
815 unsigned DstNumParts = getNumVectorRegs(DstTy);
816 // Each vector select needs its part of the bitmask unpacked.
817 PackCost = Log2Diff * DstNumParts;
818 // Extra cost for moving part of mask before unpacking.
819 PackCost += DstNumParts - 1;
822 return PackCost;
825 // Return the type of the compared operands. This is needed to compute the
826 // cost for a Select / ZExt or SExt instruction.
827 static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
828 Type *OpTy = nullptr;
829 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
830 OpTy = CI->getOperand(0)->getType();
831 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
832 if (LogicI->getNumOperands() == 2)
833 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
834 if (isa<CmpInst>(LogicI->getOperand(1)))
835 OpTy = CI0->getOperand(0)->getType();
837 if (OpTy != nullptr) {
838 if (VF == 1) {
839 assert (!OpTy->isVectorTy() && "Expected scalar type");
840 return OpTy;
842 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
843 // be either scalar or already vectorized with the same or a lesser VF.
844 Type *ElTy = OpTy->getScalarType();
845 return FixedVectorType::get(ElTy, VF);
848 return nullptr;
851 // Get the cost of converting a boolean vector to a vector with same width
852 // and element size as Dst, plus the cost of zero extending if needed.
853 unsigned SystemZTTIImpl::
854 getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
855 const Instruction *I) {
856 auto *DstVTy = cast<FixedVectorType>(Dst);
857 unsigned VF = DstVTy->getNumElements();
858 unsigned Cost = 0;
859 // If we know the widths of the compared operands, get any cost of
860 // converting them to match Dst. Otherwise assume the widths are the same.
861 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
862 if (CmpOpTy != nullptr)
863 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
864 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
865 // One 'vn' per dst vector with an immediate mask.
866 Cost += getNumVectorRegs(Dst);
867 return Cost;
870 InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
871 Type *Src,
872 TTI::CastContextHint CCH,
873 TTI::TargetCostKind CostKind,
874 const Instruction *I) {
875 // FIXME: Can the logic below also be used for these cost kinds?
876 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
877 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
878 return BaseCost == 0 ? BaseCost : 1;
881 unsigned DstScalarBits = Dst->getScalarSizeInBits();
882 unsigned SrcScalarBits = Src->getScalarSizeInBits();
884 if (!Src->isVectorTy()) {
885 assert (!Dst->isVectorTy());
887 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
888 if (Src->isIntegerTy(128))
889 return LIBCALL_COST;
890 if (SrcScalarBits >= 32 ||
891 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
892 return 1;
893 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
896 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
897 Dst->isIntegerTy(128))
898 return LIBCALL_COST;
900 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
901 if (Src->isIntegerTy(1)) {
902 if (DstScalarBits == 128)
903 return 5 /*branch seq.*/;
905 if (ST->hasLoadStoreOnCond2())
906 return 2; // li 0; loc 1
908 // This should be an extension of a compare i1 result, which is done with
909 // ipm and a varying sequence of instructions.
910 unsigned Cost = 0;
911 if (Opcode == Instruction::SExt)
912 Cost = (DstScalarBits < 64 ? 3 : 4);
913 if (Opcode == Instruction::ZExt)
914 Cost = 3;
915 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
916 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
917 // If operands of an FP type were compared, this costs one extra instruction.
918 Cost++;
919 return Cost;
921 else if (isInt128InVR(Dst)) {
922 // Extensions from GPR to i128 (in VR) typically cost two instructions,
923 // but a zero-extending load would be just one extra instruction.
924 if (Opcode == Instruction::ZExt && I != nullptr)
925 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
926 if (Ld->hasOneUse())
927 return 1;
928 return 2;
932 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
933 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
934 if (Ld->hasOneUse())
935 return 0; // Will be converted to GPR load.
936 bool OnlyTruncatingStores = true;
937 for (const User *U : I->users())
938 if (!isa<StoreInst>(U)) {
939 OnlyTruncatingStores = false;
940 break;
942 if (OnlyTruncatingStores)
943 return 0;
944 return 2; // Vector element extraction.
947 else if (ST->hasVector()) {
948 // Vector to scalar cast.
949 auto *SrcVecTy = cast<FixedVectorType>(Src);
950 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
951 if (!DstVecTy) {
952 // TODO: tune vector-to-scalar cast.
953 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
955 unsigned VF = SrcVecTy->getNumElements();
956 unsigned NumDstVectors = getNumVectorRegs(Dst);
957 unsigned NumSrcVectors = getNumVectorRegs(Src);
959 if (Opcode == Instruction::Trunc) {
960 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
961 return 0; // Check for NOOP conversions.
962 return getVectorTruncCost(Src, Dst);
965 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
966 if (SrcScalarBits >= 8) {
967 // ZExt will use either a single unpack or a vector permute.
968 if (Opcode == Instruction::ZExt)
969 return NumDstVectors;
971 // SExt will be handled with one unpack per doubling of width.
972 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
974 // For types that span multiple vector registers, some additional
975 // instructions are used to set up the unpacking.
976 unsigned NumSrcVectorOps =
977 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
978 : (NumDstVectors / 2));
980 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
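// e.g. sext of <4 x i16> to <4 x i64>: NumDstVectors = 2, NumSrcVectors = 1,
// NumUnpacks = 2, so the cost is 2 * 2 + (2 - 1) = 5.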
982 else if (SrcScalarBits == 1)
983 return getBoolVecToIntConversionCost(Opcode, Dst, I);
986 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
987 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
988 // TODO: Fix base implementation which could simplify things a bit here
989 // (seems to miss on differentiating on scalar/vector types).
991 // Only 64 bit vector conversions are natively supported before z15.
992 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
993 if (SrcScalarBits == DstScalarBits)
994 return NumDstVectors;
996 if (SrcScalarBits == 1)
997 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
1000 // Return the cost of multiple scalar invocations plus the cost of
1001 // inserting and extracting the values. The base implementation does not
1002 // realize that float->int gets scalarized.
1003 InstructionCost ScalarCost = getCastInstrCost(
1004 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
1005 InstructionCost TotCost = VF * ScalarCost;
1006 bool NeedsInserts = true, NeedsExtracts = true;
1007 // FP128 registers do not get inserted or extracted.
1008 if (DstScalarBits == 128 &&
1009 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
1010 NeedsInserts = false;
1011 if (SrcScalarBits == 128 &&
1012 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
1013 NeedsExtracts = false;
1015 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1016 NeedsExtracts, CostKind);
1017 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1018 /*Extract*/ false, CostKind);
1020 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1021 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
1022 TotCost *= 2;
1024 return TotCost;
1027 if (Opcode == Instruction::FPTrunc) {
1028 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
1029 return VF /*ldxbr/lexbr*/ +
1030 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1031 /*Extract*/ false, CostKind);
1032 else // double -> float
1033 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
1036 if (Opcode == Instruction::FPExt) {
1037 if (SrcScalarBits == 32 && DstScalarBits == 64) {
1038 // float -> double is very rare and currently unoptimized. Instead of
1039 // using vldeb, which can do two at a time, all conversions are
1040 // scalarized.
1041 return VF * 2;
1043 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1044 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1045 /*Extract*/ true, CostKind);
1049 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1052 // Scalar i8 / i16 operations are typically performed after first extending
1053 // the operands to i32.
1054 static unsigned getOperandsExtensionCost(const Instruction *I) {
1055 unsigned ExtCost = 0;
1056 for (Value *Op : I->operands())
1057 // A load of i8 or i16 sign/zero extends to i32.
1058 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
1059 ExtCost++;
1061 return ExtCost;
1064 InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
1065 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1066 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1067 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1068 if (CostKind != TTI::TCK_RecipThroughput)
1069 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1070 Op1Info, Op2Info);
1072 if (!ValTy->isVectorTy()) {
1073 switch (Opcode) {
1074 case Instruction::ICmp: {
1075 // A loaded value compared with 0 with multiple users becomes Load and
1076 // Test. The load is then not foldable, so return 0 cost for the ICmp.
1077 unsigned ScalarBits = ValTy->getScalarSizeInBits();
1078 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
1079 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1080 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
1081 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1082 C->isZero())
1083 return 0;
1085 unsigned Cost = 1;
1086 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
1087 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
1088 return Cost;
1090 case Instruction::Select:
1091 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
1092 return 4; // No LOC for FP / i128 - costs a conditional jump.
1093 return 1; // Load On Condition / Select Register.
1096 else if (ST->hasVector()) {
1097 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
1099 // Called with a compare instruction.
1100 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
1101 unsigned PredicateExtraCost = 0;
1102 if (I != nullptr) {
1103 // Some predicates cost one or two extra instructions.
1104 switch (cast<CmpInst>(I)->getPredicate()) {
1105 case CmpInst::Predicate::ICMP_NE:
1106 case CmpInst::Predicate::ICMP_UGE:
1107 case CmpInst::Predicate::ICMP_ULE:
1108 case CmpInst::Predicate::ICMP_SGE:
1109 case CmpInst::Predicate::ICMP_SLE:
1110 PredicateExtraCost = 1;
1111 break;
1112 case CmpInst::Predicate::FCMP_ONE:
1113 case CmpInst::Predicate::FCMP_ORD:
1114 case CmpInst::Predicate::FCMP_UEQ:
1115 case CmpInst::Predicate::FCMP_UNO:
1116 PredicateExtraCost = 2;
1117 break;
1118 default:
1119 break;
1123 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1124 // floats. FIXME: <2 x float> generates the same code as <4 x float>.
1125 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1126 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1128 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1129 return Cost;
1131 else { // Called with a select instruction.
1132 assert (Opcode == Instruction::Select);
1134 // We can figure out the extra cost of packing / unpacking if the
1135 // instruction was passed and the compare instruction is found.
1136 unsigned PackCost = 0;
1137 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1138 if (CmpOpTy != nullptr)
1139 PackCost =
1140 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1142 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1146 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1147 Op1Info, Op2Info);
1150 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1151 TTI::TargetCostKind CostKind,
1152 unsigned Index, Value *Op0,
1153 Value *Op1) {
1154 if (Opcode == Instruction::InsertElement) {
1155 // Vector Element Load.
1156 if (Op1 != nullptr && isFreeEltLoad(Op1))
1157 return 0;
1159 // vlvgp will insert two GPRs into a vector register, so count half the
1160 // number of instructions as an estimate when we don't have the full
1161 // picture (as in getScalarizationOverhead()).
1162 if (Val->isIntOrIntVectorTy(64))
1163 return ((Index % 2 == 0) ? 1 : 0);
1166 if (Opcode == Instruction::ExtractElement) {
1167 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1169 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1170 if (Index == 0 && Val->isIntOrIntVectorTy())
1171 Cost += 1;
1173 return Cost;
1176 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1179 // Check if a load may be folded as a memory operand in its user.
1180 bool SystemZTTIImpl::
1181 isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1182 if (!Ld->hasOneUse())
1183 return false;
1184 FoldedValue = Ld;
1185 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1186 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1187 unsigned TruncBits = 0;
1188 unsigned SExtBits = 0;
1189 unsigned ZExtBits = 0;
1190 if (UserI->hasOneUse()) {
1191 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1192 if (isa<TruncInst>(UserI))
1193 TruncBits = UserBits;
1194 else if (isa<SExtInst>(UserI))
1195 SExtBits = UserBits;
1196 else if (isa<ZExtInst>(UserI))
1197 ZExtBits = UserBits;
1199 if (TruncBits || SExtBits || ZExtBits) {
1200 FoldedValue = UserI;
1201 UserI = cast<Instruction>(*UserI->user_begin());
1202 // Load (single use) -> trunc/extend (single use) -> UserI
1204 if ((UserI->getOpcode() == Instruction::Sub ||
1205 UserI->getOpcode() == Instruction::SDiv ||
1206 UserI->getOpcode() == Instruction::UDiv) &&
1207 UserI->getOperand(1) != FoldedValue)
1208 return false; // Not commutative, only RHS foldable.
1209 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if the
1210 // load was extended.
1211 unsigned LoadOrTruncBits =
1212 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1213 switch (UserI->getOpcode()) {
1214 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1215 case Instruction::Sub:
1216 case Instruction::ICmp:
1217 if (LoadedBits == 32 && ZExtBits == 64)
1218 return true;
1219 [[fallthrough]];
1220 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1221 if (UserI->getOpcode() != Instruction::ICmp) {
1222 if (LoadedBits == 16 &&
1223 (SExtBits == 32 ||
1224 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1225 return true;
1226 if (LoadOrTruncBits == 16)
1227 return true;
1229 [[fallthrough]];
1230 case Instruction::SDiv:// SE: 32->64
1231 if (LoadedBits == 32 && SExtBits == 64)
1232 return true;
1233 [[fallthrough]];
1234 case Instruction::UDiv:
1235 case Instruction::And:
1236 case Instruction::Or:
1237 case Instruction::Xor:
1238 // This also makes sense for float operations, but disabled for now due
1239 // to regressions.
1240 // case Instruction::FCmp:
1241 // case Instruction::FAdd:
1242 // case Instruction::FSub:
1243 // case Instruction::FMul:
1244 // case Instruction::FDiv:
1246 // All possible extensions of memory checked above.
1248 // Comparison between memory and immediate.
1249 if (UserI->getOpcode() == Instruction::ICmp)
1250 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1251 if (CI->getValue().isIntN(16))
1252 return true;
1253 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1254 break;
1256 return false;
1259 static bool isBswapIntrinsicCall(const Value *V) {
1260 if (const Instruction *I = dyn_cast<Instruction>(V))
1261 if (auto *CI = dyn_cast<CallInst>(I))
1262 if (auto *F = CI->getCalledFunction())
1263 if (F->getIntrinsicID() == Intrinsic::bswap)
1264 return true;
1265 return false;
1268 InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1269 MaybeAlign Alignment,
1270 unsigned AddressSpace,
1271 TTI::TargetCostKind CostKind,
1272 TTI::OperandValueInfo OpInfo,
1273 const Instruction *I) {
1274 assert(!Src->isVoidTy() && "Invalid type");
1276 // TODO: Handle other cost kinds.
1277 if (CostKind != TTI::TCK_RecipThroughput)
1278 return 1;
1280 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1281 // Store the load or its truncated or extended value in FoldedValue.
1282 const Instruction *FoldedValue = nullptr;
1283 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1284 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1285 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1287 // UserI can't fold two loads, so in that case return 0 cost only
1288 // half of the time.
1289 for (unsigned i = 0; i < 2; ++i) {
1290 if (UserI->getOperand(i) == FoldedValue)
1291 continue;
1293 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1294 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1295 if (!OtherLoad &&
1296 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1297 isa<ZExtInst>(OtherOp)))
1298 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1299 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1300 return i == 0; // Both operands foldable.
1304 return 0; // Only I is foldable in user.
1308 // Type legalization (via getNumberOfParts) can't handle structs
1309 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1310 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1311 CostKind);
1313 // FP128 is a legal type but kept in a register pair on older CPUs.
1314 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1315 return 2;
1317 unsigned NumOps =
1318 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1320 // Store/Load reversed saves one instruction.
1321 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1322 I != nullptr) {
1323 if (Opcode == Instruction::Load && I->hasOneUse()) {
1324 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1325 // In case of load -> bswap -> store, return normal cost for the load.
1326 if (isBswapIntrinsicCall(LdUser) &&
1327 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1328 return 0;
1330 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1331 const Value *StoredVal = SI->getValueOperand();
1332 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1333 return 0;
1337 return NumOps;
1340 // The generic implementation of getInterleavedMemoryOpCost() is based on
1341 // adding costs of the memory operations plus all the extracts and inserts
1342 // needed for using / defining the vector operands. The SystemZ version does
1343 // roughly the same but bases the computations on vector permutations
1344 // instead.
1345 InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1346 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1347 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1348 bool UseMaskForCond, bool UseMaskForGaps) {
1349 if (UseMaskForCond || UseMaskForGaps)
1350 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1351 Alignment, AddressSpace, CostKind,
1352 UseMaskForCond, UseMaskForGaps);
1353 assert(isa<VectorType>(VecTy) &&
1354 "Expect a vector type for interleaved memory op");
1356 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1357 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1358 unsigned VF = NumElts / Factor;
1359 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1360 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1361 unsigned NumPermutes = 0;
1363 if (Opcode == Instruction::Load) {
1364 // Loading interleave groups may have gaps, which may mean fewer
1365 // loads. Find out how many vectors will be loaded in total, and in how
1366 // many of them each value resides.
1367 BitVector UsedInsts(NumVectorMemOps, false);
1368 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1369 for (unsigned Index : Indices)
1370 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1371 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1372 UsedInsts.set(Vec);
1373 ValueVecs[Index].set(Vec);
1375 NumVectorMemOps = UsedInsts.count();
1377 for (unsigned Index : Indices) {
1378 // Estimate that each loaded source vector containing this Index
1379 // requires one operation, except that vperm can handle two input
1380 // registers the first time for each dst vector.
1381 unsigned NumSrcVecs = ValueVecs[Index].count();
1382 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1383 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1384 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1386 } else {
1387 // Estimate the permutes for each stored vector as the smaller of the
1388 // number of elements and the number of source vectors. Subtract one per
1389 // dst vector for vperm (see above).
1390 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1391 unsigned NumDstVecs = NumVectorMemOps;
1392 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
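// e.g. storing a factor-2 interleave group as <8 x i32> (VF 4): two vector
// stores and NumSrcVecs = 2 give 2 * 2 - 2 = 2 permutes, for a total of 4.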
1395 // Cost of load/store operations and the permutations needed.
1396 return NumVectorMemOps + NumPermutes;
1399 static int
1400 getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1401 const SmallVectorImpl<Type *> &ParamTys) {
1402 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1403 return getNumVectorRegs(RetTy); // VPERM
1405 if (ID == Intrinsic::vector_reduce_add) {
1406 // Retrieve number and size of elements for the vector op.
1407 auto *VTy = cast<FixedVectorType>(ParamTys.front());
1408 unsigned ScalarSize = VTy->getScalarSizeInBits();
1409 // For scalar sizes >128 bits, we fall back to the generic cost estimate.
1410 if (ScalarSize > SystemZ::VectorBits)
1411 return -1;
1412 // This many vector regs are needed to represent the input elements (V).
1413 unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1414 // This many instructions are needed for the final sum of vector elems (S).
1415 unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1416 // We use vector adds to create a sum vector, which takes
1417 // V/2 + V/4 + ... = V - 1 operations.
1418 // Then, we need S operations to sum up the elements of that sum vector,
1419 // for a total of V + S - 1 operations.
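// e.g. vector.reduce.add on <8 x i32>: V = 2 vector registers and S = 2,
// giving a cost of 2 + 2 - 1 = 3.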
1420 int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1421 return Cost;
1423 return -1;
1426 InstructionCost
1427 SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1428 TTI::TargetCostKind CostKind) {
1429 InstructionCost Cost = getVectorIntrinsicInstrCost(
1430 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1431 if (Cost != -1)
1432 return Cost;
1433 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1436 bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
1437 // Always expand on Subtargets without vector instructions.
1438 if (!ST->hasVector())
1439 return true;
1441 // Whether or not to expand is a per-intrinsic decision.
1442 switch (II->getIntrinsicID()) {
1443 default:
1444 return true;
1445 // Do not expand vector.reduce.add...
1446 case Intrinsic::vector_reduce_add:
1447 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1448 // ...unless the scalar size is i64 or larger,
1449 // or the operand vector is not full, since the
1450 // performance benefit is dubious in those cases.
1451 return VType->getScalarSizeInBits() >= 64 ||
1452 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;