//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
                                 cl::desc("add masking cost for i1 vectors"),
                                 cl::init(true), cl::Hidden);

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
                                          cl::desc("disable constant hoisting on PPC"),
                                          cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//
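
// Report how well the target supports population count for the given type
// width: software only, slow hardware (older POPCNTD), or fast hardware.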
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
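
// Target-specific instcombine hook: fold PowerPC load/store intrinsics into
// plain IR loads and stores when the pointer is sufficiently aligned, and
// rewrite vperm with a constant mask into explicit element extracts/inserts.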
std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(0);
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = II.getArgOperand(0);
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(1);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Value *Ptr = II.getArgOperand(1);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}
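
// Estimate the cost of materializing an integer immediate: a single
// instruction for 16-bit values and lis-able 32-bit constants, and
// progressively more expensive for wider constants.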
InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 and v512i1 respectively.
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}
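
// Generic user-cost hook: defer casts and memory operations to their
// dedicated callbacks, and scale the cost of other vector instructions by
// the number of registers the type is legalized into.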
InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}
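
// Decide whether converting this loop into a CTR-based hardware loop is
// worthwhile: reject small constant-trip-count loops, loops that already
// contain hardware-loop intrinsics, and loops whose exit edges are known to
// be taken more often than the backedge.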
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}
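
// Number of allocatable registers in the given register class: with VSX
// there are 64 VSX registers, otherwise each class provides 32 registers.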
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}

unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
  switch (ClassID) {
    default:
      llvm_unreachable("unknown register class");
      return "PPC::unknown register class";
    case GPRRC:       return "PPC::GPRRC";
    case FPRRC:       return "PPC::FPRRC";
    case VRRC:        return "PPC::VRRC";
    case VSXRC:       return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR11 || Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}
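
// Pick an unrolling (interleave) factor that hides floating-point latency on
// the in-order 440 and A2 cores and keeps both execution units busy on the
// POWER7 and later out-of-order cores.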
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR11 || Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code wrt. scalar code.
// An invalid instruction cost is returned if the type is an MMA vector type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}
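
// Arithmetic costs come from the generic implementation, scaled by the
// vector cost adjustment factor above.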
InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

PPCTTIImpl::getCmpSelInstrCost(
657 unsigned Opcode
, Type
*ValTy
, Type
*CondTy
, CmpInst::Predicate VecPred
,
658 TTI::TargetCostKind CostKind
, TTI::OperandValueInfo Op1Info
,
659 TTI::OperandValueInfo Op2Info
, const Instruction
*I
) {
660 InstructionCost CostFactor
=
661 vectorCostAdjustmentFactor(Opcode
, ValTy
, nullptr);
662 if (!CostFactor
.isValid())
663 return InstructionCost::getMax();
665 InstructionCost Cost
= BaseT::getCmpSelInstrCost(
666 Opcode
, ValTy
, CondTy
, VecPred
, CostKind
, Op1Info
, Op2Info
, I
);
667 // TODO: Handle other cost kinds.
668 if (CostKind
!= TTI::TCK_RecipThroughput
)
670 return Cost
* CostFactor
;
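
// Cost of inserting into or extracting from a vector element. The answer
// depends on whether direct moves between GPRs and vector registers are
// available, whether the P9/P10 insert/extract forms exist, and on the
// load-hit-store penalty that pure Altivec targets pay for going through
// memory.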
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy()) {
    unsigned EltSize = Val->getScalarSizeInBits();
    // Computing on 1 bit values requires extra mask or compare operations.
    unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
    // Computing on a non-constant index requires extra mask or compare
    // operations.
    unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
    if (ST->hasP9Altivec()) {
      // P10 has a vxform insert which can handle a non-constant index. The
      // MaskCostForIdx is for masking the index.
      // P9 has insert for const index. A move-to VSR and a permute/insert.
      // Assume vector operation cost for both (cost will be 2x on P9).
      if (ISD == ISD::INSERT_VECTOR_ELT) {
        if (ST->hasP10Vector())
          return CostFactor + MaskCostForIdx;
        else if (Index != -1U)
          return 2 * CostFactor;
      } else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
        // It's an extract. Maybe we can do a cheap move-from VSR.
        unsigned EltSize = Val->getScalarSizeInBits();
        // P9 has both mfvsrd and mfvsrld for 64 bit integer.
        if (EltSize == 64 && Index != -1U)
          return 1;
        else if (EltSize == 32) {
          unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
          if (Index == MfvsrwzIndex)
            return 1;

          // For other indices, like a non-constant index, P9 has a vxform
          // extract. The MaskCostForIdx is for masking the index.
          return CostFactor + MaskCostForIdx;
        }

        // We need a vector extract (or mfvsrld). Assume vector operation cost.
        // The cost of the load constant for a vector extract is disregarded
        // (invariant, easily schedulable).
        return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
      }
    } else if (ST->hasDirectMove() && Index != -1U) {
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      if (ISD == ISD::INSERT_VECTOR_ELT)
        return 3;
      return 3 + MaskCostForOneBitSize;
    }
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}
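
// Scalar and vector load/store cost, including the extra cost of unaligned
// Altivec vector accesses on targets without VSX.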
InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case. There are also corresponding store
  // instructions.
  unsigned MemBits = Src->getPrimitiveSizeInBits();
  unsigned SrcBytes = LT.second.getStoreSize();
  if (ST->hasVSX() && IsAltivecType) {
    if (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32))
      return 1;

    // Use lfiwax/xxspltw
    Align AlignBytes = Alignment ? *Alignment : Align(1);
    if (Opcode == Instruction::Load && MemBits == 32 && AlignBytes < SrcBytes)
      return 2;
  }

  // Aligned loads and stores are easy.
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}
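
// Interleaved accesses are modeled as the wide vector load/store plus the
// permutes needed to (de)interleave the member vectors.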
InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}

bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}

bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call the default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}
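
// Describe which PowerPC memory intrinsics read or write memory through a
// plain pointer operand, so generic analyses can reason about them.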
bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}
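
// Whether load/store-with-length (explicit vector length) operations are
// supported for this data type: requires a 64-bit P9/P10 target and either
// a 128-bit fixed vector or a scalar of a natively supported width.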
bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have a variable vector length on
  // Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}
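
// Cost of a VP (vector-predicated) load/store. When load/store-with-length
// is available the cost matches an unmasked access, except on POWER9 where a
// misaligned access is charged an expected pipeline-flush penalty weighted
// by the probability that the address is actually misaligned.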
InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as
    // unmasked ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be underestimated, we try to compute the probability
    // that the actual address is aligned to the desired boundary. For example
    // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
    // time, while a 4-byte aligned load has a 25% chance of being 16-byte
    // aligned.
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt to
  // model the cost of legalization. Currently we can only lower intrinsics with
  // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}