//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// This is currently only used for the data prefetch pass
static cl::opt<unsigned>
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
              cl::desc("The loop prefetch cache line size"));

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
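
// Target-specific InstCombine folds: rewrite known-aligned lvx/stvx and the
// VSX load/store intrinsics into plain IR loads/stores, and expand vperm with
// a constant mask into element extracts/inserts.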
Optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = IC.Builder.CreateBitCast(
          II.getArgOperand(0), PointerType::getUnqual(II.getType()));
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = IC.Builder.CreateBitCast(
        II.getArgOperand(0), PointerType::getUnqual(II.getType()));
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
      Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
    Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h.  That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
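    // For example: on a little-endian target a mask byte of 0 is remapped to
    // 31 - 0 = 31 below, and the roles of the two input vectors are swapped,
    // which undoes the big-endian-biased encoding produced by vec_perm.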
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return None;
}
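
// Immediate-materialization intuition for the cost helpers below: a
// sign-extended 16-bit value fits in a single li/addi, a 32-bit value needs
// lis (plus an ori unless its low halfword is zero), and anything wider
// requires a longer multi-instruction sequence.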
InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    LLVM_FALLTHROUGH;
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    LLVM_FALLTHROUGH;
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getUserCost(const User *U,
                                        ArrayRef<const Value *> Operands,
                                        TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getUserCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, U->getType());
    return LT.first * BaseT::getUserCost(U, Operands, CostKind);
  }

  return BaseT::getUserCost(U, Operands, CostKind);
}

// Determining the address of a TLS variable results in a function call in
// certain TLS models.
static bool memAddrUsesCTR(const Value *MemAddr, const PPCTargetMachine &TM,
                           SmallPtrSetImpl<const Value *> &Visited) {
  // No need to traverse again if we already checked this operand.
  if (!Visited.insert(MemAddr).second)
    return false;
  const auto *GV = dyn_cast<GlobalValue>(MemAddr);
  if (!GV) {
    // Recurse to check for constants that refer to TLS global variables.
    if (const auto *CV = dyn_cast<Constant>(MemAddr))
      for (const auto &CO : CV->operands())
        if (memAddrUsesCTR(CO, TM, Visited))
          return true;
    return false;
  }

  if (!GV->isThreadLocal())
    return false;
  TLSModel::Model Model = TM.getTLSModel(GV);
  return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
}
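
// Conservatively returns true if this block may contain something (a call, an
// operation that is lowered to a runtime call, an indirect branch, or inline
// asm that clobbers CTR) which would make it unsafe or unprofitable to use the
// counter register for the enclosing loop.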
bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
                             SmallPtrSetImpl<const Value *> &Visited) {
  const PPCTargetMachine &TM = ST->getTargetMachine();

  // Loop through the inline asm constraints and look for something that
  // clobbers ctr.
  auto asmClobbersCTR = [](InlineAsm *IA) {
    InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
    for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
      InlineAsm::ConstraintInfo &C = CIV[i];
      if (C.Type != InlineAsm::isInput)
        for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
          if (StringRef(C.Codes[j]).equals_insensitive("{ctr}"))
            return true;
    }
    return false;
  };

  auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
      return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);

    return false;
  };

  auto supportedHalfPrecisionOp = [](Instruction *Inst) {
    switch (Inst->getOpcode()) {
    default:
      return false;
    case Instruction::FPTrunc:
    case Instruction::FPExt:
    case Instruction::Load:
    case Instruction::Store:
    case Instruction::FPToUI:
    case Instruction::UIToFP:
    case Instruction::FPToSI:
    case Instruction::SIToFP:
      return true;
    }
  };

  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
       J != JE; ++J) {
    // There are no direct operations on half precision so assume that
    // anything with that type requires a call except for a few select
    // operations with Power9.
    if (Instruction *CurrInst = dyn_cast<Instruction>(J)) {
      for (const auto &Op : CurrInst->operands()) {
        if (Op->getType()->getScalarType()->isHalfTy() ||
            CurrInst->getType()->getScalarType()->isHalfTy())
          return !(ST->isISA3_0() && supportedHalfPrecisionOp(CurrInst));
      }
    }
    if (CallInst *CI = dyn_cast<CallInst>(J)) {
      // Inline ASM is okay, unless it clobbers the ctr register.
      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
        if (asmClobbersCTR(IA))
          return true;
        continue;
      }

      if (Function *F = CI->getCalledFunction()) {
        // Most intrinsics don't become function calls, but some might.
        // sin, cos, exp and log are always calls.
        unsigned Opcode = 0;
        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
          switch (F->getIntrinsicID()) {
          default:
            continue;
          // If we have a call to loop_decrement or set_loop_iterations,
          // we're definitely using CTR.
          case Intrinsic::set_loop_iterations:
          case Intrinsic::loop_decrement:
            return true;

          // Binary operations on 128-bit value will use CTR.
          case Intrinsic::experimental_constrained_fadd:
          case Intrinsic::experimental_constrained_fsub:
          case Intrinsic::experimental_constrained_fmul:
          case Intrinsic::experimental_constrained_fdiv:
          case Intrinsic::experimental_constrained_frem:
            if (F->getType()->getScalarType()->isFP128Ty() ||
                F->getType()->getScalarType()->isPPC_FP128Ty())
              return true;
            break;

          case Intrinsic::experimental_constrained_fptosi:
          case Intrinsic::experimental_constrained_fptoui:
          case Intrinsic::experimental_constrained_sitofp:
          case Intrinsic::experimental_constrained_uitofp: {
            Type *SrcType = CI->getArgOperand(0)->getType()->getScalarType();
            Type *DstType = CI->getType()->getScalarType();
            if (SrcType->isPPC_FP128Ty() || DstType->isPPC_FP128Ty() ||
                isLargeIntegerTy(!TM.isPPC64(), SrcType) ||
                isLargeIntegerTy(!TM.isPPC64(), DstType))
              return true;
            break;
          }

          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
          // because, although it does clobber the counter register, the
          // control can't then return to inside the loop unless there is also
          // an eh_sjlj_setjmp.
          case Intrinsic::eh_sjlj_setjmp:

          case Intrinsic::memcpy:
          case Intrinsic::memmove:
          case Intrinsic::memset:
          case Intrinsic::powi:
          case Intrinsic::log:
          case Intrinsic::log2:
          case Intrinsic::log10:
          case Intrinsic::exp:
          case Intrinsic::exp2:
          case Intrinsic::pow:
          case Intrinsic::sin:
          case Intrinsic::cos:
          case Intrinsic::experimental_constrained_powi:
          case Intrinsic::experimental_constrained_log:
          case Intrinsic::experimental_constrained_log2:
          case Intrinsic::experimental_constrained_log10:
          case Intrinsic::experimental_constrained_exp:
          case Intrinsic::experimental_constrained_exp2:
          case Intrinsic::experimental_constrained_pow:
          case Intrinsic::experimental_constrained_sin:
          case Intrinsic::experimental_constrained_cos:
            return true;
          // There is no corresponding FMA instruction for PPC double double.
          // Thus, we need to disable CTR loop generation for this type.
          case Intrinsic::fmuladd:
          case Intrinsic::copysign:
            if (CI->getArgOperand(0)->getType()->getScalarType()->
                isPPC_FP128Ty())
              return true;
            else
              continue; // ISD::FCOPYSIGN is never a library call.
          case Intrinsic::fma:                Opcode = ISD::FMA;        break;
          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
          case Intrinsic::lrint:              Opcode = ISD::LRINT;      break;
          case Intrinsic::llrint:             Opcode = ISD::LLRINT;     break;
          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
          case Intrinsic::lround:             Opcode = ISD::LROUND;     break;
          case Intrinsic::llround:            Opcode = ISD::LLROUND;    break;
          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
          case Intrinsic::experimental_constrained_fcmp:
            Opcode = ISD::STRICT_FSETCC;
            break;
          case Intrinsic::experimental_constrained_fcmps:
            Opcode = ISD::STRICT_FSETCCS;
            break;
          case Intrinsic::experimental_constrained_fma:
            Opcode = ISD::STRICT_FMA;
            break;
          case Intrinsic::experimental_constrained_sqrt:
            Opcode = ISD::STRICT_FSQRT;
            break;
          case Intrinsic::experimental_constrained_floor:
            Opcode = ISD::STRICT_FFLOOR;
            break;
          case Intrinsic::experimental_constrained_ceil:
            Opcode = ISD::STRICT_FCEIL;
            break;
          case Intrinsic::experimental_constrained_trunc:
            Opcode = ISD::STRICT_FTRUNC;
            break;
          case Intrinsic::experimental_constrained_rint:
            Opcode = ISD::STRICT_FRINT;
            break;
          case Intrinsic::experimental_constrained_lrint:
            Opcode = ISD::STRICT_LRINT;
            break;
          case Intrinsic::experimental_constrained_llrint:
            Opcode = ISD::STRICT_LLRINT;
            break;
          case Intrinsic::experimental_constrained_nearbyint:
            Opcode = ISD::STRICT_FNEARBYINT;
            break;
          case Intrinsic::experimental_constrained_round:
            Opcode = ISD::STRICT_FROUND;
            break;
          case Intrinsic::experimental_constrained_lround:
            Opcode = ISD::STRICT_LROUND;
            break;
          case Intrinsic::experimental_constrained_llround:
            Opcode = ISD::STRICT_LLROUND;
            break;
          case Intrinsic::experimental_constrained_minnum:
            Opcode = ISD::STRICT_FMINNUM;
            break;
          case Intrinsic::experimental_constrained_maxnum:
            Opcode = ISD::STRICT_FMAXNUM;
            break;
          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
          }
        }

        // PowerPC does not use [US]DIVREM or other library calls for
        // operations on regular types which are not otherwise library calls
        // (i.e. soft float or atomics). If adapting for targets that do,
        // additional care is required here.

        LibFunc Func;
        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
            LibInfo->getLibFunc(F->getName(), Func) &&
            LibInfo->hasOptimizedCodeGen(Func)) {
          // Non-read-only functions are never treated as intrinsics.
          if (!CI->onlyReadsMemory())
            return true;

          // Conversion happens only for FP calls.
          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
            return true;

          switch (Func) {
          default: return true;
          case LibFunc_copysign:
          case LibFunc_copysignf:
            continue; // ISD::FCOPYSIGN is never a library call.
          case LibFunc_copysignl:
            return true;
          case LibFunc_fabs:
          case LibFunc_fabsf:
          case LibFunc_fabsl:
            continue; // ISD::FABS is never a library call.
          case LibFunc_sqrt:
          case LibFunc_sqrtf:
          case LibFunc_sqrtl:
            Opcode = ISD::FSQRT; break;
          case LibFunc_floor:
          case LibFunc_floorf:
          case LibFunc_floorl:
            Opcode = ISD::FFLOOR; break;
          case LibFunc_nearbyint:
          case LibFunc_nearbyintf:
          case LibFunc_nearbyintl:
            Opcode = ISD::FNEARBYINT; break;
          case LibFunc_ceil:
          case LibFunc_ceilf:
          case LibFunc_ceill:
            Opcode = ISD::FCEIL; break;
          case LibFunc_rint:
          case LibFunc_rintf:
          case LibFunc_rintl:
            Opcode = ISD::FRINT; break;
          case LibFunc_round:
          case LibFunc_roundf:
          case LibFunc_roundl:
            Opcode = ISD::FROUND; break;
          case LibFunc_trunc:
          case LibFunc_truncf:
          case LibFunc_truncl:
            Opcode = ISD::FTRUNC; break;
          case LibFunc_fmin:
          case LibFunc_fminf:
          case LibFunc_fminl:
            Opcode = ISD::FMINNUM; break;
          case LibFunc_fmax:
          case LibFunc_fmaxf:
          case LibFunc_fmaxl:
            Opcode = ISD::FMAXNUM; break;
          }
        }

        if (Opcode) {
          EVT EVTy =
              TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);

          if (EVTy == MVT::Other)
            return true;

          if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
            continue;
          else if (EVTy.isVector() &&
                   TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
            continue;

          return true;
        }
      }

      return true;
    } else if (isa<BinaryOperator>(J) &&
               (J->getType()->getScalarType()->isFP128Ty() ||
                J->getType()->getScalarType()->isPPC_FP128Ty())) {
      // Most operations on f128 or ppc_f128 values become calls.
      return true;
    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
      CastInst *CI = cast<CastInst>(J);
      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
          isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
          isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
        return true;
    } else if (isLargeIntegerTy(!TM.isPPC64(),
                                J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::UDiv ||
                J->getOpcode() == Instruction::SDiv ||
                J->getOpcode() == Instruction::URem ||
                J->getOpcode() == Instruction::SRem)) {
      return true;
    } else if (!TM.isPPC64() &&
               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
               (J->getOpcode() == Instruction::Shl ||
                J->getOpcode() == Instruction::AShr ||
                J->getOpcode() == Instruction::LShr)) {
      // Only on PPC32, for 128-bit integers (specifically not 64-bit
      // integers), these might be runtime calls.
      return true;
    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
      // On PowerPC, indirect jumps use the counter register.
      return true;
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
        return true;
    }

    // FREM is always a call.
    if (J->getOpcode() == Instruction::FRem)
      return true;

    if (ST->useSoftFloat()) {
      switch(J->getOpcode()) {
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::FDiv:
      case Instruction::FPTrunc:
      case Instruction::FPExt:
      case Instruction::FPToUI:
      case Instruction::FPToSI:
      case Instruction::UIToFP:
      case Instruction::SIToFP:
      case Instruction::FCmp:
        return true;
      }
    }

    for (Value *Operand : J->operands())
      if (memAddrUsesCTR(Operand, TM, Visited))
        return true;
  }

  return false;
}
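
// Decide whether converting this loop to a CTR-based hardware loop is likely
// profitable: reject loops with small constant trip counts, loops that might
// use or clobber CTR, loops whose exit edges are profiled as frequently taken,
// and loops whose exit PHIs would require TLS address computation inside the
// loop body.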
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  SmallPtrSet<const Value *, 4> Visited;
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I)
    if (mightUseCTR(*I, LibInfo, Visited))
      return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !BI->extractProfMetadata(TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  // If an exit block has a PHI that accesses a TLS variable as one of the
  // incoming values from the loop, we cannot produce a CTR loop because the
  // address for that value will be computed in the loop.
  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  for (auto &BB : ExitBlocks) {
    for (auto &PHI : BB->phis()) {
      for (int Idx = 0, EndIdx = PHI.getNumIncomingValues(); Idx < EndIdx;
           Idx++) {
        const BasicBlock *IncomingBB = PHI.getIncomingBlock(Idx);
        const Value *IncomingValue = PHI.getIncomingValue(Idx);
        if (L->contains(IncomingBB) &&
            memAddrUsesCTR(IncomingValue, TM, Visited))
          return false;
      }
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}
*L
, ScalarEvolution
&SE
,
793 TTI::UnrollingPreferences
&UP
,
794 OptimizationRemarkEmitter
*ORE
) {
795 if (ST
->getCPUDirective() == PPC::DIR_A2
) {
796 // The A2 is in-order with a deep pipeline, and concatenation unrolling
797 // helps expose latency-hiding opportunities to the instruction scheduler.
798 UP
.Partial
= UP
.Runtime
= true;
800 // We unroll a lot on the A2 (hundreds of instructions), and the benefits
801 // often outweigh the cost of a division to compute the trip count.
802 UP
.AllowExpensiveTripCount
= true;
805 BaseT::getUnrollingPreferences(L
, SE
, UP
, ORE
);
808 void PPCTTIImpl::getPeelingPreferences(Loop
*L
, ScalarEvolution
&SE
,
809 TTI::PeelingPreferences
&PP
) {
810 BaseT::getPeelingPreferences(L
, SE
, PP
);

// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}

unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC:  return "PPC::GPRRC";
  case FPRRC:  return "PPC::FPRRC";
  case VRRC:   return "PPC::VRRC";
  case VSXRC:  return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Check first if the user specified a custom line size.
  if (CacheLineSize.getNumOccurrences() > 0)
    return CacheLineSize;

  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Adjust the cost of vector instructions on targets where there is overlap
// between the vector and scalar units, thereby reducing the overall throughput
// of vector code wrt. scalar code.
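// For example: on such a subtarget a legal single-register vector operation is
// charged twice the base cost below, while operations that are expanded or
// split during legalization keep their base cost.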
InstructionCost PPCTTIImpl::vectorCostAdjustment(InstructionCost Cost,
                                                 unsigned Opcode, Type *Ty1,
                                                 Type *Ty2) {
  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return Cost;

  std::pair<InstructionCost, MVT> LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return Cost;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return Cost;

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return Cost;
  }

  return Cost * 2;
}

InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
  return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask, int Index,
                                           Type *SubTp) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);

  // PPC, for both Altivec/VSX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
                              nullptr);
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  InstructionCost Cost =
      BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}

InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               unsigned Index) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
  Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
    if (ST->hasP9Altivec()) {
      if (ISD == ISD::INSERT_VECTOR_ELT)
        // A move-to VSR and a permute/insert. Assume vector operation cost
        // for both (cost will be 2x on P9).
        return vectorCostAdjustment(2, Opcode, Val, nullptr);

      // It's an extract. Maybe we can do a cheap move-from VSR.
      unsigned EltSize = Val->getScalarSizeInBits();
      if (EltSize == 64) {
        unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
        if (Index == MfvsrdIndex)
          return 1;
      } else if (EltSize == 32) {
        unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
        if (Index == MfvsrwzIndex)
          return 1;
      }

      // We need a vector extract (or mfvsrld). Assume vector operation cost.
      // The cost of the load constant for a vector extract is disregarded
      // (invariant, easily schedulable).
      return vectorCostAdjustment(1, Opcode, Val, nullptr);

    } else if (ST->hasDirectMove())
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      return 3;
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) {
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);
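  // For example: a 16-byte access with 4-byte alignment adds
  // LT.first * (16 / 4 - 1) = LT.first * 3 scalar accesses here.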

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);

  return Cost;
}
PPCTTIImpl::getInterleavedMemoryOpCost(
1194 unsigned Opcode
, Type
*VecTy
, unsigned Factor
, ArrayRef
<unsigned> Indices
,
1195 Align Alignment
, unsigned AddressSpace
, TTI::TargetCostKind CostKind
,
1196 bool UseMaskForCond
, bool UseMaskForGaps
) {
1197 if (UseMaskForCond
|| UseMaskForGaps
)
1198 return BaseT::getInterleavedMemoryOpCost(Opcode
, VecTy
, Factor
, Indices
,
1199 Alignment
, AddressSpace
, CostKind
,
1200 UseMaskForCond
, UseMaskForGaps
);
1202 assert(isa
<VectorType
>(VecTy
) &&
1203 "Expect a vector type for interleaved memory op");
1205 // Legalize the type.
1206 std::pair
<InstructionCost
, MVT
> LT
= TLI
->getTypeLegalizationCost(DL
, VecTy
);
1208 // Firstly, the cost of load/store operation.
1209 InstructionCost Cost
= getMemoryOpCost(Opcode
, VecTy
, MaybeAlign(Alignment
),
1210 AddressSpace
, CostKind
);
1212 // PPC, for both Altivec/VSX, support cheap arbitrary permutations
1213 // (at least in the sense that there need only be one non-loop-invariant
1214 // instruction). For each result vector, we need one shuffle per incoming
1215 // vector (except that the first shuffle can take two incoming vectors
1216 // because it does not need to take itself).
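  // For example: with Factor == 2 and a type legalized into LT.first == 2
  // registers, the line below adds 2 * (2 - 1) = 2 shuffles.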
  Cost += Factor*(LT.first-1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areFunctionArgsABICompatible(
    const Function *Caller, const Function *Callee,
    SmallPtrSetImpl<Argument *> &Args) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args))
    return false;

  return llvm::none_of(Args, [](Argument *A) {
    auto *EltTy = cast<PointerType>(A->getType())->getElementType();
    if (EltTy->isSized())
      return (EltTy->isIntOrIntVectorTy(1) &&
              EltTy->getPrimitiveSizeInBits() > 128);
    return false;
  });
}
*L
, BranchInst
**BI
, ScalarEvolution
*SE
,
1250 LoopInfo
*LI
, DominatorTree
*DT
,
1251 AssumptionCache
*AC
, TargetLibraryInfo
*LibInfo
) {
1252 // Process nested loops first.
1253 for (Loop::iterator I
= L
->begin(), E
= L
->end(); I
!= E
; ++I
)
1254 if (canSaveCmp(*I
, BI
, SE
, LI
, DT
, AC
, LibInfo
))
1255 return false; // Stop search.
1257 HardwareLoopInfo
HWLoopInfo(L
);
1259 if (!HWLoopInfo
.canAnalyze(*LI
))
1262 if (!isHardwareLoopProfitable(L
, *SE
, *AC
, LibInfo
, HWLoopInfo
))
1265 if (!HWLoopInfo
.isHardwareLoopCandidate(*SE
, *LI
, *DT
))
1268 *BI
= HWLoopInfo
.ExitBranch
;

bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                               TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}

bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;