//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
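
// Illustrative sketch (hypothetical IR, not taken from a test) of what the
// helper above detects: an argument that only reaches llvm.memcpy as the
// source operand, possibly through bitcasts/GEPs, e.g.
//   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %arg, i64 32, i1 false)
// yields UsedAsMemCpySource == true without setting OtherUse.
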
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;
  for (const User *U : Ptr->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (User->getParent()->getParent() == F) {
        if (const auto *SI = dyn_cast<StoreInst>(User)) {
          if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
            NumStores++;
        } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
          if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
            NumLoads++;
        } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
          if (GEP->getPointerOperand() == Ptr)
            countNumMemAccesses(GEP, NumStores, NumLoads, F);
        }
      }
    }
}
unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;
  const Function *Caller = CB->getParent()->getParent();
  const Function *Callee = CB->getCalledFunction();
  if (!Callee)
    return 0;
  const Module *M = Caller->getParent();

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  for (const Argument &Arg : Callee->args()) {
    bool OtherUse = false;
    if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
      Bonus = 1000;
      break;
    }
  }

  // Give a bonus for globals used heavily in both caller and callee.
  std::set<const GlobalVariable *> CalleeGlobals;
  std::set<const GlobalVariable *> CallerGlobals;
  for (const GlobalVariable &Global : M->globals())
    for (const User *U : Global.users())
      if (const Instruction *User = dyn_cast<Instruction>(U)) {
        if (User->getParent()->getParent() == Callee)
          CalleeGlobals.insert(&Global);
        if (User->getParent()->getParent() == Caller)
          CallerGlobals.insert(&Global);
      }
  for (auto *GV : CalleeGlobals)
    if (CallerGlobals.count(GV)) {
      unsigned CalleeStores = 0, CalleeLoads = 0;
      unsigned CallerStores = 0, CallerLoads = 0;
      countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
      countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
      if ((CalleeStores + CalleeLoads) > 10 &&
          (CallerStores + CallerLoads) > 10) {
        Bonus = 1000;
        break;
      }
    }

  // Give a bonus when the callee accesses an alloca of the caller heavily.
  unsigned NumStores = 0;
  unsigned NumLoads = 0;
  for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
    Value *CallerArg = CB->getArgOperand(OpIdx);
    Argument *CalleeArg = Callee->getArg(OpIdx);
    if (isa<AllocaInst>(CallerArg))
      countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
  }
  if (NumLoads > 10)
    Bonus += NumLoads * 50;
  if (NumStores > 10)
    Bonus += NumStores * 50;
  Bonus = std::min(Bonus, unsigned(1000));

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}
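
// Sketch of the alloca heuristic above: if, say, 15 of the callee's accesses
// to a caller alloca are loads and 12 are stores, the bonus would be
// 15*50 + 12*50 = 1350, clamped to 1000 by the std::min above.
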
InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 128 bits implemented yet.
  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf.
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  // i128 immediates are loaded from the constant pool.
  return 2 * TTI::TCC_Basic;
}
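
// For example (scalar i64 immediates, following the checks above): 12345 and
// 0xfffffffff0000000 (sign-extendable from 32 bits) cost one instruction,
// while a value such as 0x0000000100000001 needs two (2 * TCC_Basic).
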
InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    return TTI::TCC_Basic;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
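
// E.g. per the ICmp case above, comparing an i64 against 1000000 keeps the
// immediate in the compare (TCC_Free, it fits a signed 32-bit field), while a
// comparison against 0x123456789 falls through to getIntImmCost().
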
InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
                                     std::nullopt, 0, TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if the loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;
}
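
// Rough example of the store limit above: a loop body whose stores add up to
// a cost of 3 gets Max = 12 / 3 = 4, so it is unrolled at most 4 times; a
// loop with no stores is only limited by the other unrolling thresholds.
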
void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}
unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}
TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}
unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
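
// E.g. with 40 strided accesses out of 41 in total, no call, and at most 16
// prefetches, (41 - 40) * 32 = 32 <= 40 holds, so a minimum stride of 1 is
// returned and prefetch instructions are emitted even for short strides.
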
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
static bool isFreeEltLoad(Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}
InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, VL);
  return Cost;
}
// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
      (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}
// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// the 3 vector registers that are actually needed.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give a discount for some combined logical operations if supported.
    if (Args.size() == 2) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Xor))
              if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                  (isInt128InVR(Ty) &&
                   (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
                return 0;
        }
      }
      else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
                ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
                 (isInt128InVR(Ty) &&
                  (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost +
             BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) +
            BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost =
          (VF * LIBCALL_COST) +
          BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
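
// A couple of data points implied by the logic above: a scalar i64 sdiv by a
// non-power-of-2 constant costs DivMulSeqCost (10), while on a subtarget with
// vector support a <2 x i64> sdiv by a power-of-2 splat costs
// NumVectors * SDivPow2Cost = 1 * 4 = 4.
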
InstructionCost SystemZTTIImpl::getShuffleCost(
    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from the first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
             DstTy->getPrimitiveSizeInBits().getFixedValue() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follow the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}
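
// E.g. truncating <16 x i32> (4 vector regs) to <16 x i8>: Log2Diff is 2, so
// the loop above accumulates 2 + 1 = 3 pack/permute instructions.
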
// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert(SrcTy->isVectorTy() && DstTy->isVectorTy() &&
         "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of the mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}
// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}
// Get the cost of converting a boolean vector to a vector with the same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting them to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}
InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert(!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (Src->isIntegerTy(128))
        return LIBCALL_COST;
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
        Dst->isIntegerTy(128))
      return LIBCALL_COST;

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
      if (Src->isIntegerTy(1)) {
        if (DstScalarBits == 128)
          return 5 /*branch seq.*/;

        if (ST->hasLoadStoreOnCond2())
          return 2; // li 0; loc 1

        // This should be extension of a compare i1 result, which is done with
        // ipm and a varying sequence of instructions.
        unsigned Cost = 0;
        if (Opcode == Instruction::SExt)
          Cost = (DstScalarBits < 64 ? 3 : 4);
        if (Opcode == Instruction::ZExt)
          Cost = 3;
        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
        if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
          // If operands of an fp-type were compared, this costs +1.
          Cost++;
        return Cost;
      }
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
    }

    if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
      if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
        if (Ld->hasOneUse())
          return 0; // Will be converted to GPR load.
      bool OnlyTruncatingStores = true;
      for (const User *U : I->users())
        if (!isa<StoreInst>(U)) {
          OnlyTruncatingStores = false;
          break;
        }
      if (OnlyTruncatingStores)
        return 0;
      return 2; // Vector element extraction.
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy)
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
            (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                            : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                 NeedsExtracts, CostKind);
      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
                                                 /*Extract*/ false, CostKind);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
                                               /*Extract*/ false, CostKind);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                  /*Extract*/ true, CostKind);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}
InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
        return 4; // No LOC for FP / i128 - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert(Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
            getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info);
}
InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {
  if (Opcode == Instruction::InsertElement) {
    // Vector Element Load.
    if (Op1 != nullptr && isFreeEltLoad(Op1))
      return 0;

    // vlvgp will insert two grs into a vector register, so count half the
    // number of instructions as an estimate when we don't have the full
    // picture (as in getScalarizationOverhead()).
    if (Val->isIntOrIntVectorTy(64))
      return ((Index % 2 == 0) ? 1 : 0);
  }

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}
// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory are checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }
  return false;
}
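
// Example of the above: a single-use i32 load that is zero-extended to i64
// and consumed by an add is reported as foldable (the 32->64 ZE case), so
// getMemoryOpCost() below can give the load itself a cost of 0.
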
static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}
SystemZTTIImpl::getMemoryOpCost(unsigned Opcode
, Type
*Src
,
1269 MaybeAlign Alignment
,
1270 unsigned AddressSpace
,
1271 TTI::TargetCostKind CostKind
,
1272 TTI::OperandValueInfo OpInfo
,
1273 const Instruction
*I
) {
1274 assert(!Src
->isVoidTy() && "Invalid type");
1276 // TODO: Handle other cost kinds.
1277 if (CostKind
!= TTI::TCK_RecipThroughput
)
1280 if (!Src
->isVectorTy() && Opcode
== Instruction::Load
&& I
!= nullptr) {
1281 // Store the load or its truncated or extended value in FoldedValue.
1282 const Instruction
*FoldedValue
= nullptr;
1283 if (isFoldableLoad(cast
<LoadInst
>(I
), FoldedValue
)) {
1284 const Instruction
*UserI
= cast
<Instruction
>(*FoldedValue
->user_begin());
1285 assert (UserI
->getNumOperands() == 2 && "Expected a binop.");
1287 // UserI can't fold two loads, so in that case return 0 cost only
1288 // half of the time.
1289 for (unsigned i
= 0; i
< 2; ++i
) {
1290 if (UserI
->getOperand(i
) == FoldedValue
)
1293 if (Instruction
*OtherOp
= dyn_cast
<Instruction
>(UserI
->getOperand(i
))){
1294 LoadInst
*OtherLoad
= dyn_cast
<LoadInst
>(OtherOp
);
1296 (isa
<TruncInst
>(OtherOp
) || isa
<SExtInst
>(OtherOp
) ||
1297 isa
<ZExtInst
>(OtherOp
)))
1298 OtherLoad
= dyn_cast
<LoadInst
>(OtherOp
->getOperand(0));
1299 if (OtherLoad
&& isFoldableLoad(OtherLoad
, FoldedValue
/*dummy*/))
1300 return i
== 0; // Both operands foldable.
1304 return 0; // Only I is foldable in user.
1308 // Type legalization (via getNumberOfParts) can't handle structs
1309 if (TLI
->getValueType(DL
, Src
, true) == MVT::Other
)
1310 return BaseT::getMemoryOpCost(Opcode
, Src
, Alignment
, AddressSpace
,
1313 // FP128 is a legal type but kept in a register pair on older CPUs.
1314 if (Src
->isFP128Ty() && !ST
->hasVectorEnhancements1())
1318 (Src
->isVectorTy() ? getNumVectorRegs(Src
) : getNumberOfParts(Src
));
1320 // Store/Load reversed saves one instruction.
1321 if (((!Src
->isVectorTy() && NumOps
== 1) || ST
->hasVectorEnhancements2()) &&
1323 if (Opcode
== Instruction::Load
&& I
->hasOneUse()) {
1324 const Instruction
*LdUser
= cast
<Instruction
>(*I
->user_begin());
1325 // In case of load -> bswap -> store, return normal cost for the load.
1326 if (isBswapIntrinsicCall(LdUser
) &&
1327 (!LdUser
->hasOneUse() || !isa
<StoreInst
>(*LdUser
->user_begin())))
1330 else if (const StoreInst
*SI
= dyn_cast
<StoreInst
>(I
)) {
1331 const Value
*StoredVal
= SI
->getValueOperand();
1332 if (StoredVal
->hasOneUse() && isBswapIntrinsicCall(StoredVal
))
// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}
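
// Worked example for a load: <8 x i32> with Factor = 2 and both indices used
// needs 2 vector loads, and each of the 2 extracted values needs a single
// vperm (2 source regs -> 1 dst reg), for a total cost of 2 + 2 = 4.
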
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            const SmallVectorImpl<Type *> &ParamTys) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM

  if (ID == Intrinsic::vector_reduce_add) {
    // Retrieve number and size of elements for the vector op.
    auto *VTy = cast<FixedVectorType>(ParamTys.front());
    unsigned ScalarSize = VTy->getScalarSizeInBits();
    // For scalar sizes >128 bits, we fall back to the generic cost estimate.
    if (ScalarSize > SystemZ::VectorBits)
      return -1;
    // This many vector regs are needed to represent the input elements (V).
    unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
    // This many instructions are needed for the final sum of vector elems (S).
    unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
    // We use vector adds to create a sum vector, which takes
    // V/2 + V/4 + ... = V - 1 operations.
    // Then, we need S operations to sum up the elements of that sum vector,
    // for a total of V + S - 1 operations.
    int Cost = VectorRegsNeeded + LastVectorHandling - 1;
    return Cost;
  }
  return -1;
}
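
// E.g. vector_reduce_add on <8 x i32>: V = 2 input vector regs and S = 2
// final-sum instructions give a cost of V + S - 1 = 3.
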
InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost = getVectorIntrinsicInstrCost(
      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on Subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision.
  switch (II->getIntrinsicID()) {
  default:
    return true;
  // Do not expand vector.reduce.add...
  case Intrinsic::vector_reduce_add:
    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
    // ...unless the scalar size is i64 or larger,
    // or the operand vector is not full, since the
    // performance benefit is dubious in those cases.
    return VType->getScalarSizeInBits() >= 64 ||
           VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
  }
}