//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
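
// For instance, an argument that only reaches a memcpy through a GEP, as in
//   %p = getelementptr i8, ptr %src, i64 8
//   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %p, i64 64, i1 false)
// is reported as a memcpy source with no other uses (illustrative IR, not
// taken from an existing test).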

unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
  unsigned Bonus = 0;

  // Increase the threshold if an incoming argument is used only as a memcpy
  // source.
  if (Function *Callee = CB->getCalledFunction())
    for (Argument &Arg : Callee->args()) {
      bool OtherUse = false;
      if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
        Bonus += 150;
    }

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
  return Bonus;
}

InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}
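
// A rough reading of the rules above: 0x7fffffff fits lgfi and
// 0xffffffff00000000 fits llihf, so each costs one TCC_Basic, while a value
// such as 0x123456789 needs a two-instruction sequence (e.g. llihf + oilf)
// and is charged 2 * TTI::TCC_Basic.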

InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      unsigned Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
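
// As an illustration, an i64 AND with the mask 0xffff0000 hits the
// isRxSBGMask() check above (the mask is a single contiguous bit range), so
// the constant is reported as TCC_Free and constant hoisting leaves it next
// to its user instead of materializing it in a register.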

InstructionCost
SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bits implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP,
                                             OptimizationRemarkEmitter *ORE) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  InstructionCost NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
                                     std::nullopt, 0, TTI::TCK_RecipThroughput);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const NumStoresVal = *NumStores.getValue();
  unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);

  if (HasCall) {
    // Only allow full unrolling if loop has any calls.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;

  UP.Force = true;
}
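
// As a concrete example of the store limit above: a loop body whose stores
// cost 3 gets Max = 12 / 3 = 4, so unrolling will not go beyond a factor of
// 4, while a store-free loop leaves Max at UINT_MAX.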

void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

TypeSize
SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::Fixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::Fixed(ST->hasVector() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::Scalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
                                              unsigned NumStridedMemAccesses,
                                              unsigned NumPrefetches,
                                              bool HasCall) const {
  // Don't prefetch a loop with many far apart accesses.
  if (NumPrefetches > 16)
    return UINT_MAX;

  // Emit prefetch instructions for smaller strides in cases where we think
  // the hardware prefetcher might not be able to keep up.
  if (NumStridedMemAccesses > 32 && !HasCall &&
      (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
    return 1;

  return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
}
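
// For example, a loop with 40 strided accesses out of 41 memory accesses and
// no calls satisfies the condition above ((41 - 40) * 32 <= 40), so prefetch
// instructions are emitted for any stride; otherwise the minimum stride is
// 2048 bytes (8192 with miscellaneous-extensions-3).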

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
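
// For example, <6 x i64> is 384 bits wide and therefore counts as
// ceil(384 / 128) = 3 vector registers here, whereas type legalization
// (and thus getNumberOfParts()) would report 4 parts.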

InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal && (CVal->getValue().isPowerOf2() ||
                   CVal->getValue().isNegatedPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (!Ty->isVectorTy()) {
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Give a discount for some combined logical operations if supported.
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
      if (Opcode == Instruction::Xor) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() &&
                (I->getOpcode() == Instruction::And ||
                 I->getOpcode() == Instruction::Or ||
                 I->getOpcode() == Instruction::Xor))
              return 0;
        }
      }
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
        for (const Value *A : Args) {
          if (const Instruction *I = dyn_cast<Instruction>(A))
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
              return 0;
        }
      }
    }

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }
  else if (ST->hasVector()) {
    auto *VTy = cast<FixedVectorType>(Ty);
    unsigned VF = VTy->getNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      return VF * DivMulSeqCost +
             getScalarizationOverhead(VTy, Args, Tys, CostKind);
    }
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The mischeduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocations plus the cost of
        // inserting and extracting the values.
        InstructionCost ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
        SmallVector<Type *> Tys(Args.size(), Ty);
        InstructionCost Cost =
            (VF * ScalarCost) +
            getScalarizationOverhead(VTy, Args, Tys, CostKind);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      SmallVector<Type *> Tys(Args.size(), Ty);
      InstructionCost Cost = (VF * LIBCALL_COST) +
                             getScalarizationOverhead(VTy, Args, Tys, CostKind);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
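
// To illustrate the scalar division costs above: an i64 division by a
// register is charged DivInstrCost (20), a signed division by 8 only
// SDivPow2Cost (4) for the shift sequence, and a division by a
// non-power-of-2 constant such as 10 gets DivMulSeqCost (10) for the
// multiply-and-shift expansion.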

InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                               VectorType *Tp,
                                               ArrayRef<int> Mask,
                                               TTI::TargetCostKind CostKind,
                                               int Index, VectorType *SubTp,
                                               ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  if (ST->hasVector()) {
    unsigned NumVectors = getNumVectorRegs(Tp);

    // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

    // FP128 values are always in scalar registers, so there is no work
    // involved with a shuffle, except for broadcast. In that case register
    // moves are done with a single instruction per element.
    if (Tp->getScalarType()->isFP128Ty())
      return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

    switch (Kind) {
    case TargetTransformInfo::SK_ExtractSubvector:
      // ExtractSubvector Index indicates start offset.

      // Extracting a subvector from first index is a noop.
      return (Index == 0 ? 0 : NumVectors);

    case TargetTransformInfo::SK_Broadcast:
      // Loop vectorizer calls here to figure out the extra cost of
      // broadcasting a loaded value to all elements of a vector. Since vlrep
      // loads and replicates with a single instruction, adjust the returned
      // value.
      return NumVectors - 1;

    default:

      // SystemZ supports single instruction permutation / replication.
      return NumVectors;
    }
  }

  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
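
// For instance, broadcasting a value to <4 x i32> (a single vector register)
// adds NumVectors - 1 = 0 extra cost, matching a single vlrep when the
// source is a load, while extracting a subvector at a non-zero index costs
// one instruction per vector register of the source type.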

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
             DstTy->getPrimitiveSizeInBits().getFixedValue() &&
         "Packing must reduce size of vector type.");
  assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
             cast<FixedVectorType>(DstTy)->getNumElements() &&
         "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follows the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
          "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of the mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert (!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  auto *DstVTy = cast<FixedVectorType>(Dst);
  unsigned VF = DstVTy->getNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting them to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                 Type *Src,
                                                 TTI::CastContextHint CCH,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  // FIXME: Can the logic below also be used for these cost kinds?
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
    auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    return BaseCost == 0 ? BaseCost : 1;
  }

  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (!Src->isVectorTy()) {
    assert (!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp-type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }
  else if (ST->hasVector()) {
    // Vector to scalar cast.
    auto *SrcVecTy = cast<FixedVectorType>(Src);
    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
    if (!DstVecTy)
      // TODO: tune vector-to-scalar cast.
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    unsigned VF = SrcVecTy->getNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt will use either a single unpack or a vector permute.
        if (Opcode == Instruction::ZExt)
          return NumDstVectors;

        // SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to setup the unpacking.
        unsigned NumSrcVectorOps =
          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                          : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported before z15.
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
        if (SrcScalarBits == DstScalarBits)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      InstructionCost ScalarCost = getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
      InstructionCost TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                          NeedsExtracts, CostKind);
      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
                                          /*Extract*/ false, CostKind);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ +
               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
                                        /*Extract*/ false, CostKind);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                           /*Extract*/ true, CostKind);
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
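
// A worked example of the sign-extension rule above: sext <4 x i16> to
// <4 x i64> needs NumUnpacks = 2 width doublings with NumDstVectors = 2 and
// NumSrcVectors = 1, giving (2 * 2) + (2 - 1) = 5 unpack/setup instructions.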

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                   Type *CondTy,
                                                   CmpInst::Predicate VecPred,
                                                   TTI::TargetCostKind CostKind,
                                                   const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);

  if (!ValTy->isVectorTy()) {
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->isZero())
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition / Select Register.
    }
  }
  else if (ST->hasVector()) {
    unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert (Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
}

InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                   TTI::TargetCostKind CostKind,
                                                   unsigned Index, Value *Op0,
                                                   Value *Op1) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
}

// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    [[fallthrough]];
  case Instruction::SDiv:// SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    [[fallthrough]];
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (CI->getValue().isIntN(16))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
    break;
  }

  return false;
}

static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  // Type legalization (via getNumberOfParts) can't handle structs.
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  unsigned NumOps =
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}
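
// The byte-swap special case above models IR like
//   %v = load i64, ptr %p
//   %r = call i64 @llvm.bswap.i64(i64 %v)
// where the load and the bswap can become a single load-reversed
// instruction (e.g. lrvg), so the load itself is costed as 0 here
// (illustrative IR, not taken from an existing test).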

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  InstructionCost Cost =
      getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}