//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a TargetTransformInfo analysis pass specific to the
// SystemZ target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "SystemZTargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "systemztti"

//===----------------------------------------------------------------------===//
//
// SystemZ cost model.
//
//===----------------------------------------------------------------------===//

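// Cost of materializing the integer immediate Imm of type Ty in a register.
// Immediates that fit a single load-immediate instruction cost TCC_Basic;
// 64-bit values that need an instruction pair cost 2 * TCC_Basic.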
int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    // Constants loaded via lgfi.
    if (isInt<32>(Imm.getSExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llilf.
    if (isUInt<32>(Imm.getZExtValue()))
      return TTI::TCC_Basic;
    // Constants loaded via llihf:
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
      return TTI::TCC_Basic;

    return 2 * TTI::TCC_Basic;
  }

  return 4 * TTI::TCC_Basic;
}

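// Cost of using immediate Imm as operand number Idx of an instruction with
// the given Opcode. Immediates that can be encoded directly into the
// instruction are reported as TCC_Free so that constant hoisting leaves them
// in place; anything else falls back to the materialization cost above.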
int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::ICmp:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Comparisons against signed 32-bit immediates implemented via cgfi.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Add:
  case Instruction::Sub:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Or their negation, by swapping addition vs. subtraction.
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Mul:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // We use msgfi to multiply by 32-bit signed immediates.
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Masks supported by oilf/xilf.
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      // Masks supported by oihf/xihf.
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
        return TTI::TCC_Free;
    }
    break;
  case Instruction::And:
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      // Any 32-bit AND operation can be implemented via nilf.
      if (BitSize <= 32)
        return TTI::TCC_Free;
      // 64-bit masks supported by nilf.
      if (isUInt<32>(~Imm.getZExtValue()))
        return TTI::TCC_Free;
      // 64-bit masks supported by nilh.
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
        return TTI::TCC_Free;
      // Some 64-bit AND operations can be implemented via risbg.
      const SystemZInstrInfo *TII = ST->getInstrInfo();
      int64_t Start, End;
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
        return TTI::TCC_Free;
    }
    break;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    // Always return TCC_Free for the shift value of a shift instruction.
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
}

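// Same as above, but for immediates used by intrinsic calls; the
// with-overflow intrinsics get expanded to the plain arithmetic checked here.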
int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;
  // No cost model for operations on integers larger than 64 bit implemented yet.
  if (BitSize > 64)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    // These get expanded to include a normal addition/subtraction.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isUInt<32>(Imm.getZExtValue()))
        return TTI::TCC_Free;
      if (isUInt<32>(-Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    // These get expanded to include a normal multiplication.
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
      if (isInt<32>(Imm.getSExtValue()))
        return TTI::TCC_Free;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }

  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
}

TargetTransformInfo::PopcntSupportKind
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
  if (ST->hasPopulationCount() && TyWidth <= 64)
    return TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

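// Unrolling preferences: the unroll count is mainly capped by the number of
// stores in the loop body (see the z13 store-tag comment below), and loops
// that contain calls are only fully unrolled.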
void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
  // Find out if L contains a call, what the machine instruction count
  // estimate is, and how many stores there are.
  bool HasCall = false;
  unsigned NumStores = 0;
  for (auto &BB : L->blocks())
    for (auto &I : *BB) {
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
        ImmutableCallSite CS(&I);
        if (const Function *F = CS.getCalledFunction()) {
          if (isLoweredToCall(F))
            HasCall = true;
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
              F->getIntrinsicID() == Intrinsic::memset)
            NumStores++;
        } else { // indirect call.
          HasCall = true;
        }
      }
      if (isa<StoreInst>(&I)) {
        Type *MemAccessTy = I.getOperand(0)->getType();
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
      }
    }

  // The z13 processor will run out of store tags if too many stores
  // are fed into it too quickly. Therefore make sure there are not
  // too many stores in the resulting unrolled loop.
  unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);

  if (HasCall) {
    // If the loop has any calls, only allow full unrolling.
    UP.FullUnrollMaxCount = Max;
    UP.MaxCount = 1;
    return;
  }

  UP.MaxCount = Max;
  if (UP.MaxCount <= 1)
    return;

  // Allow partial and runtime trip count unrolling.
  UP.Partial = UP.Runtime = true;

  UP.PartialThreshold = 75;
  UP.DefaultUnrollRuntimeCount = 4;

  // Allow expensive instructions in the pre-header of the loop.
  UP.AllowExpensiveTripCount = true;
}

bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                   TargetTransformInfo::LSRCost &C2) {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.SetupCost);
}

unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
  if (!Vector)
    // Discount the stack pointer. Also leave out %r0, since it can't
    // be used in an address.
    return 14;
  if (ST->hasVector())
    return 32;
  return 0;
}

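// Register widths: GPRs are 64 bits; vector registers, when present, are
// 128 bits.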
unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
  if (!Vector)
    return 64;
  if (ST->hasVector())
    return 128;
  return 0;
}

bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}

// Return the bit size for the scalar type or vector element
// type. getScalarSizeInBits() returns 0 for a pointer type.
static unsigned getScalarSizeInBits(Type *Ty) {
  unsigned Size =
    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
  assert(Size > 0 && "Element must have non-zero size.");
  return Size;
}

// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
  assert(Ty->isVectorTy() && "Expected vector type");
  unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}

int SystemZTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo,
    ArrayRef<const Value *> Args) {

  // TODO: return a good value for BB-VECTORIZER that includes the
  // immediate loads, which we do not want to count for the loop
  // vectorizer, since they are hopefully hoisted out of the loop. This
  // would require a new parameter 'InLoop', but not sure if constant
  // args are common enough to motivate this.

  unsigned ScalarBits = Ty->getScalarSizeInBits();

  // There are three cases of division and remainder: Dividing with a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
  const unsigned DivInstrCost = 20;
  const unsigned DivMulSeqCost = 10;
  const unsigned SDivPow2Cost = 4;

  bool SignedDivRem =
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
  bool UnsignedDivRem =
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;

  // Check for a constant divisor.
  bool DivRemConst = false;
  bool DivRemConstPow2 = false;
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
      const ConstantInt *CVal =
          (C->getType()->isVectorTy()
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
               : dyn_cast<const ConstantInt>(C));
      if (CVal != nullptr &&
          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
        DivRemConstPow2 = true;
      else
        DivRemConst = true;
    }
  }

  if (Ty->isVectorTy()) {
    assert(ST->hasVector() &&
           "getArithmeticInstrCost() called with vector type.");
    unsigned VF = Ty->getVectorNumElements();
    unsigned NumVectors = getNumVectorRegs(Ty);

    // These vector operations are custom handled, but are still supported
    // with one instruction per vector, regardless of element size.
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
        Opcode == Instruction::AShr) {
      return NumVectors;
    }

    if (DivRemConstPow2)
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
    if (DivRemConst)
      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
      // Temporary hack: disable high vectorization factors with integer
      // division/remainder, which will get scalarized and handled with
      // GR128 registers. The machine scheduler is not clever enough to avoid
      // spilling yet.
      return 1000;

    // These FP operations are supported with a single vector instruction for
    // double (base implementation assumes float generally costs 2). For
    // FP128, the scalar cost is 1, and there is no overhead since the values
    // are already in scalar registers.
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
      switch (ScalarBits) {
      case 32: {
        // The vector enhancements facility 1 provides v4f32 instructions.
        if (ST->hasVectorEnhancements1())
          return NumVectors;
        // Return the cost of multiple scalar invocation plus the cost of
        // inserting and extracting the values.
        unsigned ScalarCost =
            getArithmeticInstrCost(Opcode, Ty->getScalarType());
        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
        // FIXME: VF 2 for these FP operations is currently just as
        // expensive as for VF 4.
        if (VF == 2)
          Cost *= 2;
        return Cost;
      }
      case 64:
      case 128:
        return NumVectors;
      default:
        break;
      }
    }

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem) {
      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
      if (VF == 2 && ScalarBits == 32)
        Cost *= 2;
      return Cost;
    }
  }
  else { // Scalar:
    // These FP operations are supported with a dedicated instruction for
    // float, double and fp128 (base implementation assumes float generally
    // costs 2).
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
      return 1;

    // There is no native support for FRem.
    if (Opcode == Instruction::FRem)
      return LIBCALL_COST;

    // Or requires one instruction, although it has custom handling for i64.
    if (Opcode == Instruction::Or)
      return 1;

    if (Opcode == Instruction::Xor && ScalarBits == 1) {
      if (ST->hasLoadStoreOnCond2())
        return 5; // 2 * (li 0; loc 1); xor
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
    }

    if (DivRemConstPow2)
      return (SignedDivRem ? SDivPow2Cost : 1);
    if (DivRemConst)
      return DivMulSeqCost;
    if (SignedDivRem || UnsignedDivRem)
      return DivInstrCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
                                       Opd1PropInfo, Opd2PropInfo, Args);
}

int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) {
  assert (Tp->isVectorTy());
  assert (ST->hasVector() && "getShuffleCost() called.");
  unsigned NumVectors = getNumVectorRegs(Tp);

  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.

  // FP128 values are always in scalar registers, so there is no work
  // involved with a shuffle, except for broadcast. In that case register
  // moves are done with a single instruction per element.
  if (Tp->getScalarType()->isFP128Ty())
    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);

  switch (Kind) {
  case TargetTransformInfo::SK_ExtractSubvector:
    // ExtractSubvector Index indicates start offset.

    // Extracting a subvector from first index is a noop.
    return (Index == 0 ? 0 : NumVectors);

  case TargetTransformInfo::SK_Broadcast:
    // Loop vectorizer calls here to figure out the extra cost of
    // broadcasting a loaded value to all elements of a vector. Since vlrep
    // loads and replicates with a single instruction, adjust the returned
    // value.
    return NumVectors - 1;

  default:
    // SystemZ supports single instruction permutation / replication.
    return NumVectors;
  }

  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}

// Return the log2 difference of the element sizes of the two vector types.
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}

// Return the number of instructions needed to truncate SrcTy to DstTy.
unsigned SystemZTTIImpl::
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
  assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
          "Packing must reduce size of vector type.");
  assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
          "Packing should not change number of elements.");

  // TODO: Since fp32 is expanded, the extract cost should always be 0.

  unsigned NumParts = getNumVectorRegs(SrcTy);
  if (NumParts <= 2)
    // Up to 2 vector registers can be truncated efficiently with pack or
    // permute. The latter requires an immediate mask to be loaded, which
    // typically gets hoisted out of a loop. TODO: return a good value for
    // BB-VECTORIZER that includes the immediate loads, which we do not want
    // to count for the loop vectorizer.
    return 1;

  unsigned Cost = 0;
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  unsigned VF = SrcTy->getVectorNumElements();
  for (unsigned P = 0; P < Log2Diff; ++P) {
    if (NumParts > 1)
      NumParts /= 2;
    Cost += NumParts;
  }

  // Currently, a general mix of permutes and pack instructions is output by
  // isel, which follow the cost computation above except for this case which
  // is one instruction less:
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
      DstTy->getScalarSizeInBits() == 8)
    Cost--;

  return Cost;
}

// Return the cost of converting a vector bitmask produced by a compare
// (SrcTy), to the type of the select or extend instruction (DstTy).
unsigned SystemZTTIImpl::
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
          "Should only be called with vector types.");

  unsigned PackCost = 0;
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
  if (SrcScalarBits > DstScalarBits)
    // The bitmask will be truncated.
    PackCost = getVectorTruncCost(SrcTy, DstTy);
  else if (SrcScalarBits < DstScalarBits) {
    unsigned DstNumParts = getNumVectorRegs(DstTy);
    // Each vector select needs its part of the bitmask unpacked.
    PackCost = Log2Diff * DstNumParts;
    // Extra cost for moving part of mask before unpacking.
    PackCost += DstNumParts - 1;
  }

  return PackCost;
}

// Return the type of the compared operands. This is needed to compute the
// cost for a Select / ZExt or SExt instruction.
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert (!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return VectorType::get(ElTy, VF);
  }

  return nullptr;
}

// Get the cost of converting a boolean vector to a vector with same width
// and element size as Dst, plus the cost of zero extending if needed.
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
                              const Instruction *I) {
  assert (Dst->isVectorTy());
  unsigned VF = Dst->getVectorNumElements();
  unsigned Cost = 0;
  // If we know the widths of the compared operands, get any cost of
  // converting them to match Dst. Otherwise assume same widths.
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
  if (CmpOpTy != nullptr)
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
    // One 'vn' per dst vector with an immediate mask.
    Cost += getNumVectorRegs(Dst);
  return Cost;
}

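// Cost of a conversion (trunc/extend, int<->fp, fptrunc/fpext) from Src to
// Dst, for both scalar and vector types.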
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     const Instruction *I) {
  unsigned DstScalarBits = Dst->getScalarSizeInBits();
  unsigned SrcScalarBits = Src->getScalarSizeInBits();

  if (Src->isVectorTy()) {
    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
    assert (Dst->isVectorTy());
    unsigned VF = Src->getVectorNumElements();
    unsigned NumDstVectors = getNumVectorRegs(Dst);
    unsigned NumSrcVectors = getNumVectorRegs(Src);

    if (Opcode == Instruction::Trunc) {
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
        return 0; // Check for NOOP conversions.
      return getVectorTruncCost(Src, Dst);
    }

    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
      if (SrcScalarBits >= 8) {
        // ZExt/SExt will be handled with one unpack per doubling of width.
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);

        // For types that span multiple vector registers, some additional
        // instructions are used to set up the unpacking.
        unsigned NumSrcVectorOps =
          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
                          : (NumDstVectors / 2));

        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
      }
      else if (SrcScalarBits == 1)
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
    }

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
      // TODO: Fix base implementation which could simplify things a bit here
      // (seems to miss on differentiating on scalar/vector types).

      // Only 64 bit vector conversions are natively supported.
      if (DstScalarBits == 64) {
        if (SrcScalarBits == 64)
          return NumDstVectors;

        if (SrcScalarBits == 1)
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
      }

      // Return the cost of multiple scalar invocation plus the cost of
      // inserting and extracting the values. Base implementation does not
      // realize float->int gets scalarized.
      unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
                                             Src->getScalarType());
      unsigned TotCost = VF * ScalarCost;
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
      TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);

      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
        TotCost *= 2;

      return TotCost;
    }

    if (Opcode == Instruction::FPTrunc) {
      if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
      else // double -> float
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
    }

    if (Opcode == Instruction::FPExt) {
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
        // float -> double is very rare and currently unoptimized. Instead of
        // using vldeb, which can do two at a time, all conversions are
        // scalarized.
        return VF * 2;
      }
      // -> fp128. VF * lxdb/lxeb + extraction of elements.
      return VF + getScalarizationOverhead(Src, false, true);
    }
  }
  else { // Scalar
    assert (!Dst->isVectorTy());

    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
      if (SrcScalarBits >= 32 ||
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
        return 1;
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
    }

    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
        Src->isIntegerTy(1)) {
      if (ST->hasLoadStoreOnCond2())
        return 2; // li 0; loc 1

      // This should be extension of a compare i1 result, which is done with
      // ipm and a varying sequence of instructions.
      unsigned Cost = 0;
      if (Opcode == Instruction::SExt)
        Cost = (DstScalarBits < 64 ? 3 : 4);
      if (Opcode == Instruction::ZExt)
        Cost = 3;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
        // If operands of an fp type were compared, this costs +1.
        Cost++;
      return Cost;
    }
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}

// Scalar i8 / i16 operations will typically be made after first extending
// the operands to i32.
static unsigned getOperandsExtensionCost(const Instruction *I) {
  unsigned ExtCost = 0;
  for (Value *Op : I->operands())
    // A load of i8 or i16 sign/zero extends to i32.
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
      ExtCost++;

  return ExtCost;
}

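// Cost of a compare or select. Vector compares cost one instruction per
// vector register plus any extra instructions the predicate requires; vector
// selects additionally pay for converting the compare bitmask when the
// compared and selected element sizes differ.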
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy, const Instruction *I) {
  if (ValTy->isVectorTy()) {
    assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
    unsigned VF = ValTy->getVectorNumElements();

    // Called with a compare instruction.
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
        case CmpInst::Predicate::ICMP_NE:
        case CmpInst::Predicate::ICMP_UGE:
        case CmpInst::Predicate::ICMP_ULE:
        case CmpInst::Predicate::ICMP_SGE:
        case CmpInst::Predicate::ICMP_SLE:
          PredicateExtraCost = 1;
          break;
        case CmpInst::Predicate::FCMP_ONE:
        case CmpInst::Predicate::FCMP_ORD:
        case CmpInst::Predicate::FCMP_UEQ:
        case CmpInst::Predicate::FCMP_UNO:
          PredicateExtraCost = 2;
          break;
        default:
          break;
        }
      }

      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
      // floats. FIXME: <2 x float> generates same code as <4 x float>.
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);

      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
      return Cost;
    }
    else { // Called with a select instruction.
      assert (Opcode == Instruction::Select);

      // We can figure out the extra cost of packing / unpacking if the
      // instruction was passed and the compare instruction is found.
      unsigned PackCost = 0;
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
      if (CmpOpTy != nullptr)
        PackCost =
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);

      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
    }
  }
  else { // Scalar
    switch (Opcode) {
    case Instruction::ICmp: {
      // A loaded value compared with 0 with multiple users becomes Load and
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
      if (I != nullptr && ScalarBits >= 32)
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
                C->getZExtValue() == 0)
              return 0;

      unsigned Cost = 1;
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
      return Cost;
    }
    case Instruction::Select:
      if (ValTy->isFloatingPointTy())
        return 4; // No load on condition for FP - costs a conditional jump.
      return 1; // Load On Condition.
    }
  }

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
}

int SystemZTTIImpl::
getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
  // vlvgp will insert two grs into a vector register, so only count half the
  // number of instructions.
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
    return ((Index % 2 == 0) ? 1 : 0);

  if (Opcode == Instruction::ExtractElement) {
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);

    // Give a slight penalty for moving out of vector pipeline to FXU unit.
    if (Index == 0 && Val->isIntOrIntVectorTy())
      Cost += 1;

    return Cost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, Index);
}

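// Example for getVectorInstrCost above: for a <2 x i64> insertelement, the
// element at index 0 costs one instruction (vlvgp inserts both GPRs at once)
// and the element at index 1 is free.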
// Check if a load may be folded as a memory operand in its user.
bool SystemZTTIImpl::
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
  if (!Ld->hasOneUse())
    return false;
  FoldedValue = Ld;
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
  unsigned TruncBits = 0;
  unsigned SExtBits = 0;
  unsigned ZExtBits = 0;
  if (UserI->hasOneUse()) {
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
    if (isa<TruncInst>(UserI))
      TruncBits = UserBits;
    else if (isa<SExtInst>(UserI))
      SExtBits = UserBits;
    else if (isa<ZExtInst>(UserI))
      ZExtBits = UserBits;
  }
  if (TruncBits || SExtBits || ZExtBits) {
    FoldedValue = UserI;
    UserI = cast<Instruction>(*UserI->user_begin());
    // Load (single use) -> trunc/extend (single use) -> UserI
  }
  if ((UserI->getOpcode() == Instruction::Sub ||
       UserI->getOpcode() == Instruction::SDiv ||
       UserI->getOpcode() == Instruction::UDiv) &&
      UserI->getOperand(1) != FoldedValue)
    return false; // Not commutative, only RHS foldable.
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
  // extension was made of the load.
  unsigned LoadOrTruncBits =
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
  switch (UserI->getOpcode()) {
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
  case Instruction::Sub:
  case Instruction::ICmp:
    if (LoadedBits == 32 && ZExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
    if (UserI->getOpcode() != Instruction::ICmp) {
      if (LoadedBits == 16 &&
          (SExtBits == 32 ||
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
        return true;
      if (LoadOrTruncBits == 16)
        return true;
    }
    LLVM_FALLTHROUGH;
  case Instruction::SDiv: // SE: 32->64
    if (LoadedBits == 32 && SExtBits == 64)
      return true;
    LLVM_FALLTHROUGH;
  case Instruction::UDiv:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // This also makes sense for float operations, but disabled for now due
    // to regressions.
    // case Instruction::FCmp:
    // case Instruction::FAdd:
    // case Instruction::FSub:
    // case Instruction::FMul:
    // case Instruction::FDiv:

    // All possible extensions of memory checked above.

    // Comparison between memory and immediate.
    if (UserI->getOpcode() == Instruction::ICmp)
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
        if (isUInt<16>(CI->getZExtValue()))
          return true;
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
  }

  return false;
}

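// Return true if V is a call to the llvm.bswap intrinsic.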
static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}

int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment, unsigned AddressSpace,
                                    const Instruction *I) {
  assert(!Src->isVoidTy() && "Invalid type");

  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
    // Store the load or its truncated or extended value in FoldedValue.
    const Instruction *FoldedValue = nullptr;
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");

      // UserI can't fold two loads, so in that case return 0 cost only
      // half of the time.
      for (unsigned i = 0; i < 2; ++i) {
        if (UserI->getOperand(i) == FoldedValue)
          continue;

        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
          if (!OtherLoad &&
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
               isa<ZExtInst>(OtherOp)))
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
            return i == 0; // Both operands foldable.
        }
      }

      return 0; // Only I is foldable in user.
    }
  }

  unsigned NumOps =
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));

  // Store/Load reversed saves one instruction.
  if (!Src->isVectorTy() && NumOps == 1 && I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  if (Src->getScalarSizeInBits() == 128)
    // 128 bit scalars are held in a pair of two 64 bit registers.
    NumOps *= 2;

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                               unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
                                               unsigned AddressSpace,
                                               bool UseMaskForCond,
                                               bool UseMaskForGaps) {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Return the ceiling of dividing A by B.
  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

  unsigned NumElts = VecTy->getVectorNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors will be loaded in total, and in how
    // many of them each value will be in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate that each loaded source vector containing this Index
      // requires one operation, except that vperm can handle two input
      // registers first time for each dst vector.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // dst vector for vperm (S.A.).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

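// Return a SystemZ-specific cost for a vector intrinsic, or a negative value
// if the generic implementation should be used instead.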
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM
  return -1;
}

int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                          ArrayRef<Value *> Args,
                                          FastMathFlags FMF, unsigned VF) {
  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
}

int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                          ArrayRef<Type *> Tys,
                                          FastMathFlags FMF,
                                          unsigned ScalarizationCostPassed) {
  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
                                      FMF, ScalarizationCostPassed);
}