//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;
// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
/// \return true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}
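
// Reduce a vector to pieces of at most 64 bits each, keeping the element type.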
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}
// Increase the number of vector elements to reach the next multiple of 32-bit
// registers.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
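
// Compute the register-friendly type to bitcast to: a single scalar for sizes
// up to 32 bits, otherwise a vector of 32-bit elements.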
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32)
    return LLT::scalar(Size);

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}
static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}
static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}
static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
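
// Check whether the hardware can directly perform a load/store of this
// register type and memory type for the given address space and alignment.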
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will workaround this for
// now by scalarizing.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}
static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
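
// All of the target's legality rules are declared up front in this
// constructor; anything marked custom is handled by legalizeCustom() below.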
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);
  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);
  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();
  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();
  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);
  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));
  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();
  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);
  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();
  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);
  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);
  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);
  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);
  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);
  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
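
  // Decide whether a load/store must be broken into multiple operations
  // because of address-space size limits, odd sizes, or insufficient
  // alignment.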
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS.
  // TODO: Unsupported flat for SI.
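
  // Build the load/store rules. The same rule set is reused for G_LOAD and
  // G_STORE, with a small difference for stores (no memory-widening custom
  // rule).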
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bit.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));
    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));
    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }
    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(
                    0, LLT::fixed_vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }
  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bit.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));
  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // instructions?
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }

      return false;
    };
    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);
    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }
    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();
  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }
  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .scalarize(0)
      .minScalar(0, S32)
      .lower();
  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);
  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();
  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}
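
// Dispatch for operations marked with a .custom*() action above.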
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
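
// Return a 32-bit value holding the high half (aperture) of the flat address
// range for the given LDS/private address space, either read from the
// aperture hardware registers or loaded from the dispatch queue pointer.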
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // block.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
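
// Round-to-nearest-even for f64 without a native instruction: adding and then
// subtracting copysign(2^52, x) shifts the fraction bits out of the double, so
// Tmp2 holds the rounded value. Inputs with |x| greater than
// 0x1.fffffffffffffp+51 are already integral and are returned unchanged.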
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}
static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
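
// f64 trunc without a native instruction: use the (unbiased) exponent from the
// high half to mask off the fraction bits that lie below the binary point.
// Exponents below zero produce +/-0 (just the sign bit); exponents above 51
// mean the value is already an integer and is returned as-is.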
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
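
// [SU]ITOFP for a 64-bit integer source. For an f64 result, convert the two
// halves separately and combine them as ldexp(hi, 32) + lo. For an f32 result,
// normalize the input so the significant bits fit in 32 bits, convert, and
// scale the result back with ldexp.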
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  if (IdxVal < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  if (IdxVal < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
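
// Custom load legalization: casts 32-bit constant-address pointers to 64-bit
// constant pointers, and widens loads whose memory type is not a power of two
// (when the alignment allows it), truncating or extracting the original value
// from the wider load result.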
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
                   .addUse(Log.getReg(0))
                   .addUse(Src1)
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}
// Find a source register, ignoring any possible source modifiers.
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  assert(MRI.getType(Src0) == LLT::scalar(16));

  auto Merge = B.buildMerge(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
                                             ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}
bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}
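
// 32-bit unsigned division/remainder: start from a float reciprocal estimate
// of the divisor, apply one refinement step in integer arithmetic, then
// correct the quotient/remainder with up to two conditional adjustments.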
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
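
// 64-bit unsigned division/remainder built on the reciprocal estimate above:
// refine the reciprocal with two add-with-carry rounds, form the quotient
// estimate with a 64x64 high multiply, and then apply up to two correction
// steps expressed as selects rather than control flow.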
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
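
// Fast, inaccurate fdiv lowering used when unsafe FP math (or the afn flag) is
// set: 1/x and -1/x become a single rcp, and x/y becomes x * rcp(y).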
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
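
// Full-precision f32 fdiv lowering: scale the operands with div_scale, refine
// a rcp estimate with a chain of FMAs (temporarily enabling FP32 denormals if
// they are flushed by default), and produce the final result with div_fmas and
// div_fixup.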
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

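// Legalize llvm.amdgcn.fdiv.fast: when the denominator's magnitude is very
// large, pre-scale it so v_rcp_f32 stays accurate, multiply the numerator by
// the reciprocal, and apply the same scale factor to the final result.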
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
    .addUse(Src)
    .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq already quieted (or not), so use the variant which will directly
  // select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

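// Map a DS FP atomic intrinsic ID to the generic atomic opcode used for
// selection.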
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}

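// Compute the implicit argument pointer as a fixed offset from the preloaded
// kernarg segment pointer.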
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

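// Legalize llvm.amdgcn.is.shared / llvm.amdgcn.is.private by comparing the high
// half of the flat pointer against the aperture register for the address space.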
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_pair(BaseReg, ImmOffset);
}

/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
                                          Register VOffset, Register SOffset,
                                          unsigned ImmOffset, Register VIndex,
                                          MachineRegisterInfo &MRI) const {
  Optional<ValueAndVReg> MaybeVOffsetVal =
      getConstantVRegValWithLookThrough(VOffset, MRI);
  Optional<ValueAndVReg> MaybeSOffsetVal =
      getConstantVRegValWithLookThrough(SOffset, MRI);
  Optional<ValueAndVReg> MaybeVIndexVal =
      getConstantVRegValWithLookThrough(VIndex, MRI);
  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
  // update the MMO with that offset. The stride is unknown so we can only do
  // this if VIndex is constant 0.
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
    MMO->setOffset(TotalOffset);
  } else {
    // We don't have a constant combined offset to use in the MMO. Give up.
    MMO->setValue((Value *)nullptr);
  }
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  return Reg;
}

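// Widen or repack an illegal buffer store source type: i8/i16 scalars are
// any-extended to i32, and <= 4 element s16 vectors used by format stores are
// repacked for the subtarget's D16 register layout via handleD16VData.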
Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

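// Legalize raw/struct buffer store intrinsics into the corresponding
// G_AMDGPU_BUFFER_STORE* pseudo, splitting the offset into voffset/immoffset
// and normalizing the vindex operand.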
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

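// Legalize raw/struct buffer load intrinsics into G_AMDGPU_BUFFER_LOAD*
// pseudos. Sub-dword and unpacked d16 results are loaded into a wider register
// and then truncated or repacked back into the original result type.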
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result for an extending load was widened to 32 bits; truncate back
    // to the original type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

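// Lower llvm.amdgcn.atomic.inc/dec to the corresponding target atomic pseudo.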
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

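// Legalize raw/struct buffer atomic intrinsics into the matching
// G_AMDGPU_BUFFER_ATOMIC_* pseudo, handling the optional return value, the
// extra compare operand for cmpswap, and the voffset/immoffset split.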
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addUse(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
    const ConstantFP *ConstantLod;

    if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
                 m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                      Intr->Dim);

        // The starting indexes should remain in the same place.
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  // For packed D16 results with TFE enabled, all the data components are
  // S32. Cast back to the expected type.
  //
  // TODO: We don't really need to use load s32 elements. We would only need one
  // cast for the TFE result if a multiple of v2s16 was used.
  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
    for (Register &Reg : ResultRegs)
      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
  } else if (ST.hasUnpackedD16VMem()) {
    for (Register &Reg : ResultRegs)
      Reg = B.buildTrunc(S16, Reg).getReg(0);
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

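// Lower llvm.trap according to the trap handler ABI in use: either end the
// program, or emit S_TRAP (passing the queue pointer when required).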
// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (*HsaAbiVer) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrapHsaQueuePtr(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
      return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) :
          legalizeTrapHsaQueuePtr(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning instead of emitting the trap.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

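// Legalize llvm.amdgcn.image.bvh.intersect.ray by repacking the node pointer,
// ray extent, ray origin/direction and inverse direction into the flat operand
// list expected by G_AMDGPU_INTRIN_BVH_INTERSECT_RAY, merging them into a
// single vector operand when the NSA encoding is not used.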
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (Is64) {
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

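// Main intrinsic legalization dispatch: handles the control flow intrinsics
// inserted by the structurizer and forwards the remaining target intrinsics to
// their dedicated legalizers.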
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(MI.getOperand(2).getReg())
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(MI.getOperand(2).getReg())
          .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);