1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 /// This file implements the targeting of the Machinelegalizer class for
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPULegalizerInfo.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
37 #define DEBUG_TYPE "amdgpu-legalinfo"
40 using namespace LegalizeActions
;
41 using namespace LegalizeMutations
;
42 using namespace LegalityPredicates
;
43 using namespace MIPatternMatch
;
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt
<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
53 static constexpr unsigned MaxRegisterSize
= 1024;
55 // Round the number of elements to the next power of two elements
56 static LLT
getPow2VectorType(LLT Ty
) {
57 unsigned NElts
= Ty
.getNumElements();
58 unsigned Pow2NElts
= 1 << Log2_32_Ceil(NElts
);
59 return Ty
.changeElementCount(ElementCount::getFixed(Pow2NElts
));
62 // Round the number of bits to the next power of two bits
63 static LLT
getPow2ScalarType(LLT Ty
) {
64 unsigned Bits
= Ty
.getSizeInBits();
65 unsigned Pow2Bits
= 1 << Log2_32_Ceil(Bits
);
66 return LLT::scalar(Pow2Bits
);
69 /// \returns true if this is an odd sized vector which should widen by adding an
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71 /// excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate
isSmallOddVector(unsigned TypeIdx
) {
73 return [=](const LegalityQuery
&Query
) {
74 const LLT Ty
= Query
.Types
[TypeIdx
];
78 const LLT EltTy
= Ty
.getElementType();
79 const unsigned EltSize
= EltTy
.getSizeInBits();
80 return Ty
.getNumElements() % 2 != 0 &&
81 EltSize
> 1 && EltSize
< 32 &&
82 Ty
.getSizeInBits() % 32 != 0;
86 static LegalityPredicate
sizeIsMultipleOf32(unsigned TypeIdx
) {
87 return [=](const LegalityQuery
&Query
) {
88 const LLT Ty
= Query
.Types
[TypeIdx
];
89 return Ty
.getSizeInBits() % 32 == 0;
93 static LegalityPredicate
isWideVec16(unsigned TypeIdx
) {
94 return [=](const LegalityQuery
&Query
) {
95 const LLT Ty
= Query
.Types
[TypeIdx
];
96 const LLT EltTy
= Ty
.getScalarType();
97 return EltTy
.getSizeInBits() == 16 && Ty
.getNumElements() > 2;
101 static LegalizeMutation
oneMoreElement(unsigned TypeIdx
) {
102 return [=](const LegalityQuery
&Query
) {
103 const LLT Ty
= Query
.Types
[TypeIdx
];
104 const LLT EltTy
= Ty
.getElementType();
105 return std::pair(TypeIdx
,
106 LLT::fixed_vector(Ty
.getNumElements() + 1, EltTy
));
110 static LegalizeMutation
fewerEltsToSize64Vector(unsigned TypeIdx
) {
111 return [=](const LegalityQuery
&Query
) {
112 const LLT Ty
= Query
.Types
[TypeIdx
];
113 const LLT EltTy
= Ty
.getElementType();
114 unsigned Size
= Ty
.getSizeInBits();
115 unsigned Pieces
= (Size
+ 63) / 64;
116 unsigned NewNumElts
= (Ty
.getNumElements() + 1) / Pieces
;
117 return std::pair(TypeIdx
, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts
), EltTy
));
122 // Increase the number of vector elements to reach the next multiple of 32-bit
124 static LegalizeMutation
moreEltsToNext32Bit(unsigned TypeIdx
) {
125 return [=](const LegalityQuery
&Query
) {
126 const LLT Ty
= Query
.Types
[TypeIdx
];
128 const LLT EltTy
= Ty
.getElementType();
129 const int Size
= Ty
.getSizeInBits();
130 const int EltSize
= EltTy
.getSizeInBits();
131 const int NextMul32
= (Size
+ 31) / 32;
133 assert(EltSize
< 32);
135 const int NewNumElts
= (32 * NextMul32
+ EltSize
- 1) / EltSize
;
136 return std::pair(TypeIdx
, LLT::fixed_vector(NewNumElts
, EltTy
));
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation
moreElementsToNextExistingRegClass(unsigned TypeIdx
) {
142 return [=](const LegalityQuery
&Query
) {
143 const LLT Ty
= Query
.Types
[TypeIdx
];
144 const unsigned NumElts
= Ty
.getNumElements();
145 const unsigned EltSize
= Ty
.getElementType().getSizeInBits();
146 const unsigned MaxNumElts
= MaxRegisterSize
/ EltSize
;
148 assert(EltSize
== 32 || EltSize
== 64);
149 assert(Ty
.getSizeInBits() < MaxRegisterSize
);
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts
= NumElts
; NewNumElts
< MaxNumElts
; ++NewNumElts
) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts
* EltSize
))
158 return std::pair(TypeIdx
, LLT::fixed_vector(NewNumElts
, EltSize
));
162 static LLT
getBufferRsrcScalarType(const LLT Ty
) {
164 return LLT::scalar(128);
165 const ElementCount NumElems
= Ty
.getElementCount();
166 return LLT::vector(NumElems
, LLT::scalar(128));
169 static LLT
getBufferRsrcRegisterType(const LLT Ty
) {
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems
= Ty
.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems
* 4, LLT::scalar(32));
176 static LLT
getBitcastRegisterType(const LLT Ty
) {
177 const unsigned Size
= Ty
.getSizeInBits();
182 return LLT::scalar(Size
);
185 return LLT::scalarOrVector(ElementCount::getFixed(Size
/ 32), 32);
188 static LegalizeMutation
bitcastToRegisterType(unsigned TypeIdx
) {
189 return [=](const LegalityQuery
&Query
) {
190 const LLT Ty
= Query
.Types
[TypeIdx
];
191 return std::pair(TypeIdx
, getBitcastRegisterType(Ty
));
195 static LegalizeMutation
bitcastToVectorElement32(unsigned TypeIdx
) {
196 return [=](const LegalityQuery
&Query
) {
197 const LLT Ty
= Query
.Types
[TypeIdx
];
198 unsigned Size
= Ty
.getSizeInBits();
199 assert(Size
% 32 == 0);
201 TypeIdx
, LLT::scalarOrVector(ElementCount::getFixed(Size
/ 32), 32));
205 static LegalityPredicate
vectorSmallerThan(unsigned TypeIdx
, unsigned Size
) {
206 return [=](const LegalityQuery
&Query
) {
207 const LLT QueryTy
= Query
.Types
[TypeIdx
];
208 return QueryTy
.isVector() && QueryTy
.getSizeInBits() < Size
;
212 static LegalityPredicate
vectorWiderThan(unsigned TypeIdx
, unsigned Size
) {
213 return [=](const LegalityQuery
&Query
) {
214 const LLT QueryTy
= Query
.Types
[TypeIdx
];
215 return QueryTy
.isVector() && QueryTy
.getSizeInBits() > Size
;
219 static LegalityPredicate
numElementsNotEven(unsigned TypeIdx
) {
220 return [=](const LegalityQuery
&Query
) {
221 const LLT QueryTy
= Query
.Types
[TypeIdx
];
222 return QueryTy
.isVector() && QueryTy
.getNumElements() % 2 != 0;
226 static bool isRegisterSize(unsigned Size
) {
227 return Size
% 32 == 0 && Size
<= MaxRegisterSize
;
230 static bool isRegisterVectorElementType(LLT EltTy
) {
231 const int EltSize
= EltTy
.getSizeInBits();
232 return EltSize
== 16 || EltSize
% 32 == 0;
235 static bool isRegisterVectorType(LLT Ty
) {
236 const int EltSize
= Ty
.getElementType().getSizeInBits();
237 return EltSize
== 32 || EltSize
== 64 ||
238 (EltSize
== 16 && Ty
.getNumElements() % 2 == 0) ||
239 EltSize
== 128 || EltSize
== 256;
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty
) {
244 if (!isRegisterSize(Ty
.getSizeInBits()))
248 return isRegisterVectorType(Ty
);
253 // Any combination of 32 or 64-bit elements up the maximum register size, and
254 // multiples of v2s16.
255 static LegalityPredicate
isRegisterType(unsigned TypeIdx
) {
256 return [=](const LegalityQuery
&Query
) {
257 return isRegisterType(Query
.Types
[TypeIdx
]);
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
264 static LegalityPredicate
isIllegalRegisterType(unsigned TypeIdx
) {
265 return [=](const LegalityQuery
&Query
) {
266 LLT Ty
= Query
.Types
[TypeIdx
];
267 return isRegisterType(Ty
) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty
.getSizeInBits());
272 static LegalityPredicate
elementTypeIsLegal(unsigned TypeIdx
) {
273 return [=](const LegalityQuery
&Query
) {
274 const LLT QueryTy
= Query
.Types
[TypeIdx
];
275 if (!QueryTy
.isVector())
277 const LLT EltTy
= QueryTy
.getElementType();
278 return EltTy
== LLT::scalar(16) || EltTy
.getSizeInBits() >= 32;
282 static const LLT S1
= LLT::scalar(1);
283 static const LLT S8
= LLT::scalar(8);
284 static const LLT S16
= LLT::scalar(16);
285 static const LLT S32
= LLT::scalar(32);
286 static const LLT F32
= LLT::float32();
287 static const LLT S64
= LLT::scalar(64);
288 static const LLT F64
= LLT::float64();
289 static const LLT S96
= LLT::scalar(96);
290 static const LLT S128
= LLT::scalar(128);
291 static const LLT S160
= LLT::scalar(160);
292 static const LLT S224
= LLT::scalar(224);
293 static const LLT S256
= LLT::scalar(256);
294 static const LLT S512
= LLT::scalar(512);
295 static const LLT MaxScalar
= LLT::scalar(MaxRegisterSize
);
297 static const LLT V2S8
= LLT::fixed_vector(2, 8);
298 static const LLT V2S16
= LLT::fixed_vector(2, 16);
299 static const LLT V4S16
= LLT::fixed_vector(4, 16);
300 static const LLT V6S16
= LLT::fixed_vector(6, 16);
301 static const LLT V8S16
= LLT::fixed_vector(8, 16);
302 static const LLT V10S16
= LLT::fixed_vector(10, 16);
303 static const LLT V12S16
= LLT::fixed_vector(12, 16);
304 static const LLT V16S16
= LLT::fixed_vector(16, 16);
306 static const LLT V2F16
= LLT::fixed_vector(2, LLT::float16());
307 static const LLT V2BF16
= V2F16
; // FIXME
309 static const LLT V2S32
= LLT::fixed_vector(2, 32);
310 static const LLT V3S32
= LLT::fixed_vector(3, 32);
311 static const LLT V4S32
= LLT::fixed_vector(4, 32);
312 static const LLT V5S32
= LLT::fixed_vector(5, 32);
313 static const LLT V6S32
= LLT::fixed_vector(6, 32);
314 static const LLT V7S32
= LLT::fixed_vector(7, 32);
315 static const LLT V8S32
= LLT::fixed_vector(8, 32);
316 static const LLT V9S32
= LLT::fixed_vector(9, 32);
317 static const LLT V10S32
= LLT::fixed_vector(10, 32);
318 static const LLT V11S32
= LLT::fixed_vector(11, 32);
319 static const LLT V12S32
= LLT::fixed_vector(12, 32);
320 static const LLT V16S32
= LLT::fixed_vector(16, 32);
321 static const LLT V32S32
= LLT::fixed_vector(32, 32);
323 static const LLT V2S64
= LLT::fixed_vector(2, 64);
324 static const LLT V3S64
= LLT::fixed_vector(3, 64);
325 static const LLT V4S64
= LLT::fixed_vector(4, 64);
326 static const LLT V5S64
= LLT::fixed_vector(5, 64);
327 static const LLT V6S64
= LLT::fixed_vector(6, 64);
328 static const LLT V7S64
= LLT::fixed_vector(7, 64);
329 static const LLT V8S64
= LLT::fixed_vector(8, 64);
330 static const LLT V16S64
= LLT::fixed_vector(16, 64);
332 static const LLT V2S128
= LLT::fixed_vector(2, 128);
333 static const LLT V4S128
= LLT::fixed_vector(4, 128);
335 static std::initializer_list
<LLT
> AllScalarTypes
= {S32
, S64
, S96
, S128
,
336 S160
, S224
, S256
, S512
};
338 static std::initializer_list
<LLT
> AllS16Vectors
{
339 V2S16
, V4S16
, V6S16
, V8S16
, V10S16
, V12S16
, V16S16
, V2S128
, V4S128
};
341 static std::initializer_list
<LLT
> AllS32Vectors
= {
342 V2S32
, V3S32
, V4S32
, V5S32
, V6S32
, V7S32
, V8S32
,
343 V9S32
, V10S32
, V11S32
, V12S32
, V16S32
, V32S32
};
345 static std::initializer_list
<LLT
> AllS64Vectors
= {V2S64
, V3S64
, V4S64
, V5S64
,
346 V6S64
, V7S64
, V8S64
, V16S64
};
348 // Checks whether a type is in the list of legal register types.
349 static bool isRegisterClassType(LLT Ty
) {
350 if (Ty
.isPointerOrPointerVector())
351 Ty
= Ty
.changeElementType(LLT::scalar(Ty
.getScalarSizeInBits()));
353 return is_contained(AllS32Vectors
, Ty
) || is_contained(AllS64Vectors
, Ty
) ||
354 is_contained(AllScalarTypes
, Ty
) || is_contained(AllS16Vectors
, Ty
);
357 static LegalityPredicate
isRegisterClassType(unsigned TypeIdx
) {
358 return [TypeIdx
](const LegalityQuery
&Query
) {
359 return isRegisterClassType(Query
.Types
[TypeIdx
]);
363 // If we have a truncating store or an extending load with a data size larger
364 // than 32-bits, we need to reduce to a 32-bit type.
365 static LegalityPredicate
isWideScalarExtLoadTruncStore(unsigned TypeIdx
) {
366 return [=](const LegalityQuery
&Query
) {
367 const LLT Ty
= Query
.Types
[TypeIdx
];
368 return !Ty
.isVector() && Ty
.getSizeInBits() > 32 &&
369 Query
.MMODescrs
[0].MemoryTy
.getSizeInBits() < Ty
.getSizeInBits();
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374 // handle some operations by just promoting the register during
375 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
376 static unsigned maxSizeForAddrSpace(const GCNSubtarget
&ST
, unsigned AS
,
377 bool IsLoad
, bool IsAtomic
) {
379 case AMDGPUAS::PRIVATE_ADDRESS
:
380 // FIXME: Private element size.
381 return ST
.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS
:
383 return ST
.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS
:
385 case AMDGPUAS::CONSTANT_ADDRESS
:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT
:
387 case AMDGPUAS::BUFFER_RESOURCE
:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
394 return IsLoad
? 512 : 128;
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST
.hasMultiDwordFlatScratchAddressing() || IsAtomic
? 128 : 32;
403 static bool isLoadStoreSizeLegal(const GCNSubtarget
&ST
,
404 const LegalityQuery
&Query
) {
405 const LLT Ty
= Query
.Types
[0];
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad
= Query
.Opcode
!= AMDGPU::G_STORE
;
410 unsigned RegSize
= Ty
.getSizeInBits();
411 uint64_t MemSize
= Query
.MMODescrs
[0].MemoryTy
.getSizeInBits();
412 uint64_t AlignBits
= Query
.MMODescrs
[0].AlignInBits
;
413 unsigned AS
= Query
.Types
[1].getAddressSpace();
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
)
419 // Do not handle extending vector loads.
420 if (Ty
.isVector() && MemSize
!= RegSize
)
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
426 // Accept widening loads based on alignment.
427 if (IsLoad
&& MemSize
< Size
)
428 MemSize
= std::max(MemSize
, Align
);
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize
!= RegSize
&& RegSize
!= 32)
435 if (MemSize
> maxSizeForAddrSpace(ST
, AS
, IsLoad
,
436 Query
.MMODescrs
[0].Ordering
!=
437 AtomicOrdering::NotAtomic
))
448 if (!ST
.hasDwordx3LoadStores())
453 // These may contextually need to be broken down.
459 assert(RegSize
>= MemSize
);
461 if (AlignBits
< MemSize
) {
462 const SITargetLowering
*TLI
= ST
.getTargetLowering();
463 if (!TLI
->allowsMisalignedMemoryAccessesImpl(MemSize
, AS
,
464 Align(AlignBits
/ 8)))
471 // The newer buffer intrinsic forms take their resource arguments as
472 // pointers in address space 8, aka s128 values. However, in order to not break
473 // SelectionDAG, the underlying operations have to continue to take v4i32
474 // arguments. Therefore, we convert resource pointers - or vectors of them
475 // to integer values here.
476 static bool hasBufferRsrcWorkaround(const LLT Ty
) {
477 if (Ty
.isPointer() && Ty
.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE
)
480 const LLT ElemTy
= Ty
.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy
);
486 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
487 // workaround this. Eventually it should ignore the type for loads and only care
488 // about the size. Return true in cases where we will workaround this for now by
490 static bool loadStoreBitcastWorkaround(const LLT Ty
) {
491 if (EnableNewLegality
)
494 const unsigned Size
= Ty
.getSizeInBits();
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty
))
503 if (Ty
.isPointerVector())
506 unsigned EltSize
= Ty
.getScalarSizeInBits();
507 return EltSize
!= 32 && EltSize
!= 64;
510 static bool isLoadStoreLegal(const GCNSubtarget
&ST
, const LegalityQuery
&Query
) {
511 const LLT Ty
= Query
.Types
[0];
512 return isRegisterType(Ty
) && isLoadStoreSizeLegal(ST
, Query
) &&
513 !hasBufferRsrcWorkaround(Ty
) && !loadStoreBitcastWorkaround(Ty
);
516 /// Return true if a load or store of the type should be lowered with a bitcast
517 /// to a different type.
518 static bool shouldBitcastLoadStoreType(const GCNSubtarget
&ST
, const LLT Ty
,
520 const unsigned MemSizeInBits
= MemTy
.getSizeInBits();
521 const unsigned Size
= Ty
.getSizeInBits();
522 if (Size
!= MemSizeInBits
)
523 return Size
<= 32 && Ty
.isVector();
525 if (loadStoreBitcastWorkaround(Ty
) && isRegisterType(Ty
))
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty
.isVector() && (!MemTy
.isVector() || MemTy
== Ty
) &&
530 (Size
<= 32 || isRegisterSize(Size
)) &&
531 !isRegisterVectorElementType(Ty
.getElementType());
534 /// Return true if we should legalize a load by widening an odd sized memory
535 /// access up to the alignment. Note this case when the memory access itself
536 /// changes, not the size of the result register.
537 static bool shouldWidenLoad(const GCNSubtarget
&ST
, LLT MemoryTy
,
538 uint64_t AlignInBits
, unsigned AddrSpace
,
540 unsigned SizeInBits
= MemoryTy
.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits
))
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits
== 96 && ST
.hasDwordx3LoadStores())
551 if (SizeInBits
>= maxSizeForAddrSpace(ST
, AddrSpace
, Opcode
, false))
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize
= NextPowerOf2(SizeInBits
);
559 if (AlignInBits
< RoundedSize
)
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering
*TLI
= ST
.getTargetLowering();
565 return TLI
->allowsMisalignedMemoryAccessesImpl(
566 RoundedSize
, AddrSpace
, Align(AlignInBits
/ 8),
567 MachineMemOperand::MOLoad
, &Fast
) &&
571 static bool shouldWidenLoad(const GCNSubtarget
&ST
, const LegalityQuery
&Query
,
573 if (Query
.MMODescrs
[0].Ordering
!= AtomicOrdering::NotAtomic
)
576 return shouldWidenLoad(ST
, Query
.MMODescrs
[0].MemoryTy
,
577 Query
.MMODescrs
[0].AlignInBits
,
578 Query
.Types
[1].getAddressSpace(), Opcode
);
581 /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
582 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584 static LLT
castBufferRsrcFromV4I32(MachineInstr
&MI
, MachineIRBuilder
&B
,
585 MachineRegisterInfo
&MRI
, unsigned Idx
) {
586 MachineOperand
&MO
= MI
.getOperand(Idx
);
588 const LLT PointerTy
= MRI
.getType(MO
.getReg());
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(PointerTy
))
594 const LLT ScalarTy
= getBufferRsrcScalarType(PointerTy
);
595 const LLT VectorTy
= getBufferRsrcRegisterType(PointerTy
);
596 if (!PointerTy
.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts
= PointerTy
.getSizeInBits() / 32;
599 const LLT S32
= LLT::scalar(32);
601 Register VectorReg
= MRI
.createGenericVirtualRegister(VectorTy
);
602 std::array
<Register
, 4> VectorElems
;
603 B
.setInsertPt(B
.getMBB(), ++B
.getInsertPt());
604 for (unsigned I
= 0; I
< NumParts
; ++I
)
606 B
.buildExtractVectorElementConstant(S32
, VectorReg
, I
).getReg(0);
607 B
.buildMergeValues(MO
, VectorElems
);
608 MO
.setReg(VectorReg
);
611 Register BitcastReg
= MRI
.createGenericVirtualRegister(VectorTy
);
612 B
.setInsertPt(B
.getMBB(), ++B
.getInsertPt());
613 auto Scalar
= B
.buildBitcast(ScalarTy
, BitcastReg
);
614 B
.buildIntToPtr(MO
, Scalar
);
615 MO
.setReg(BitcastReg
);
620 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621 /// the form in which the value must be in order to be passed to the low-level
622 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623 /// needed in order to account for the fact that we can't define a register
624 /// class for s128 without breaking SelectionDAG.
625 static Register
castBufferRsrcToV4I32(Register Pointer
, MachineIRBuilder
&B
) {
626 MachineRegisterInfo
&MRI
= *B
.getMRI();
627 const LLT PointerTy
= MRI
.getType(Pointer
);
628 const LLT ScalarTy
= getBufferRsrcScalarType(PointerTy
);
629 const LLT VectorTy
= getBufferRsrcRegisterType(PointerTy
);
631 if (!PointerTy
.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector
<Register
, 4> PointerParts
;
634 const unsigned NumParts
= PointerTy
.getSizeInBits() / 32;
635 auto Unmerged
= B
.buildUnmerge(LLT::scalar(32), Pointer
);
636 for (unsigned I
= 0; I
< NumParts
; ++I
)
637 PointerParts
.push_back(Unmerged
.getReg(I
));
638 return B
.buildBuildVector(VectorTy
, PointerParts
).getReg(0);
640 Register Scalar
= B
.buildPtrToInt(ScalarTy
, Pointer
).getReg(0);
641 return B
.buildBitcast(VectorTy
, Scalar
).getReg(0);
644 static void castBufferRsrcArgToV4I32(MachineInstr
&MI
, MachineIRBuilder
&B
,
646 MachineOperand
&MO
= MI
.getOperand(Idx
);
648 const LLT PointerTy
= B
.getMRI()->getType(MO
.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(PointerTy
))
652 MO
.setReg(castBufferRsrcToV4I32(MO
.getReg(), B
));
655 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget
&ST_
,
656 const GCNTargetMachine
&TM
)
658 using namespace TargetOpcode
;
660 auto GetAddrSpacePtr
= [&TM
](unsigned AS
) {
661 return LLT::pointer(AS
, TM
.getPointerSizeInBits(AS
));
664 const LLT GlobalPtr
= GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS
);
665 const LLT ConstantPtr
= GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS
);
666 const LLT Constant32Ptr
= GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT
);
667 const LLT LocalPtr
= GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS
);
668 const LLT RegionPtr
= GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS
);
669 const LLT FlatPtr
= GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS
);
670 const LLT PrivatePtr
= GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS
);
671 const LLT BufferFatPtr
= GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER
);
672 const LLT RsrcPtr
= GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE
);
673 const LLT BufferStridedPtr
=
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER
);
676 const LLT CodePtr
= FlatPtr
;
678 const std::initializer_list
<LLT
> AddrSpaces64
= {
679 GlobalPtr
, ConstantPtr
, FlatPtr
682 const std::initializer_list
<LLT
> AddrSpaces32
= {
683 LocalPtr
, PrivatePtr
, Constant32Ptr
, RegionPtr
686 const std::initializer_list
<LLT
> AddrSpaces128
= {RsrcPtr
};
688 const std::initializer_list
<LLT
> FPTypesBase
= {
692 const std::initializer_list
<LLT
> FPTypes16
= {
696 const std::initializer_list
<LLT
> FPTypesPK16
= {
700 const LLT MinScalarFPTy
= ST
.has16BitInsts() ? S16
: S32
;
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(G_BRCOND
).legalFor({S1
, S32
});
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(G_PHI
)
708 .legalFor({S32
, S64
, V2S16
, S16
, V4S16
, S1
, S128
, S256
})
709 .legalFor(AllS32Vectors
)
710 .legalFor(AllS64Vectors
)
711 .legalFor(AddrSpaces64
)
712 .legalFor(AddrSpaces32
)
713 .legalFor(AddrSpaces128
)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16
, S256
)
716 .widenScalarToNextPow2(0, 32)
717 .clampMaxNumElements(0, S32
, 16)
718 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
721 if (ST
.hasVOP3PInsts() && ST
.hasAddNoCarry() && ST
.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST
.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
725 .legalFor({S64
, S32
, S16
, V2S16
})
726 .clampMaxNumElementsStrict(0, S16
, 2)
729 .widenScalarToNextMultipleOf(0, 32)
732 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
733 .legalFor({S32
, S16
, V2S16
})
734 .clampMaxNumElementsStrict(0, S16
, 2)
737 .widenScalarToNextMultipleOf(0, 32)
741 if (ST
.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(G_MUL
)
743 .legalFor({S64
, S32
, S16
, V2S16
})
744 .clampMaxNumElementsStrict(0, S16
, 2)
747 .widenScalarToNextMultipleOf(0, 32)
750 getActionDefinitionsBuilder(G_MUL
)
751 .legalFor({S32
, S16
, V2S16
})
752 .clampMaxNumElementsStrict(0, S16
, 2)
755 .widenScalarToNextMultipleOf(0, 32)
758 assert(ST
.hasMad64_32());
760 getActionDefinitionsBuilder({G_UADDSAT
, G_USUBSAT
, G_SADDSAT
, G_SSUBSAT
})
761 .legalFor({S32
, S16
, V2S16
}) // Clamp modifier
762 .minScalarOrElt(0, S16
)
763 .clampMaxNumElementsStrict(0, S16
, 2)
765 .widenScalarToNextPow2(0, 32)
767 } else if (ST
.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
769 .legalFor({S32
, S16
})
771 .widenScalarToNextMultipleOf(0, 32)
775 getActionDefinitionsBuilder(G_MUL
)
776 .legalFor({S32
, S16
})
779 .widenScalarToNextMultipleOf(0, 32)
781 assert(ST
.hasMad64_32());
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT
, G_USUBSAT
})
786 .legalFor({S32
, S16
}) // Clamp modifier
789 .widenScalarToNextPow2(0, 16)
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT
, G_SSUBSAT
})
799 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32
, S32
)
805 auto &Mul
= getActionDefinitionsBuilder(G_MUL
)
809 .widenScalarToNextMultipleOf(0, 32);
811 if (ST
.hasMad64_32())
814 Mul
.maxScalar(0, S32
);
816 if (ST
.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT
, G_USUBSAT
})
818 .legalFor({S32
}) // Clamp modifier.
820 .minScalarOrElt(0, S32
)
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT
, G_USUBSAT
})
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT
, G_SSUBSAT
})
838 getActionDefinitionsBuilder(
839 {G_SDIV
, G_UDIV
, G_SREM
, G_UREM
, G_SDIVREM
, G_UDIVREM
})
840 .customFor({S32
, S64
})
841 .clampScalar(0, S32
, S64
)
842 .widenScalarToNextPow2(0, 32)
845 auto &Mulh
= getActionDefinitionsBuilder({G_UMULH
, G_SMULH
})
849 if (ST
.hasVOP3PInsts()) {
851 .clampMaxNumElements(0, S8
, 2)
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND
, G_OR
, G_XOR
})
862 .legalFor({S32
, S1
, S64
, V2S32
, S16
, V2S16
, V4S16
})
863 .clampScalar(0, S32
, S64
)
864 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
865 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
866 .widenScalarToNextPow2(0)
869 getActionDefinitionsBuilder(
870 {G_UADDO
, G_USUBO
, G_UADDE
, G_SADDE
, G_USUBE
, G_SSUBE
})
871 .legalFor({{S32
, S1
}, {S32
, S32
}})
872 .clampScalar(0, S32
, S32
)
875 getActionDefinitionsBuilder(G_BITCAST
)
876 // Don't worry about the size constraint.
877 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
880 getActionDefinitionsBuilder(G_CONSTANT
)
881 .legalFor({S1
, S32
, S64
, S16
, GlobalPtr
,
882 LocalPtr
, ConstantPtr
, PrivatePtr
, FlatPtr
})
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32
, S64
)
885 .widenScalarToNextPow2(0);
887 getActionDefinitionsBuilder(G_FCONSTANT
)
888 .legalFor({S32
, S64
, S16
})
889 .clampScalar(0, S16
, S64
);
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF
, G_FREEZE
})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897 .clampScalarOrElt(0, S32
, MaxScalar
)
898 .widenScalarToNextPow2(0, 32)
899 .clampMaxNumElements(0, S32
, 16);
901 getActionDefinitionsBuilder(G_FRAME_INDEX
).legalFor({PrivatePtr
});
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC
)
906 .legalFor({{PrivatePtr
, S32
}});
908 getActionDefinitionsBuilder(G_STACKSAVE
)
909 .customFor({PrivatePtr
});
910 getActionDefinitionsBuilder(G_STACKRESTORE
)
911 .legalFor({PrivatePtr
});
913 getActionDefinitionsBuilder({G_GET_FPENV
, G_SET_FPENV
}).customFor({S64
});
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE
)
916 .customIf(typeIsNot(0, PrivatePtr
));
918 getActionDefinitionsBuilder(G_BLOCK_ADDR
).legalFor({CodePtr
});
920 auto &FPOpActions
= getActionDefinitionsBuilder(
921 { G_FADD
, G_FMUL
, G_FMA
, G_FCANONICALIZE
,
922 G_STRICT_FADD
, G_STRICT_FMUL
, G_STRICT_FMA
})
923 .legalFor({S32
, S64
});
924 auto &TrigActions
= getActionDefinitionsBuilder({G_FSIN
, G_FCOS
})
925 .customFor({S32
, S64
});
926 auto &FDIVActions
= getActionDefinitionsBuilder(G_FDIV
)
927 .customFor({S32
, S64
});
929 if (ST
.has16BitInsts()) {
930 if (ST
.hasVOP3PInsts())
931 FPOpActions
.legalFor({S16
, V2S16
});
933 FPOpActions
.legalFor({S16
});
935 TrigActions
.customFor({S16
});
936 FDIVActions
.customFor({S16
});
939 if (ST
.hasPackedFP32Ops()) {
940 FPOpActions
.legalFor({V2S32
});
941 FPOpActions
.clampMaxNumElementsStrict(0, S32
, 2);
944 auto &MinNumMaxNum
= getActionDefinitionsBuilder({
945 G_FMINNUM
, G_FMAXNUM
, G_FMINNUM_IEEE
, G_FMAXNUM_IEEE
});
947 if (ST
.hasVOP3PInsts()) {
948 MinNumMaxNum
.customFor(FPTypesPK16
)
949 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
950 .clampMaxNumElements(0, S16
, 2)
951 .clampScalar(0, S16
, S64
)
953 } else if (ST
.has16BitInsts()) {
954 MinNumMaxNum
.customFor(FPTypes16
)
955 .clampScalar(0, S16
, S64
)
958 MinNumMaxNum
.customFor(FPTypesBase
)
959 .clampScalar(0, S32
, S64
)
963 if (ST
.hasVOP3PInsts())
964 FPOpActions
.clampMaxNumElementsStrict(0, S16
, 2);
968 .clampScalar(0, ST
.has16BitInsts() ? S16
: S32
, S64
);
972 .clampScalar(0, ST
.has16BitInsts() ? S16
: S32
, S64
);
976 .clampScalar(0, ST
.has16BitInsts() ? S16
: S32
, S64
);
978 getActionDefinitionsBuilder({G_FNEG
, G_FABS
})
979 .legalFor(FPTypesPK16
)
980 .clampMaxNumElementsStrict(0, S16
, 2)
982 .clampScalar(0, S16
, S64
);
984 if (ST
.has16BitInsts()) {
985 getActionDefinitionsBuilder(G_FSQRT
)
987 .customFor({S32
, S64
})
990 getActionDefinitionsBuilder(G_FFLOOR
)
991 .legalFor({S32
, S64
, S16
})
993 .clampScalar(0, S16
, S64
);
995 getActionDefinitionsBuilder({G_FLDEXP
, G_STRICT_FLDEXP
})
996 .legalFor({{S32
, S32
}, {S64
, S32
}, {S16
, S16
}})
998 .maxScalarIf(typeIs(0, S16
), 1, S16
)
999 .clampScalar(1, S32
, S32
)
1002 getActionDefinitionsBuilder(G_FFREXP
)
1003 .customFor({{S32
, S32
}, {S64
, S32
}, {S16
, S16
}, {S16
, S32
}})
1007 getActionDefinitionsBuilder(G_FSQRT
)
1008 .customFor({S32
, S64
, S16
})
1013 if (ST
.hasFractBug()) {
1014 getActionDefinitionsBuilder(G_FFLOOR
)
1016 .legalFor({S32
, S64
})
1018 .clampScalar(0, S32
, S64
);
1020 getActionDefinitionsBuilder(G_FFLOOR
)
1021 .legalFor({S32
, S64
})
1023 .clampScalar(0, S32
, S64
);
1026 getActionDefinitionsBuilder({G_FLDEXP
, G_STRICT_FLDEXP
})
1027 .legalFor({{S32
, S32
}, {S64
, S32
}})
1029 .clampScalar(0, S32
, S64
)
1030 .clampScalar(1, S32
, S32
)
1033 getActionDefinitionsBuilder(G_FFREXP
)
1034 .customFor({{S32
, S32
}, {S64
, S32
}})
1037 .clampScalar(1, S32
, S32
)
1041 getActionDefinitionsBuilder(G_FPTRUNC
)
1042 .legalFor({{S32
, S64
}, {S16
, S32
}})
1046 getActionDefinitionsBuilder(G_FPEXT
)
1047 .legalFor({{S64
, S32
}, {S32
, S16
}})
1048 .narrowScalarFor({{S64
, S16
}}, changeTo(0, S32
))
1051 auto &FSubActions
= getActionDefinitionsBuilder({G_FSUB
, G_STRICT_FSUB
});
1052 if (ST
.has16BitInsts()) {
1054 // Use actual fsub instruction
1055 .legalFor({S32
, S16
})
1056 // Must use fadd + fneg
1057 .lowerFor({S64
, V2S16
});
1060 // Use actual fsub instruction
1062 // Must use fadd + fneg
1063 .lowerFor({S64
, S16
, V2S16
});
1068 .clampScalar(0, S32
, S64
);
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad
= getActionDefinitionsBuilder(G_FMAD
);
1072 if (ST
.hasMadF16() && ST
.hasMadMacF32Insts())
1073 FMad
.customFor({S32
, S16
});
1074 else if (ST
.hasMadMacF32Insts())
1075 FMad
.customFor({S32
});
1076 else if (ST
.hasMadF16())
1077 FMad
.customFor({S16
});
1081 auto &FRem
= getActionDefinitionsBuilder(G_FREM
);
1082 if (ST
.has16BitInsts()) {
1083 FRem
.customFor({S16
, S32
, S64
});
1085 FRem
.minScalar(0, S32
)
1086 .customFor({S32
, S64
});
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(G_TRUNC
)
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16
, V2S32
}})
1094 .clampMaxNumElements(0, S16
, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1098 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1101 getActionDefinitionsBuilder({G_SEXT
, G_ZEXT
, G_ANYEXT
})
1102 .legalFor({{S64
, S32
}, {S32
, S16
}, {S64
, S16
},
1103 {S32
, S1
}, {S64
, S1
}, {S16
, S1
}})
1105 .clampScalar(0, S32
, S64
)
1106 .widenScalarToNextPow2(1, 32);
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP
= getActionDefinitionsBuilder({G_SITOFP
, G_UITOFP
})
1110 .legalFor({{S32
, S32
}, {S64
, S32
}, {S16
, S32
}})
1111 .lowerIf(typeIs(1, S1
))
1112 .customFor({{S32
, S64
}, {S64
, S64
}});
1113 if (ST
.has16BitInsts())
1114 IToFP
.legalFor({{S16
, S16
}});
1115 IToFP
.clampScalar(1, S32
, S64
)
1118 .widenScalarToNextPow2(1);
1120 auto &FPToI
= getActionDefinitionsBuilder({G_FPTOSI
, G_FPTOUI
})
1121 .legalFor({{S32
, S32
}, {S32
, S64
}, {S32
, S16
}})
1122 .customFor({{S64
, S32
}, {S64
, S64
}})
1123 .narrowScalarFor({{S64
, S16
}}, changeTo(0, S32
));
1124 if (ST
.has16BitInsts())
1125 FPToI
.legalFor({{S16
, S16
}});
1127 FPToI
.minScalar(1, S32
);
1129 FPToI
.minScalar(0, S32
)
1130 .widenScalarToNextPow2(0, 32)
1134 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND
)
1135 .customFor({S16
, S32
})
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder({G_INTRINSIC_ROUND
, G_FRINT
, G_FNEARBYINT
})
1144 if (ST
.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 {G_INTRINSIC_TRUNC
, G_FCEIL
, G_INTRINSIC_ROUNDEVEN
})
1147 .legalFor({S16
, S32
, S64
})
1148 .clampScalar(0, S16
, S64
)
1150 } else if (ST
.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS
) {
1151 getActionDefinitionsBuilder(
1152 {G_INTRINSIC_TRUNC
, G_FCEIL
, G_INTRINSIC_ROUNDEVEN
})
1153 .legalFor({S32
, S64
})
1154 .clampScalar(0, S32
, S64
)
1157 getActionDefinitionsBuilder(
1158 {G_INTRINSIC_TRUNC
, G_FCEIL
, G_INTRINSIC_ROUNDEVEN
})
1161 .clampScalar(0, S32
, S64
)
1165 getActionDefinitionsBuilder(G_PTR_ADD
)
1166 .unsupportedFor({BufferFatPtr
, BufferStridedPtr
, RsrcPtr
})
1167 .legalIf(all(isPointer(0), sameSize(0, 1)))
1169 .scalarSameSizeAs(1, 0);
1171 getActionDefinitionsBuilder(G_PTRMASK
)
1172 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64
, S32
})))
1173 .scalarSameSizeAs(1, 0)
1177 getActionDefinitionsBuilder(G_ICMP
)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1188 .legalForCartesianProduct(
1189 {S1
}, {S32
, S64
, GlobalPtr
, LocalPtr
, ConstantPtr
, PrivatePtr
, FlatPtr
})
1190 .legalForCartesianProduct(
1191 {S32
}, {S32
, S64
, GlobalPtr
, LocalPtr
, ConstantPtr
, PrivatePtr
, FlatPtr
});
1192 if (ST
.has16BitInsts()) {
1193 CmpBuilder
.legalFor({{S1
, S16
}});
1197 .widenScalarToNextPow2(1)
1198 .clampScalar(1, S32
, S64
)
1200 .legalIf(all(typeInSet(0, {S1
, S32
}), isPointer(1)));
1203 getActionDefinitionsBuilder(G_FCMP
).legalForCartesianProduct(
1204 {S1
}, ST
.has16BitInsts() ? FPTypes16
: FPTypesBase
);
1206 if (ST
.hasSALUFloatInsts())
1207 FCmpBuilder
.legalForCartesianProduct({S32
}, {S16
, S32
});
1210 .widenScalarToNextPow2(1)
1211 .clampScalar(1, S32
, S64
)
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps
= getActionDefinitionsBuilder(G_FPOW
);
1216 if (ST
.has16BitInsts())
1217 ExpOps
.customFor({{S32
}, {S16
}});
1219 ExpOps
.customFor({S32
});
1220 ExpOps
.clampScalar(0, MinScalarFPTy
, S32
)
1223 getActionDefinitionsBuilder(G_FPOWI
)
1224 .clampScalar(0, MinScalarFPTy
, S32
)
1227 auto &Log2Ops
= getActionDefinitionsBuilder({G_FLOG2
, G_FEXP2
});
1228 Log2Ops
.customFor({S32
});
1229 if (ST
.has16BitInsts())
1230 Log2Ops
.legalFor({S16
});
1232 Log2Ops
.customFor({S16
});
1233 Log2Ops
.scalarize(0)
1237 getActionDefinitionsBuilder({G_FLOG
, G_FLOG10
, G_FEXP
, G_FEXP10
});
1238 LogOps
.customFor({S32
, S16
});
1239 LogOps
.clampScalar(0, MinScalarFPTy
, S32
)
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(G_CTPOP
)
1244 .legalFor({{S32
, S32
}, {S32
, S64
}})
1245 .clampScalar(0, S32
, S32
)
1246 .widenScalarToNextPow2(1, 32)
1247 .clampScalar(1, S32
, S64
)
1249 .widenScalarToNextPow2(0, 32);
1251 // If no 16 bit instr is available, lower into different instructions.
1252 if (ST
.has16BitInsts())
1253 getActionDefinitionsBuilder(G_IS_FPCLASS
)
1254 .legalForCartesianProduct({S1
}, FPTypes16
)
1255 .widenScalarToNextPow2(1)
1259 getActionDefinitionsBuilder(G_IS_FPCLASS
)
1260 .legalForCartesianProduct({S1
}, FPTypesBase
)
1261 .lowerFor({S1
, S16
})
1262 .widenScalarToNextPow2(1)
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1269 getActionDefinitionsBuilder({G_CTLZ
, G_CTTZ
})
1271 .clampScalar(0, S32
, S32
)
1272 .clampScalar(1, S32
, S64
)
1273 .widenScalarToNextPow2(0, 32)
1274 .widenScalarToNextPow2(1, 32)
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF
)
1279 .legalFor({{S32
, S32
}, {S32
, S64
}})
1280 .customIf(scalarNarrowerThan(1, 32))
1281 .clampScalar(0, S32
, S32
)
1282 .clampScalar(1, S32
, S64
)
1284 .widenScalarToNextPow2(0, 32)
1285 .widenScalarToNextPow2(1, 32);
1287 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF
)
1288 .legalFor({{S32
, S32
}, {S32
, S64
}})
1289 .clampScalar(0, S32
, S32
)
1290 .clampScalar(1, S32
, S64
)
1292 .widenScalarToNextPow2(0, 32)
1293 .widenScalarToNextPow2(1, 32);
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1297 getActionDefinitionsBuilder(G_BITREVERSE
)
1298 .legalFor({S32
, S64
})
1299 .clampScalar(0, S32
, S64
)
1301 .widenScalarToNextPow2(0);
1303 if (ST
.has16BitInsts()) {
1304 getActionDefinitionsBuilder(G_BSWAP
)
1305 .legalFor({S16
, S32
, V2S16
})
1306 .clampMaxNumElementsStrict(0, S16
, 2)
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1308 // narrowScalar limitation.
1309 .widenScalarToNextPow2(0)
1310 .clampScalar(0, S16
, S32
)
1313 if (ST
.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder({G_SMIN
, G_SMAX
, G_UMIN
, G_UMAX
, G_ABS
})
1315 .legalFor({S32
, S16
, V2S16
})
1316 .clampMaxNumElements(0, S16
, 2)
1318 .widenScalarToNextPow2(0)
1322 getActionDefinitionsBuilder({G_SMIN
, G_SMAX
, G_UMIN
, G_UMAX
, G_ABS
})
1323 .legalFor({S32
, S16
})
1324 .widenScalarToNextPow2(0)
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(G_BSWAP
)
1333 .lowerIf(scalarNarrowerThan(0, 32))
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1335 // narrowScalar limitation.
1336 .widenScalarToNextPow2(0)
1341 getActionDefinitionsBuilder({G_SMIN
, G_SMAX
, G_UMIN
, G_UMAX
, G_ABS
})
1344 .widenScalarToNextPow2(0)
1349 getActionDefinitionsBuilder(G_INTTOPTR
)
1350 // List the common cases
1351 .legalForCartesianProduct(AddrSpaces64
, {S64
})
1352 .legalForCartesianProduct(AddrSpaces32
, {S32
})
1354 // Accept any address space as long as the size matches
1355 .legalIf(sameSize(0, 1))
1356 .widenScalarIf(smallerThan(1, 0),
1357 [](const LegalityQuery
&Query
) {
1359 1, LLT::scalar(Query
.Types
[0].getSizeInBits()));
1361 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery
&Query
) {
1362 return std::pair(1, LLT::scalar(Query
.Types
[0].getSizeInBits()));
1365 getActionDefinitionsBuilder(G_PTRTOINT
)
1366 // List the common cases
1367 .legalForCartesianProduct(AddrSpaces64
, {S64
})
1368 .legalForCartesianProduct(AddrSpaces32
, {S32
})
1370 // Accept any address space as long as the size matches
1371 .legalIf(sameSize(0, 1))
1372 .widenScalarIf(smallerThan(0, 1),
1373 [](const LegalityQuery
&Query
) {
1375 0, LLT::scalar(Query
.Types
[1].getSizeInBits()));
1377 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery
&Query
) {
1378 return std::pair(0, LLT::scalar(Query
.Types
[1].getSizeInBits()));
1381 getActionDefinitionsBuilder(G_ADDRSPACE_CAST
)
1385 const auto needToSplitMemOp
= [=](const LegalityQuery
&Query
,
1386 bool IsLoad
) -> bool {
1387 const LLT DstTy
= Query
.Types
[0];
1389 // Split vector extloads.
1390 unsigned MemSize
= Query
.MMODescrs
[0].MemoryTy
.getSizeInBits();
1392 if (DstTy
.isVector() && DstTy
.getSizeInBits() > MemSize
)
1395 const LLT PtrTy
= Query
.Types
[1];
1396 unsigned AS
= PtrTy
.getAddressSpace();
1397 if (MemSize
> maxSizeForAddrSpace(ST
, AS
, IsLoad
,
1398 Query
.MMODescrs
[0].Ordering
!=
1399 AtomicOrdering::NotAtomic
))
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs
= (MemSize
+ 31) / 32;
1406 if (!ST
.hasDwordx3LoadStores())
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(NumRegs
))
1417 unsigned GlobalAlign32
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1423 // TODO: Unsupported flat for SI.
1425 for (unsigned Op
: {G_LOAD
, G_STORE
}) {
1426 const bool IsStore
= Op
== G_STORE
;
1428 auto &Actions
= getActionDefinitionsBuilder(Op
);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
1431 Actions
.legalForTypesWithMemDesc({{S32
, GlobalPtr
, S32
, GlobalAlign32
},
1432 {V2S32
, GlobalPtr
, V2S32
, GlobalAlign32
},
1433 {V4S32
, GlobalPtr
, V4S32
, GlobalAlign32
},
1434 {S64
, GlobalPtr
, S64
, GlobalAlign32
},
1435 {V2S64
, GlobalPtr
, V2S64
, GlobalAlign32
},
1436 {V2S16
, GlobalPtr
, V2S16
, GlobalAlign32
},
1437 {S32
, GlobalPtr
, S8
, GlobalAlign8
},
1438 {S32
, GlobalPtr
, S16
, GlobalAlign16
},
1440 {S32
, LocalPtr
, S32
, 32},
1441 {S64
, LocalPtr
, S64
, 32},
1442 {V2S32
, LocalPtr
, V2S32
, 32},
1443 {S32
, LocalPtr
, S8
, 8},
1444 {S32
, LocalPtr
, S16
, 16},
1445 {V2S16
, LocalPtr
, S32
, 32},
1447 {S32
, PrivatePtr
, S32
, 32},
1448 {S32
, PrivatePtr
, S8
, 8},
1449 {S32
, PrivatePtr
, S16
, 16},
1450 {V2S16
, PrivatePtr
, S32
, 32},
1452 {S32
, ConstantPtr
, S32
, GlobalAlign32
},
1453 {V2S32
, ConstantPtr
, V2S32
, GlobalAlign32
},
1454 {V4S32
, ConstantPtr
, V4S32
, GlobalAlign32
},
1455 {S64
, ConstantPtr
, S64
, GlobalAlign32
},
1456 {V2S32
, ConstantPtr
, V2S32
, GlobalAlign32
}});
1458 [=](const LegalityQuery
&Query
) -> bool {
1459 return isLoadStoreLegal(ST
, Query
);
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions
.unsupportedIf(
1466 typeInSet(1, {BufferFatPtr
, BufferStridedPtr
, RsrcPtr
}));
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions
.customIf([=](const LegalityQuery
&Query
) -> bool {
1472 return hasBufferRsrcWorkaround(Query
.Types
[0]);
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions
.customIf(typeIs(1, Constant32Ptr
));
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1489 [=](const LegalityQuery
&Query
) -> bool {
1490 return shouldBitcastLoadStoreType(ST
, Query
.Types
[0],
1491 Query
.MMODescrs
[0].MemoryTy
);
1492 }, bitcastToRegisterType(0));
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions
.customIf([=](const LegalityQuery
&Query
) -> bool {
1498 return shouldWidenLoad(ST
, Query
, G_LOAD
);
1502 // FIXME: load/store narrowing should be moved to lower action
1505 [=](const LegalityQuery
&Query
) -> bool {
1506 return !Query
.Types
[0].isVector() &&
1507 needToSplitMemOp(Query
, Op
== G_LOAD
);
1509 [=](const LegalityQuery
&Query
) -> std::pair
<unsigned, LLT
> {
1510 const LLT DstTy
= Query
.Types
[0];
1511 const LLT PtrTy
= Query
.Types
[1];
1513 const unsigned DstSize
= DstTy
.getSizeInBits();
1514 unsigned MemSize
= Query
.MMODescrs
[0].MemoryTy
.getSizeInBits();
1517 if (DstSize
> MemSize
)
1518 return std::pair(0, LLT::scalar(MemSize
));
1520 unsigned MaxSize
= maxSizeForAddrSpace(
1521 ST
, PtrTy
.getAddressSpace(), Op
== G_LOAD
,
1522 Query
.MMODescrs
[0].Ordering
!= AtomicOrdering::NotAtomic
);
1523 if (MemSize
> MaxSize
)
1524 return std::pair(0, LLT::scalar(MaxSize
));
1526 uint64_t Align
= Query
.MMODescrs
[0].AlignInBits
;
1527 return std::pair(0, LLT::scalar(Align
));
1530 [=](const LegalityQuery
&Query
) -> bool {
1531 return Query
.Types
[0].isVector() &&
1532 needToSplitMemOp(Query
, Op
== G_LOAD
);
1534 [=](const LegalityQuery
&Query
) -> std::pair
<unsigned, LLT
> {
1535 const LLT DstTy
= Query
.Types
[0];
1536 const LLT PtrTy
= Query
.Types
[1];
1538 LLT EltTy
= DstTy
.getElementType();
1539 unsigned MaxSize
= maxSizeForAddrSpace(
1540 ST
, PtrTy
.getAddressSpace(), Op
== G_LOAD
,
1541 Query
.MMODescrs
[0].Ordering
!= AtomicOrdering::NotAtomic
);
1543 // FIXME: Handle widened to power of 2 results better. This ends
1545 // FIXME: 3 element stores scalarized on SI
1547 // Split if it's too large for the address space.
1548 unsigned MemSize
= Query
.MMODescrs
[0].MemoryTy
.getSizeInBits();
1549 if (MemSize
> MaxSize
) {
1550 unsigned NumElts
= DstTy
.getNumElements();
1551 unsigned EltSize
= EltTy
.getSizeInBits();
1553 if (MaxSize
% EltSize
== 0) {
1555 0, LLT::scalarOrVector(
1556 ElementCount::getFixed(MaxSize
/ EltSize
), EltTy
));
1559 unsigned NumPieces
= MemSize
/ MaxSize
;
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces
== 1 || NumPieces
>= NumElts
||
1564 NumElts
% NumPieces
!= 0)
1565 return std::pair(0, EltTy
);
1568 LLT::fixed_vector(NumElts
/ NumPieces
, EltTy
));
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy
.getSizeInBits() > MemSize
)
1573 return std::pair(0, EltTy
);
1575 unsigned EltSize
= EltTy
.getSizeInBits();
1576 unsigned DstSize
= DstTy
.getSizeInBits();
1577 if (!isPowerOf2_32(DstSize
)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize
= llvm::bit_floor(DstSize
);
1583 0, LLT::scalarOrVector(
1584 ElementCount::getFixed(FloorSize
/ EltSize
), EltTy
));
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy
);
1591 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32
))
1592 .widenScalarToNextPow2(0)
1593 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads
= getActionDefinitionsBuilder({G_SEXTLOAD
, G_ZEXTLOAD
})
1599 .legalForTypesWithMemDesc({{S32
, GlobalPtr
, S8
, 8},
1600 {S32
, GlobalPtr
, S16
, 2 * 8},
1601 {S32
, LocalPtr
, S8
, 8},
1602 {S32
, LocalPtr
, S16
, 16},
1603 {S32
, PrivatePtr
, S8
, 8},
1604 {S32
, PrivatePtr
, S16
, 16},
1605 {S32
, ConstantPtr
, S8
, 8},
1606 {S32
, ConstantPtr
, S16
, 2 * 8}})
1608 [=](const LegalityQuery
&Query
) -> bool {
1609 return isLoadStoreLegal(ST
, Query
);
1612 if (ST
.hasFlatAddressSpace()) {
1613 ExtLoads
.legalForTypesWithMemDesc(
1614 {{S32
, FlatPtr
, S8
, 8}, {S32
, FlatPtr
, S16
, 16}});
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads
.customIf(typeIs(1, Constant32Ptr
));
1624 ExtLoads
.clampScalar(0, S32
, S32
)
1625 .widenScalarToNextPow2(0)
1628 auto &Atomics
= getActionDefinitionsBuilder(
1629 {G_ATOMICRMW_XCHG
, G_ATOMICRMW_ADD
, G_ATOMICRMW_SUB
,
1630 G_ATOMICRMW_AND
, G_ATOMICRMW_OR
, G_ATOMICRMW_XOR
,
1631 G_ATOMICRMW_MAX
, G_ATOMICRMW_MIN
, G_ATOMICRMW_UMAX
,
1632 G_ATOMICRMW_UMIN
, G_ATOMICRMW_UINC_WRAP
, G_ATOMICRMW_UDEC_WRAP
})
1633 .legalFor({{S32
, GlobalPtr
}, {S32
, LocalPtr
},
1634 {S64
, GlobalPtr
}, {S64
, LocalPtr
},
1635 {S32
, RegionPtr
}, {S64
, RegionPtr
}});
1636 if (ST
.hasFlatAddressSpace()) {
1637 Atomics
.legalFor({{S32
, FlatPtr
}, {S64
, FlatPtr
}});
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic
= getActionDefinitionsBuilder(G_ATOMICRMW_FADD
);
1642 if (ST
.hasLDSFPAtomicAddF32()) {
1643 Atomic
.legalFor({{S32
, LocalPtr
}, {S32
, RegionPtr
}});
1644 if (ST
.hasLdsAtomicAddF64())
1645 Atomic
.legalFor({{S64
, LocalPtr
}});
1646 if (ST
.hasAtomicDsPkAdd16Insts())
1647 Atomic
.legalFor({{V2F16
, LocalPtr
}, {V2BF16
, LocalPtr
}});
1649 if (ST
.hasAtomicFaddInsts())
1650 Atomic
.legalFor({{S32
, GlobalPtr
}});
1651 if (ST
.hasFlatAtomicFaddF32Inst())
1652 Atomic
.legalFor({{S32
, FlatPtr
}});
1654 if (ST
.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656 // the IR in most situations
1657 // TODO: Move atomic expansion into legalizer
1665 if (ST
.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST
.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic
.legalFor({{V2F16
, GlobalPtr
}, {V2F16
, BufferFatPtr
}});
1668 if (ST
.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic
.legalFor({{V2BF16
, GlobalPtr
}});
1670 if (ST
.hasAtomicFlatPkAdd16Insts())
1671 Atomic
.legalFor({{V2F16
, FlatPtr
}, {V2BF16
, FlatPtr
}});
1674 // Most of the legalization work here is done by AtomicExpand. We could
1675 // probably use a simpler legality rule that just assumes anything is OK.
1676 auto &AtomicFMinFMax
=
1677 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN
, G_ATOMICRMW_FMAX
})
1678 .legalFor({{F32
, LocalPtr
}, {F64
, LocalPtr
}});
1680 if (ST
.hasAtomicFMinFMaxF32GlobalInsts())
1681 AtomicFMinFMax
.legalFor({{F32
, GlobalPtr
},{F32
, BufferFatPtr
}});
1682 if (ST
.hasAtomicFMinFMaxF64GlobalInsts())
1683 AtomicFMinFMax
.legalFor({{F64
, GlobalPtr
}, {F64
, BufferFatPtr
}});
1684 if (ST
.hasAtomicFMinFMaxF32FlatInsts())
1685 AtomicFMinFMax
.legalFor({F32
, FlatPtr
});
1686 if (ST
.hasAtomicFMinFMaxF64FlatInsts())
1687 AtomicFMinFMax
.legalFor({F64
, FlatPtr
});
1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1691 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG
)
1692 .customFor({{S32
, GlobalPtr
}, {S64
, GlobalPtr
},
1693 {S32
, FlatPtr
}, {S64
, FlatPtr
}})
1694 .legalFor({{S32
, LocalPtr
}, {S64
, LocalPtr
},
1695 {S32
, RegionPtr
}, {S64
, RegionPtr
}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1698 // Condition should be s32 for scalar, s1 for vector.
1699 getActionDefinitionsBuilder(G_SELECT
)
1700 .legalForCartesianProduct({S32
, S64
, S16
, V2S32
, V2S16
, V4S16
, GlobalPtr
,
1701 LocalPtr
, FlatPtr
, PrivatePtr
,
1702 LLT::fixed_vector(2, LocalPtr
),
1703 LLT::fixed_vector(2, PrivatePtr
)},
1705 .clampScalar(0, S16
, S64
)
1707 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1708 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1709 .clampMaxNumElements(0, S32
, 2)
1710 .clampMaxNumElements(0, LocalPtr
, 2)
1711 .clampMaxNumElements(0, PrivatePtr
, 2)
1713 .widenScalarToNextPow2(0)
1714 .legalIf(all(isPointer(0), typeInSet(1, {S1
, S32
})));
1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717 // be more flexible with the shift amount type.
1718 auto &Shifts
= getActionDefinitionsBuilder({G_SHL
, G_LSHR
, G_ASHR
})
1719 .legalFor({{S32
, S32
}, {S64
, S32
}});
1720 if (ST
.has16BitInsts()) {
1721 if (ST
.hasVOP3PInsts()) {
1722 Shifts
.legalFor({{S16
, S16
}, {V2S16
, V2S16
}})
1723 .clampMaxNumElements(0, S16
, 2);
1725 Shifts
.legalFor({{S16
, S16
}});
1727 // TODO: Support 16-bit shift amounts for all types
1728 Shifts
.widenScalarIf(
1729 [=](const LegalityQuery
&Query
) {
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1732 const LLT ValTy
= Query
.Types
[0];
1733 const LLT AmountTy
= Query
.Types
[1];
1734 return ValTy
.getSizeInBits() <= 16 &&
1735 AmountTy
.getSizeInBits() < 16;
1736 }, changeTo(1, S16
));
1737 Shifts
.maxScalarIf(typeIs(0, S16
), 1, S16
);
1738 Shifts
.clampScalar(1, S32
, S32
);
1739 Shifts
.widenScalarToNextPow2(0, 16);
1740 Shifts
.clampScalar(0, S16
, S64
);
1742 getActionDefinitionsBuilder({G_SSHLSAT
, G_USHLSAT
})
1747 // Make sure we legalize the shift amount type first, as the general
1748 // expansion for the shifted type will produce much worse code if it hasn't
1749 // been truncated already.
1750 Shifts
.clampScalar(1, S32
, S32
);
1751 Shifts
.widenScalarToNextPow2(0, 32);
1752 Shifts
.clampScalar(0, S32
, S64
);
1754 getActionDefinitionsBuilder({G_SSHLSAT
, G_USHLSAT
})
1759 Shifts
.scalarize(0);
1761 for (unsigned Op
: {G_EXTRACT_VECTOR_ELT
, G_INSERT_VECTOR_ELT
}) {
1762 unsigned VecTypeIdx
= Op
== G_EXTRACT_VECTOR_ELT
? 1 : 0;
1763 unsigned EltTypeIdx
= Op
== G_EXTRACT_VECTOR_ELT
? 0 : 1;
1764 unsigned IdxTypeIdx
= 2;
1766 getActionDefinitionsBuilder(Op
)
1767 .customIf([=](const LegalityQuery
&Query
) {
1768 const LLT EltTy
= Query
.Types
[EltTypeIdx
];
1769 const LLT VecTy
= Query
.Types
[VecTypeIdx
];
1770 const LLT IdxTy
= Query
.Types
[IdxTypeIdx
];
1771 const unsigned EltSize
= EltTy
.getSizeInBits();
1772 const bool isLegalVecType
=
1773 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy
.getSizeInBits());
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1775 // below will try to bitcast them to 2N x s64, which will fail.
1776 // Therefore, as an intermediate step, wrap extracts/insertions from a
1777 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1778 // extraction result) in order to produce a vector operation that can
1779 // be handled by the logic below.
1780 if (EltTy
.isPointer() && EltSize
> 64)
1782 return (EltSize
== 32 || EltSize
== 64) &&
1783 VecTy
.getSizeInBits() % 32 == 0 &&
1784 VecTy
.getSizeInBits() <= MaxRegisterSize
&&
1785 IdxTy
.getSizeInBits() == 32 &&
1788 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx
), scalarOrEltNarrowerThan(VecTypeIdx
, 32)),
1789 bitcastToVectorElement32(VecTypeIdx
))
1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1792 all(sizeIsMultipleOf32(VecTypeIdx
), scalarOrEltWiderThan(VecTypeIdx
, 64)),
1793 [=](const LegalityQuery
&Query
) {
1794 // For > 64-bit element types, try to turn this into a 64-bit
1795 // element vector since we may be able to do better indexing
1796 // if this is scalar. If not, fall back to 32.
1797 const LLT EltTy
= Query
.Types
[EltTypeIdx
];
1798 const LLT VecTy
= Query
.Types
[VecTypeIdx
];
1799 const unsigned DstEltSize
= EltTy
.getSizeInBits();
1800 const unsigned VecSize
= VecTy
.getSizeInBits();
1802 const unsigned TargetEltSize
= DstEltSize
% 64 == 0 ? 64 : 32;
1805 LLT::fixed_vector(VecSize
/ TargetEltSize
, TargetEltSize
));
1807 .clampScalar(EltTypeIdx
, S32
, S64
)
1808 .clampScalar(VecTypeIdx
, S32
, S64
)
1809 .clampScalar(IdxTypeIdx
, S32
, S32
)
1810 .clampMaxNumElements(VecTypeIdx
, S32
, 32)
1811 // TODO: Clamp elements for 64-bit vectors?
1813 isIllegalRegisterType(VecTypeIdx
),
1814 moreElementsToNextExistingRegClass(VecTypeIdx
))
1815 // It should only be necessary with variable indexes.
1816 // As a last resort, lower to the stack
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
1826 for (unsigned Op
: {G_EXTRACT
, G_INSERT
}) {
1827 unsigned BigTyIdx
= Op
== G_EXTRACT
? 1 : 0;
1828 unsigned LitTyIdx
= Op
== G_EXTRACT
? 0 : 1;
1830 // FIXME: Doesn't handle extract of illegal sizes.
1831 getActionDefinitionsBuilder(Op
)
1832 .lowerIf(all(typeIs(LitTyIdx
, S16
), sizeIs(BigTyIdx
, 32)))
1833 .lowerIf([=](const LegalityQuery
&Query
) {
1834 // Sub-vector(or single element) insert and extract.
1835 // TODO: verify immediate offset here since lower only works with
1837 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1838 return BigTy
.isVector();
1840 // FIXME: Multiples of 16 should not be legal.
1841 .legalIf([=](const LegalityQuery
&Query
) {
1842 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1843 const LLT LitTy
= Query
.Types
[LitTyIdx
];
1844 return (BigTy
.getSizeInBits() % 32 == 0) &&
1845 (LitTy
.getSizeInBits() % 16 == 0);
1848 [=](const LegalityQuery
&Query
) {
1849 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1850 return (BigTy
.getScalarSizeInBits() < 16);
1852 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx
, 16))
1854 [=](const LegalityQuery
&Query
) {
1855 const LLT LitTy
= Query
.Types
[LitTyIdx
];
1856 return (LitTy
.getScalarSizeInBits() < 16);
1858 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx
, 16))
1859 .moreElementsIf(isSmallOddVector(BigTyIdx
), oneMoreElement(BigTyIdx
))
1860 .widenScalarToNextPow2(BigTyIdx
, 32);
1864 auto &BuildVector
= getActionDefinitionsBuilder(G_BUILD_VECTOR
)
1865 .legalForCartesianProduct(AllS32Vectors
, {S32
})
1866 .legalForCartesianProduct(AllS64Vectors
, {S64
})
1867 .clampNumElements(0, V16S32
, V32S32
)
1868 .clampNumElements(0, V2S64
, V16S64
)
1869 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16
))
1871 isIllegalRegisterType(0),
1872 moreElementsToNextExistingRegClass(0));
1874 if (ST
.hasScalarPackInsts()) {
1876 // FIXME: Should probably widen s1 vectors straight to s32
1877 .minScalarOrElt(0, S16
)
1880 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC
)
1881 .legalFor({V2S16
, S32
})
1884 BuildVector
.customFor({V2S16
, S16
});
1885 BuildVector
.minScalarOrElt(0, S32
);
1887 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC
)
1888 .customFor({V2S16
, S32
})
1892 BuildVector
.legalIf(isRegisterType(0));
  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1904 for (unsigned Op
: {G_MERGE_VALUES
, G_UNMERGE_VALUES
}) {
1905 unsigned BigTyIdx
= Op
== G_MERGE_VALUES
? 0 : 1;
1906 unsigned LitTyIdx
= Op
== G_MERGE_VALUES
? 1 : 0;
1908 auto notValidElt
= [=](const LegalityQuery
&Query
, unsigned TypeIdx
) {
1909 const LLT Ty
= Query
.Types
[TypeIdx
];
1910 if (Ty
.isVector()) {
1911 const LLT
&EltTy
= Ty
.getElementType();
1912 if (EltTy
.getSizeInBits() < 8 || EltTy
.getSizeInBits() > 512)
1914 if (!llvm::has_single_bit
<uint32_t>(EltTy
.getSizeInBits()))
1920 auto &Builder
= getActionDefinitionsBuilder(Op
)
1921 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1922 .lowerFor({{S16
, V2S16
}})
1923 .lowerIf([=](const LegalityQuery
&Query
) {
1924 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1925 return BigTy
.getSizeInBits() == 32;
1927 // Try to widen to s16 first for small types.
1928 // TODO: Only do this on targets with legal s16 shifts
1929 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx
, 16), LitTyIdx
, S16
)
1930 .widenScalarToNextPow2(LitTyIdx
, /*Min*/ 16)
1931 .moreElementsIf(isSmallOddVector(BigTyIdx
), oneMoreElement(BigTyIdx
))
1932 .fewerElementsIf(all(typeIs(0, S16
), vectorWiderThan(1, 32),
1933 elementTypeIs(1, S16
)),
1935 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1938 .clampScalar(LitTyIdx
, S32
, S512
)
1939 .widenScalarToNextPow2(LitTyIdx
, /*Min*/ 32)
1940 // Break up vectors with weird elements into scalars
1942 [=](const LegalityQuery
&Query
) { return notValidElt(Query
, LitTyIdx
); },
1945 [=](const LegalityQuery
&Query
) { return notValidElt(Query
, BigTyIdx
); },
1947 .clampScalar(BigTyIdx
, S32
, MaxScalar
);
1949 if (Op
== G_MERGE_VALUES
) {
1950 Builder
.widenScalarIf(
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1952 [=](const LegalityQuery
&Query
) {
1953 const LLT Ty
= Query
.Types
[LitTyIdx
];
1954 return Ty
.getSizeInBits() < 32;
1956 changeTo(LitTyIdx
, S32
));
1959 Builder
.widenScalarIf(
1960 [=](const LegalityQuery
&Query
) {
1961 const LLT Ty
= Query
.Types
[BigTyIdx
];
1962 return Ty
.getSizeInBits() % 16 != 0;
1964 [=](const LegalityQuery
&Query
) {
1965 // Pick the next power of 2, or a multiple of 64 over 128.
1966 // Whichever is smaller.
1967 const LLT
&Ty
= Query
.Types
[BigTyIdx
];
1968 unsigned NewSizeInBits
= 1 << Log2_32_Ceil(Ty
.getSizeInBits() + 1);
1969 if (NewSizeInBits
>= 256) {
1970 unsigned RoundedTo
= alignTo
<64>(Ty
.getSizeInBits() + 1);
1971 if (RoundedTo
< NewSizeInBits
)
1972 NewSizeInBits
= RoundedTo
;
1974 return std::pair(BigTyIdx
, LLT::scalar(NewSizeInBits
));
1976 // Any vectors left are the wrong size. Scalarize them.
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1983 auto &SextInReg
= getActionDefinitionsBuilder(G_SEXT_INREG
)
1984 .legalFor({{S32
}, {S64
}});
1986 if (ST
.hasVOP3PInsts()) {
1987 SextInReg
.lowerFor({{V2S16
}})
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989 // get more vector shift opportunities, since we'll get those when
1991 .clampMaxNumElementsStrict(0, S16
, 2);
1992 } else if (ST
.has16BitInsts()) {
1993 SextInReg
.lowerFor({{S32
}, {S64
}, {S16
}});
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
1996 // shifts. This avoid a lot of intermediate truncate and extend operations.
1997 SextInReg
.lowerFor({{S32
}, {S64
}});
2002 .clampScalar(0, S32
, S64
)
2005 getActionDefinitionsBuilder({G_ROTR
, G_ROTL
})
2009 // TODO: Only Try to form v2s16 with legal packed instructions.
2010 getActionDefinitionsBuilder(G_FSHR
)
2011 .legalFor({{S32
, S32
}})
2012 .lowerFor({{V2S16
, V2S16
}})
2013 .clampMaxNumElementsStrict(0, S16
, 2)
2017 if (ST
.hasVOP3PInsts()) {
2018 getActionDefinitionsBuilder(G_FSHL
)
2019 .lowerFor({{V2S16
, V2S16
}})
2020 .clampMaxNumElementsStrict(0, S16
, 2)
2024 getActionDefinitionsBuilder(G_FSHL
)
2029 getActionDefinitionsBuilder(G_READCYCLECOUNTER
)
2032 getActionDefinitionsBuilder(G_READSTEADYCOUNTER
).legalFor({S64
});
2034 getActionDefinitionsBuilder(G_FENCE
)
2037 getActionDefinitionsBuilder({G_SMULO
, G_UMULO
})
2042 getActionDefinitionsBuilder({G_SBFX
, G_UBFX
})
2043 .legalFor({{S32
, S32
}, {S64
, S32
}})
2044 .clampScalar(1, S32
, S32
)
2045 .clampScalar(0, S32
, S64
)
2046 .widenScalarToNextPow2(0)
2049 getActionDefinitionsBuilder(
2050 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS
, G_ATOMICRMW_NAND
, G_ATOMICRMW_FSUB
,
2054 G_READ_REGISTER
, G_WRITE_REGISTER
,
2059 if (ST
.hasIEEEMinMax()) {
2060 getActionDefinitionsBuilder({G_FMINIMUM
, G_FMAXIMUM
})
2061 .legalFor(FPTypesPK16
)
2062 .clampMaxNumElements(0, S16
, 2)
2066 getActionDefinitionsBuilder({G_FMINIMUM
, G_FMAXIMUM
}).lower();
2069 getActionDefinitionsBuilder({G_MEMCPY
, G_MEMCPY_INLINE
, G_MEMMOVE
, G_MEMSET
})
2072 getActionDefinitionsBuilder({G_TRAP
, G_DEBUGTRAP
}).custom();
2074 getActionDefinitionsBuilder({G_VASTART
, G_VAARG
, G_BRJT
, G_JUMP_TABLE
,
2075 G_INDEXED_LOAD
, G_INDEXED_SEXTLOAD
,
2076 G_INDEXED_ZEXTLOAD
, G_INDEXED_STORE
})
  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}
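
// Everything marked .custom()/.customIf() in the rule tables above is routed
// through legalizeCustom() below, which dispatches to a per-opcode helper;
// a helper returning false reports the instruction as unable to legalize.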
bool AMDGPULegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    return legalizeFroundeven(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  case TargetOpcode::G_GET_FPENV:
    return legalizeGetFPEnv(MI, MRI, B);
  case TargetOpcode::G_SET_FPENV:
    return legalizeSetFPEnv(MI, MRI, B);
  case TargetOpcode::G_TRAP:
    return legalizeTrap(MI, MRI, B);
  case TargetOpcode::G_DEBUGTRAP:
    return legalizeDebugTrap(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
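
// The "aperture" is the high 32 bits of the 64-bit segment base address for
// the LDS or scratch apertures; the low 32 bits come from the segment-relative
// pointer itself when a flat address is assembled in legalizeAddrSpaceCast().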
Register AMDGPULegalizerInfo::getSegmentAperture(
    unsigned AS,
    MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    // Register TableGen definitions would need an overhaul to get rid of the
    // artificial "HI" aperture registers and prevent this kind of issue from
    // happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
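
// isKnownNonNull() lets legalizeAddrSpaceCast() skip the compare-and-select
// against the null pointer value when the source is provably a valid address
// (e.g. a frame index or global), producing a plain extract/merge instead.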
/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  MachineInstr *Def = MRI.getVRegDef(Val);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
    return true;
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  }
  default:
    return false;
  }
}
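
// Roughly: flat -> local/private keeps only the low half of the pointer, and
// local/private -> flat merges the 32-bit offset with the segment aperture;
// both guard the result with a null-pointer select unless the source is known
// non-null (or the .nonnull intrinsic form was used).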
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  // MI can either be a G_ADDRSPACE_CAST or a
  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // block.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
  return true;
}
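
// The 0x1.0p+52 add/subtract trick: once |x| + 2^52 is formed, an f64 mantissa
// has no bits left for the fraction, so the add rounds x to an integer in the
// current (round-to-nearest-even) mode; subtracting 2^52 again recovers the
// rounded value. Inputs with |x| > 0x1.fffffffffffffp+51 are already integers
// and are passed through by the final select.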
bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}
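
// Worked example: ceil(2.3) -> trunc = 2.0, 2.3 > 0 and 2.3 != 2.0, so the
// select below yields 1.0 and the result is 3.0; ceil(-2.3) -> trunc = -2.0,
// the "src > 0" test fails, so 0.0 is added and -2.0 is returned.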
bool AMDGPULegalizerInfo::legalizeFceil(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}
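
// frem(x, y) is expanded as x - trunc(x / y) * y, written as a single
// fma(-trunc(x / y), y, x). Example: frem(5.5, 2.0) -> x/y = 2.75,
// trunc = 2.0, 5.5 - 2.0 * 2.0 = 1.5.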
bool AMDGPULegalizerInfo::legalizeFrem(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}
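
// For an IEEE-754 double the biased exponent occupies bits [62:52], i.e. bits
// [30:20] of the high 32-bit word, so ubfe extracts ExpBits = 11 bits starting
// at offset FractBits - 32 = 20, and the bias of 1023 is subtracted to obtain
// the unbiased exponent.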
static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
2510 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
2512 MachineIRBuilder
&B
) const {
2513 const LLT S1
= LLT::scalar(1);
2514 const LLT S32
= LLT::scalar(32);
2515 const LLT S64
= LLT::scalar(64);
2517 Register Src
= MI
.getOperand(1).getReg();
2518 assert(MRI
.getType(Src
) == S64
);
2520 // TODO: Should this use extract since the low half is unused?
2521 auto Unmerge
= B
.buildUnmerge({S32
, S32
}, Src
);
2522 Register Hi
= Unmerge
.getReg(1);
2524 // Extract the upper half, since this is where we will find the sign and
2526 auto Exp
= extractF64Exponent(Hi
, B
);
2528 const unsigned FractBits
= 52;
2530 // Extract the sign bit.
2531 const auto SignBitMask
= B
.buildConstant(S32
, UINT32_C(1) << 31);
2532 auto SignBit
= B
.buildAnd(S32
, Hi
, SignBitMask
);
2534 const auto FractMask
= B
.buildConstant(S64
, (UINT64_C(1) << FractBits
) - 1);
2536 const auto Zero32
= B
.buildConstant(S32
, 0);
2538 // Extend back to 64-bits.
2539 auto SignBit64
= B
.buildMergeLikeInstr(S64
, {Zero32
, SignBit
});
2541 auto Shr
= B
.buildAShr(S64
, FractMask
, Exp
);
2542 auto Not
= B
.buildNot(S64
, Shr
);
2543 auto Tmp0
= B
.buildAnd(S64
, Src
, Not
);
2544 auto FiftyOne
= B
.buildConstant(S32
, FractBits
- 1);
2546 auto ExpLt0
= B
.buildICmp(CmpInst::ICMP_SLT
, S1
, Exp
, Zero32
);
2547 auto ExpGt51
= B
.buildICmp(CmpInst::ICMP_SGT
, S1
, Exp
, FiftyOne
);
2549 auto Tmp1
= B
.buildSelect(S64
, ExpLt0
, SignBit64
, Tmp0
);
2550 B
.buildSelect(MI
.getOperand(0).getReg(), ExpGt51
, Src
, Tmp1
);
2551 MI
.eraseFromParent();
2555 bool AMDGPULegalizerInfo::legalizeITOFP(
2556 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
2557 MachineIRBuilder
&B
, bool Signed
) const {
2559 Register Dst
= MI
.getOperand(0).getReg();
2560 Register Src
= MI
.getOperand(1).getReg();
2562 const LLT S64
= LLT::scalar(64);
2563 const LLT S32
= LLT::scalar(32);
2565 assert(MRI
.getType(Src
) == S64
);
2567 auto Unmerge
= B
.buildUnmerge({S32
, S32
}, Src
);
2568 auto ThirtyTwo
= B
.buildConstant(S32
, 32);
2570 if (MRI
.getType(Dst
) == S64
) {
2571 auto CvtHi
= Signed
? B
.buildSITOFP(S64
, Unmerge
.getReg(1))
2572 : B
.buildUITOFP(S64
, Unmerge
.getReg(1));
2574 auto CvtLo
= B
.buildUITOFP(S64
, Unmerge
.getReg(0));
2575 auto LdExp
= B
.buildFLdexp(S64
, CvtHi
, ThirtyTwo
);
2577 // TODO: Should this propagate fast-math-flags?
2578 B
.buildFAdd(Dst
, LdExp
, CvtLo
);
2579 MI
.eraseFromParent();
2583 assert(MRI
.getType(Dst
) == S32
);
2585 auto One
= B
.buildConstant(S32
, 1);
2587 MachineInstrBuilder ShAmt
;
2589 auto ThirtyOne
= B
.buildConstant(S32
, 31);
2590 auto X
= B
.buildXor(S32
, Unmerge
.getReg(0), Unmerge
.getReg(1));
2591 auto OppositeSign
= B
.buildAShr(S32
, X
, ThirtyOne
);
2592 auto MaxShAmt
= B
.buildAdd(S32
, ThirtyTwo
, OppositeSign
);
2593 auto LS
= B
.buildIntrinsic(Intrinsic::amdgcn_sffbh
, {S32
})
2594 .addUse(Unmerge
.getReg(1));
2595 auto LS2
= B
.buildSub(S32
, LS
, One
);
2596 ShAmt
= B
.buildUMin(S32
, LS2
, MaxShAmt
);
2598 ShAmt
= B
.buildCTLZ(S32
, Unmerge
.getReg(1));
2599 auto Norm
= B
.buildShl(S64
, Src
, ShAmt
);
2600 auto Unmerge2
= B
.buildUnmerge({S32
, S32
}, Norm
);
2601 auto Adjust
= B
.buildUMin(S32
, One
, Unmerge2
.getReg(0));
2602 auto Norm2
= B
.buildOr(S32
, Unmerge2
.getReg(1), Adjust
);
2603 auto FVal
= Signed
? B
.buildSITOFP(S32
, Norm2
) : B
.buildUITOFP(S32
, Norm2
);
2604 auto Scale
= B
.buildSub(S32
, ThirtyTwo
, ShAmt
);
2605 B
.buildFLdexp(Dst
, FVal
, Scale
);
2606 MI
.eraseFromParent();
2610 // TODO: Copied from DAG implementation. Verify logic and document how this
2612 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr
&MI
,
2613 MachineRegisterInfo
&MRI
,
2614 MachineIRBuilder
&B
,
2615 bool Signed
) const {
2617 Register Dst
= MI
.getOperand(0).getReg();
2618 Register Src
= MI
.getOperand(1).getReg();
2620 const LLT S64
= LLT::scalar(64);
2621 const LLT S32
= LLT::scalar(32);
2623 const LLT SrcLT
= MRI
.getType(Src
);
2624 assert((SrcLT
== S32
|| SrcLT
== S64
) && MRI
.getType(Dst
) == S64
);
2626 unsigned Flags
= MI
.getFlags();
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2629 // integers is illustrated as follows:
2631 // tf := trunc(val);
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2634 // hi := fptoi(hif);
2635 // lo := fptoi(lof);
2637 auto Trunc
= B
.buildIntrinsicTrunc(SrcLT
, Src
, Flags
);
2638 MachineInstrBuilder Sign
;
2639 if (Signed
&& SrcLT
== S32
) {
2640 // However, a 32-bit floating point number has only 23 bits mantissa and
2641 // it's not enough to hold all the significant bits of `lof` if val is
2642 // negative. To avoid the loss of precision, We need to take the absolute
2643 // value after truncating and flip the result back based on the original
2645 Sign
= B
.buildAShr(S32
, Src
, B
.buildConstant(S32
, 31));
2646 Trunc
= B
.buildFAbs(S32
, Trunc
, Flags
);
2648 MachineInstrBuilder K0
, K1
;
2650 K0
= B
.buildFConstant(
2651 S64
, llvm::bit_cast
<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652 K1
= B
.buildFConstant(
2653 S64
, llvm::bit_cast
<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2655 K0
= B
.buildFConstant(
2656 S32
, llvm::bit_cast
<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657 K1
= B
.buildFConstant(
2658 S32
, llvm::bit_cast
<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2661 auto Mul
= B
.buildFMul(SrcLT
, Trunc
, K0
, Flags
);
2662 auto FloorMul
= B
.buildFFloor(SrcLT
, Mul
, Flags
);
2663 auto Fma
= B
.buildFMA(SrcLT
, FloorMul
, K1
, Trunc
, Flags
);
2665 auto Hi
= (Signed
&& SrcLT
== S64
) ? B
.buildFPTOSI(S32
, FloorMul
)
2666 : B
.buildFPTOUI(S32
, FloorMul
);
2667 auto Lo
= B
.buildFPTOUI(S32
, Fma
);
2669 if (Signed
&& SrcLT
== S32
) {
2670 // Flip the result based on the signedness, which is either all 0s or 1s.
2671 Sign
= B
.buildMergeLikeInstr(S64
, {Sign
, Sign
});
2672 // r := xor({lo, hi}, sign) - sign;
2673 B
.buildSub(Dst
, B
.buildXor(S64
, B
.buildMergeLikeInstr(S64
, {Lo
, Hi
}), Sign
),
2676 B
.buildMergeLikeInstr(Dst
, {Lo
, Hi
});
2677 MI
.eraseFromParent();
2682 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper
&Helper
,
2683 MachineInstr
&MI
) const {
2684 MachineFunction
&MF
= Helper
.MIRBuilder
.getMF();
2685 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
2687 const bool IsIEEEOp
= MI
.getOpcode() == AMDGPU::G_FMINNUM_IEEE
||
2688 MI
.getOpcode() == AMDGPU::G_FMAXNUM_IEEE
;
2690 // With ieee_mode disabled, the instructions have the correct behavior
2691 // already for G_FMINNUM/G_FMAXNUM
2692 if (!MFI
->getMode().IEEE
)
2698 return Helper
.lowerFMinNumMaxNum(MI
) == LegalizerHelper::Legalized
;
2701 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
2703 MachineIRBuilder
&B
) const {
2704 // TODO: Should move some of this into LegalizerHelper.
2706 // TODO: Promote dynamic indexing of s16 to s32
2708 Register Dst
= MI
.getOperand(0).getReg();
2709 Register Vec
= MI
.getOperand(1).getReg();
2711 LLT VecTy
= MRI
.getType(Vec
);
2712 LLT EltTy
= VecTy
.getElementType();
2713 assert(EltTy
== MRI
.getType(Dst
));
2715 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716 // but we can't go directly to that logic becasue you can't bitcast a vector
2717 // of pointers to a vector of integers. Therefore, introduce an intermediate
2718 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719 // drive the legalization forward.
2720 if (EltTy
.isPointer() && EltTy
.getSizeInBits() > 64) {
2721 LLT IntTy
= LLT::scalar(EltTy
.getSizeInBits());
2722 LLT IntVecTy
= VecTy
.changeElementType(IntTy
);
2724 auto IntVec
= B
.buildPtrToInt(IntVecTy
, Vec
);
2725 auto IntElt
= B
.buildExtractVectorElement(IntTy
, IntVec
, MI
.getOperand(2));
2726 B
.buildIntToPtr(Dst
, IntElt
);
2728 MI
.eraseFromParent();
2732 // FIXME: Artifact combiner probably should have replaced the truncated
2733 // constant before this, so we shouldn't need
2734 // getIConstantVRegValWithLookThrough.
2735 std::optional
<ValueAndVReg
> MaybeIdxVal
=
2736 getIConstantVRegValWithLookThrough(MI
.getOperand(2).getReg(), MRI
);
2737 if (!MaybeIdxVal
) // Dynamic case will be selected to register indexing.
2739 const uint64_t IdxVal
= MaybeIdxVal
->Value
.getZExtValue();
2741 if (IdxVal
< VecTy
.getNumElements()) {
2742 auto Unmerge
= B
.buildUnmerge(EltTy
, Vec
);
2743 B
.buildCopy(Dst
, Unmerge
.getReg(IdxVal
));
2748 MI
.eraseFromParent();
2752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
2754 MachineIRBuilder
&B
) const {
2755 // TODO: Should move some of this into LegalizerHelper.
2757 // TODO: Promote dynamic indexing of s16 to s32
2759 Register Dst
= MI
.getOperand(0).getReg();
2760 Register Vec
= MI
.getOperand(1).getReg();
2761 Register Ins
= MI
.getOperand(2).getReg();
2763 LLT VecTy
= MRI
.getType(Vec
);
2764 LLT EltTy
= VecTy
.getElementType();
2765 assert(EltTy
== MRI
.getType(Ins
));
2767 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768 // but we can't go directly to that logic becasue you can't bitcast a vector
2769 // of pointers to a vector of integers. Therefore, make the pointer vector
2770 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771 // new value, and then inttoptr the result vector back. This will then allow
2772 // the rest of legalization to take over.
2773 if (EltTy
.isPointer() && EltTy
.getSizeInBits() > 64) {
2774 LLT IntTy
= LLT::scalar(EltTy
.getSizeInBits());
2775 LLT IntVecTy
= VecTy
.changeElementType(IntTy
);
2777 auto IntVecSource
= B
.buildPtrToInt(IntVecTy
, Vec
);
2778 auto IntIns
= B
.buildPtrToInt(IntTy
, Ins
);
2779 auto IntVecDest
= B
.buildInsertVectorElement(IntVecTy
, IntVecSource
, IntIns
,
2781 B
.buildIntToPtr(Dst
, IntVecDest
);
2782 MI
.eraseFromParent();
2786 // FIXME: Artifact combiner probably should have replaced the truncated
2787 // constant before this, so we shouldn't need
2788 // getIConstantVRegValWithLookThrough.
2789 std::optional
<ValueAndVReg
> MaybeIdxVal
=
2790 getIConstantVRegValWithLookThrough(MI
.getOperand(3).getReg(), MRI
);
2791 if (!MaybeIdxVal
) // Dynamic case will be selected to register indexing.
2794 const uint64_t IdxVal
= MaybeIdxVal
->Value
.getZExtValue();
2796 unsigned NumElts
= VecTy
.getNumElements();
2797 if (IdxVal
< NumElts
) {
2798 SmallVector
<Register
, 8> SrcRegs
;
2799 for (unsigned i
= 0; i
< NumElts
; ++i
)
2800 SrcRegs
.push_back(MRI
.createGenericVirtualRegister(EltTy
));
2801 B
.buildUnmerge(SrcRegs
, Vec
);
2803 SrcRegs
[IdxVal
] = MI
.getOperand(2).getReg();
2804 B
.buildMergeLikeInstr(Dst
, SrcRegs
);
2809 MI
.eraseFromParent();
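
// The hardware sin/cos intrinsics take the input already divided by 2*pi;
// legalizeSinCos() below performs that scaling (and, on subtargets that need
// it, the explicit fract-based range reduction) before emitting them.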
bool AMDGPULegalizerInfo::legalizeSinCos(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags)
                  .getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
2842 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg
, LLT PtrTy
,
2843 MachineIRBuilder
&B
,
2844 const GlobalValue
*GV
,
2846 unsigned GAFlags
) const {
2847 assert(isInt
<32>(Offset
+ 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849 // to the following code sequence:
2851 // For constant address space:
2852 // s_getpc_b64 s[0:1]
2853 // s_add_u32 s0, s0, $symbol
2854 // s_addc_u32 s1, s1, 0
2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857 // a fixup or relocation is emitted to replace $symbol with a literal
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2859 // operand to the global variable.
2861 // For global address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // fixups or relocations are emitted to replace $symbol@*@lo and
2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2870 // operand to the global variable.
2872 LLT ConstPtrTy
= LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS
, 64);
2874 Register PCReg
= PtrTy
.getSizeInBits() != 32 ? DstReg
:
2875 B
.getMRI()->createGenericVirtualRegister(ConstPtrTy
);
2877 MachineInstrBuilder MIB
= B
.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET
)
2880 MIB
.addGlobalAddress(GV
, Offset
, GAFlags
);
2881 if (GAFlags
== SIInstrInfo::MO_NONE
)
2884 MIB
.addGlobalAddress(GV
, Offset
, GAFlags
+ 1);
2886 if (!B
.getMRI()->getRegClassOrNull(PCReg
))
2887 B
.getMRI()->setRegClass(PCReg
, &AMDGPU::SReg_64RegClass
);
2889 if (PtrTy
.getSizeInBits() == 32)
2890 B
.buildExtract(DstReg
, PCReg
, 0);
2894 // Emit a ABS32_LO / ABS32_HI relocation stub.
2895 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896 Register DstReg
, LLT PtrTy
, MachineIRBuilder
&B
, const GlobalValue
*GV
,
2897 MachineRegisterInfo
&MRI
) const {
2898 bool RequiresHighHalf
= PtrTy
.getSizeInBits() != 32;
2900 LLT S32
= LLT::scalar(32);
2902 // Use the destination directly, if and only if we store the lower address
2903 // part only and we don't have a register class being set.
2904 Register AddrLo
= !RequiresHighHalf
&& !MRI
.getRegClassOrNull(DstReg
)
2906 : MRI
.createGenericVirtualRegister(S32
);
2908 if (!MRI
.getRegClassOrNull(AddrLo
))
2909 MRI
.setRegClass(AddrLo
, &AMDGPU::SReg_32RegClass
);
2911 // Write the lower half.
2912 B
.buildInstr(AMDGPU::S_MOV_B32
)
2914 .addGlobalAddress(GV
, 0, SIInstrInfo::MO_ABS32_LO
);
2916 // If required, write the upper half as well.
2917 if (RequiresHighHalf
) {
2918 assert(PtrTy
.getSizeInBits() == 64 &&
2919 "Must provide a 64-bit pointer type!");
2921 Register AddrHi
= MRI
.createGenericVirtualRegister(S32
);
2922 MRI
.setRegClass(AddrHi
, &AMDGPU::SReg_32RegClass
);
2924 B
.buildInstr(AMDGPU::S_MOV_B32
)
2926 .addGlobalAddress(GV
, 0, SIInstrInfo::MO_ABS32_HI
);
2928 // Use the destination directly, if and only if we don't have a register
2930 Register AddrDst
= !MRI
.getRegClassOrNull(DstReg
)
2932 : MRI
.createGenericVirtualRegister(LLT::scalar(64));
2934 if (!MRI
.getRegClassOrNull(AddrDst
))
2935 MRI
.setRegClass(AddrDst
, &AMDGPU::SReg_64RegClass
);
2937 B
.buildMergeValues(AddrDst
, {AddrLo
, AddrHi
});
2939 // If we created a new register for the destination, cast the result into
2940 // the final output.
2941 if (AddrDst
!= DstReg
)
2942 B
.buildCast(DstReg
, AddrDst
);
2943 } else if (AddrLo
!= DstReg
) {
2944 // If we created a new register for the destination, cast the result into
2945 // the final output.
2946 B
.buildCast(DstReg
, AddrLo
);
2950 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
2952 MachineIRBuilder
&B
) const {
2953 Register DstReg
= MI
.getOperand(0).getReg();
2954 LLT Ty
= MRI
.getType(DstReg
);
2955 unsigned AS
= Ty
.getAddressSpace();
2957 const GlobalValue
*GV
= MI
.getOperand(1).getGlobal();
2958 MachineFunction
&MF
= B
.getMF();
2959 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
2961 if (AS
== AMDGPUAS::LOCAL_ADDRESS
|| AS
== AMDGPUAS::REGION_ADDRESS
) {
2962 if (!MFI
->isModuleEntryFunction() &&
2963 GV
->getName() != "llvm.amdgcn.module.lds") {
2964 const Function
&Fn
= MF
.getFunction();
2965 DiagnosticInfoUnsupported
BadLDSDecl(
2966 Fn
, "local memory global used by non-kernel function", MI
.getDebugLoc(),
2968 Fn
.getContext().diagnose(BadLDSDecl
);
2970 // We currently don't have a way to correctly allocate LDS objects that
2971 // aren't directly associated with a kernel. We do force inlining of
2972 // functions that use local objects. However, if these dead functions are
2973 // not eliminated, we don't want a compile time error. Just emit a warning
2974 // and a trap, since there should be no callable path here.
2976 B
.buildUndef(DstReg
);
2977 MI
.eraseFromParent();
2981 // TODO: We could emit code to handle the initialization somewhere.
2982 // We ignore the initializer for now and legalize it to allow selection.
2983 // The initializer will anyway get errored out during assembly emission.
2984 const SITargetLowering
*TLI
= ST
.getTargetLowering();
2985 if (!TLI
->shouldUseLDSConstAddress(GV
)) {
2986 MI
.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO
);
2987 return true; // Leave in place;
2990 if (AS
== AMDGPUAS::LOCAL_ADDRESS
&& GV
->hasExternalLinkage()) {
2991 Type
*Ty
= GV
->getValueType();
2992 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2993 // zero-sized type in other languages to declare the dynamic shared
2994 // memory which size is not known at the compile time. They will be
2995 // allocated by the runtime and placed directly after the static
2996 // allocated ones. They all share the same offset.
2997 if (B
.getDataLayout().getTypeAllocSize(Ty
).isZero()) {
2998 // Adjust alignment for that dynamic shared memory array.
2999 MFI
->setDynLDSAlign(MF
.getFunction(), *cast
<GlobalVariable
>(GV
));
3000 LLT S32
= LLT::scalar(32);
3001 auto Sz
= B
.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize
, {S32
});
3002 B
.buildIntToPtr(DstReg
, Sz
);
3003 MI
.eraseFromParent();
3008 B
.buildConstant(DstReg
, MFI
->allocateLDSGlobal(B
.getDataLayout(),
3009 *cast
<GlobalVariable
>(GV
)));
3010 MI
.eraseFromParent();
3014 if (ST
.isAmdPalOS() || ST
.isMesa3DOS()) {
3015 buildAbsGlobalAddress(DstReg
, Ty
, B
, GV
, MRI
);
3016 MI
.eraseFromParent();
3020 const SITargetLowering
*TLI
= ST
.getTargetLowering();
3022 if (TLI
->shouldEmitFixup(GV
)) {
3023 buildPCRelGlobalAddress(DstReg
, Ty
, B
, GV
, 0);
3024 MI
.eraseFromParent();
3028 if (TLI
->shouldEmitPCReloc(GV
)) {
3029 buildPCRelGlobalAddress(DstReg
, Ty
, B
, GV
, 0, SIInstrInfo::MO_REL32
);
3030 MI
.eraseFromParent();
3034 LLT PtrTy
= LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS
, 64);
3035 Register GOTAddr
= MRI
.createGenericVirtualRegister(PtrTy
);
3037 LLT LoadTy
= Ty
.getSizeInBits() == 32 ? PtrTy
: Ty
;
3038 MachineMemOperand
*GOTMMO
= MF
.getMachineMemOperand(
3039 MachinePointerInfo::getGOT(MF
),
3040 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
3041 MachineMemOperand::MOInvariant
,
3044 buildPCRelGlobalAddress(GOTAddr
, PtrTy
, B
, GV
, 0, SIInstrInfo::MO_GOTPCREL32
);
3046 if (Ty
.getSizeInBits() == 32) {
3047 // Truncate if this is a 32-bit constant address.
3048 auto Load
= B
.buildLoad(PtrTy
, GOTAddr
, *GOTMMO
);
3049 B
.buildExtract(DstReg
, Load
, 0);
3051 B
.buildLoad(DstReg
, GOTAddr
, *GOTMMO
);
3053 MI
.eraseFromParent();
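
// Helper used by legalizeLoad() below: pads a non-power-of-2 type to the next
// power of 2 (e.g. <3 x s32> -> <4 x s32>, s96 -> s128) so a wider load can be
// emitted and the extra lanes/bits discarded afterwards.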
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
3064 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper
&Helper
,
3065 MachineInstr
&MI
) const {
3066 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
3067 MachineRegisterInfo
&MRI
= *B
.getMRI();
3068 GISelChangeObserver
&Observer
= Helper
.Observer
;
3070 Register PtrReg
= MI
.getOperand(1).getReg();
3071 LLT PtrTy
= MRI
.getType(PtrReg
);
3072 unsigned AddrSpace
= PtrTy
.getAddressSpace();
3074 if (AddrSpace
== AMDGPUAS::CONSTANT_ADDRESS_32BIT
) {
3075 LLT ConstPtr
= LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS
, 64);
3076 auto Cast
= B
.buildAddrSpaceCast(ConstPtr
, PtrReg
);
3077 Observer
.changingInstr(MI
);
3078 MI
.getOperand(1).setReg(Cast
.getReg(0));
3079 Observer
.changedInstr(MI
);
3083 if (MI
.getOpcode() != AMDGPU::G_LOAD
)
3086 Register ValReg
= MI
.getOperand(0).getReg();
3087 LLT ValTy
= MRI
.getType(ValReg
);
3089 if (hasBufferRsrcWorkaround(ValTy
)) {
3090 Observer
.changingInstr(MI
);
3091 castBufferRsrcFromV4I32(MI
, B
, MRI
, 0);
3092 Observer
.changedInstr(MI
);
3096 MachineMemOperand
*MMO
= *MI
.memoperands_begin();
3097 const unsigned ValSize
= ValTy
.getSizeInBits();
3098 const LLT MemTy
= MMO
->getMemoryType();
3099 const Align MemAlign
= MMO
->getAlign();
3100 const unsigned MemSize
= MemTy
.getSizeInBits();
3101 const uint64_t AlignInBits
= 8 * MemAlign
.value();
3103 // Widen non-power-of-2 loads to the alignment if needed
3104 if (shouldWidenLoad(ST
, MemTy
, AlignInBits
, AddrSpace
, MI
.getOpcode())) {
3105 const unsigned WideMemSize
= PowerOf2Ceil(MemSize
);
3107 // This was already the correct extending load result type, so just adjust
3109 if (WideMemSize
== ValSize
) {
3110 MachineFunction
&MF
= B
.getMF();
3112 MachineMemOperand
*WideMMO
=
3113 MF
.getMachineMemOperand(MMO
, 0, WideMemSize
/ 8);
3114 Observer
.changingInstr(MI
);
3115 MI
.setMemRefs(MF
, {WideMMO
});
3116 Observer
.changedInstr(MI
);
3120 // Don't bother handling edge case that should probably never be produced.
3121 if (ValSize
> WideMemSize
)
3124 LLT WideTy
= widenToNextPowerOf2(ValTy
);
3127 if (!WideTy
.isVector()) {
3128 WideLoad
= B
.buildLoadFromOffset(WideTy
, PtrReg
, *MMO
, 0).getReg(0);
3129 B
.buildTrunc(ValReg
, WideLoad
).getReg(0);
3131 // Extract the subvector.
3133 if (isRegisterType(ValTy
)) {
3134 // If this a case where G_EXTRACT is legal, use it.
3135 // (e.g. <3 x s32> -> <4 x s32>)
3136 WideLoad
= B
.buildLoadFromOffset(WideTy
, PtrReg
, *MMO
, 0).getReg(0);
3137 B
.buildExtract(ValReg
, WideLoad
, 0);
3139 // For cases where the widened type isn't a nice register value, unmerge
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141 WideLoad
= B
.buildLoadFromOffset(WideTy
, PtrReg
, *MMO
, 0).getReg(0);
3142 B
.buildDeleteTrailingVectorElements(ValReg
, WideLoad
);
3146 MI
.eraseFromParent();
3153 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper
&Helper
,
3154 MachineInstr
&MI
) const {
3155 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
3156 MachineRegisterInfo
&MRI
= *B
.getMRI();
3157 GISelChangeObserver
&Observer
= Helper
.Observer
;
3159 Register DataReg
= MI
.getOperand(0).getReg();
3160 LLT DataTy
= MRI
.getType(DataReg
);
3162 if (hasBufferRsrcWorkaround(DataTy
)) {
3163 Observer
.changingInstr(MI
);
3164 castBufferRsrcArgToV4I32(MI
, B
, 0);
3165 Observer
.changedInstr(MI
);
3171 bool AMDGPULegalizerInfo::legalizeFMad(
3172 MachineInstr
&MI
, MachineRegisterInfo
&MRI
,
3173 MachineIRBuilder
&B
) const {
3174 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
3175 assert(Ty
.isScalar());
3177 MachineFunction
&MF
= B
.getMF();
3178 const SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
3180 // TODO: Always legal with future ftz flag.
3181 // FIXME: Do we need just output?
3182 if (Ty
== LLT::float32() &&
3183 MFI
->getMode().FP32Denormals
== DenormalMode::getPreserveSign())
3185 if (Ty
== LLT::float16() &&
3186 MFI
->getMode().FP64FP16Denormals
== DenormalMode::getPreserveSign())
3189 MachineIRBuilder
HelperBuilder(MI
);
3190 GISelObserverWrapper DummyObserver
;
3191 LegalizerHelper
Helper(MF
, DummyObserver
, HelperBuilder
);
3192 return Helper
.lowerFMad(MI
) == LegalizerHelper::Legalized
;
3195 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
) const {
3197 Register DstReg
= MI
.getOperand(0).getReg();
3198 Register PtrReg
= MI
.getOperand(1).getReg();
3199 Register CmpVal
= MI
.getOperand(2).getReg();
3200 Register NewVal
= MI
.getOperand(3).getReg();
3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI
.getType(PtrReg
).getAddressSpace()) &&
3203 "this should not have been custom lowered");
3205 LLT ValTy
= MRI
.getType(CmpVal
);
3206 LLT VecTy
= LLT::fixed_vector(2, ValTy
);
3208 Register PackedVal
= B
.buildBuildVector(VecTy
, { NewVal
, CmpVal
}).getReg(0);
3210 B
.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG
)
3214 .setMemRefs(MI
.memoperands());
3216 MI
.eraseFromParent();
3220 /// Return true if it's known that \p Src can never be an f32 denormal value.
3221 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo
&MRI
,
3223 const MachineInstr
*DefMI
= MRI
.getVRegDef(Src
);
3224 switch (DefMI
->getOpcode()) {
3225 case TargetOpcode::G_INTRINSIC
: {
3226 switch (cast
<GIntrinsic
>(DefMI
)->getIntrinsicID()) {
3227 case Intrinsic::amdgcn_frexp_mant
:
3235 case TargetOpcode::G_FFREXP
: {
3236 if (DefMI
->getOperand(0).getReg() == Src
)
3240 case TargetOpcode::G_FPEXT
: {
3241 return MRI
.getType(DefMI
->getOperand(1).getReg()) == LLT::scalar(16);
3250 static bool allowApproxFunc(const MachineFunction
&MF
, unsigned Flags
) {
3251 if (Flags
& MachineInstr::FmAfn
)
3253 const auto &Options
= MF
.getTarget().Options
;
3254 return Options
.UnsafeFPMath
|| Options
.ApproxFuncFPMath
;
3257 static bool needsDenormHandlingF32(const MachineFunction
&MF
, Register Src
,
3259 return !valueIsKnownNeverF32Denorm(MF
.getRegInfo(), Src
) &&
3260 MF
.getDenormalMode(APFloat::IEEEsingle()).Input
!=
3261 DenormalMode::PreserveSign
;
3264 std::pair
<Register
, Register
>
3265 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder
&B
, Register Src
,
3266 unsigned Flags
) const {
3267 if (!needsDenormHandlingF32(B
.getMF(), Src
, Flags
))
3270 const LLT F32
= LLT::scalar(32);
3271 auto SmallestNormal
= B
.buildFConstant(
3272 F32
, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3273 auto IsLtSmallestNormal
=
3274 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), Src
, SmallestNormal
);
3276 auto Scale32
= B
.buildFConstant(F32
, 0x1.0p
+32);
3277 auto One
= B
.buildFConstant(F32
, 1.0);
3279 B
.buildSelect(F32
, IsLtSmallestNormal
, Scale32
, One
, Flags
);
3280 auto ScaledInput
= B
.buildFMul(F32
, Src
, ScaleFactor
, Flags
);
3282 return {ScaledInput
.getReg(0), IsLtSmallestNormal
.getReg(0)};
3285 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr
&MI
,
3286 MachineIRBuilder
&B
) const {
3287 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288 // If we have to handle denormals, scale up the input and adjust the result.
3290 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3293 Register Dst
= MI
.getOperand(0).getReg();
3294 Register Src
= MI
.getOperand(1).getReg();
3295 LLT Ty
= B
.getMRI()->getType(Dst
);
3296 unsigned Flags
= MI
.getFlags();
3298 if (Ty
== LLT::scalar(16)) {
3299 const LLT F32
= LLT::scalar(32);
3300 // Nothing in half is a denormal when promoted to f32.
3301 auto Ext
= B
.buildFPExt(F32
, Src
, Flags
);
3302 auto Log2
= B
.buildIntrinsic(Intrinsic::amdgcn_log
, {F32
})
3303 .addUse(Ext
.getReg(0))
3305 B
.buildFPTrunc(Dst
, Log2
, Flags
);
3306 MI
.eraseFromParent();
3310 assert(Ty
== LLT::scalar(32));
3312 auto [ScaledInput
, IsLtSmallestNormal
] = getScaledLogInput(B
, Src
, Flags
);
3314 B
.buildIntrinsic(Intrinsic::amdgcn_log
, {MI
.getOperand(0)})
3317 MI
.eraseFromParent();
3321 auto Log2
= B
.buildIntrinsic(Intrinsic::amdgcn_log
, {Ty
})
3322 .addUse(ScaledInput
)
3325 auto ThirtyTwo
= B
.buildFConstant(Ty
, 32.0);
3326 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3328 B
.buildSelect(Ty
, IsLtSmallestNormal
, ThirtyTwo
, Zero
, Flags
);
3329 B
.buildFSub(Dst
, Log2
, ResultOffset
, Flags
);
3331 MI
.eraseFromParent();
3335 static Register
getMad(MachineIRBuilder
&B
, LLT Ty
, Register X
, Register Y
,
3336 Register Z
, unsigned Flags
) {
3337 auto FMul
= B
.buildFMul(Ty
, X
, Y
, Flags
);
3338 return B
.buildFAdd(Ty
, FMul
, Z
, Flags
).getReg(0);
3341 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr
&MI
,
3342 MachineIRBuilder
&B
) const {
3343 const bool IsLog10
= MI
.getOpcode() == TargetOpcode::G_FLOG10
;
3344 assert(IsLog10
|| MI
.getOpcode() == TargetOpcode::G_FLOG
);
3346 MachineRegisterInfo
&MRI
= *B
.getMRI();
3347 Register Dst
= MI
.getOperand(0).getReg();
3348 Register X
= MI
.getOperand(1).getReg();
3349 unsigned Flags
= MI
.getFlags();
3350 const LLT Ty
= MRI
.getType(X
);
3351 MachineFunction
&MF
= B
.getMF();
3353 const LLT F32
= LLT::scalar(32);
3354 const LLT F16
= LLT::scalar(16);
3356 const AMDGPUTargetMachine
&TM
=
3357 static_cast<const AMDGPUTargetMachine
&>(MF
.getTarget());
3359 if (Ty
== F16
|| MI
.getFlag(MachineInstr::FmAfn
) ||
3360 TM
.Options
.ApproxFuncFPMath
|| TM
.Options
.UnsafeFPMath
) {
3361 if (Ty
== F16
&& !ST
.has16BitInsts()) {
3362 Register LogVal
= MRI
.createGenericVirtualRegister(F32
);
3363 auto PromoteSrc
= B
.buildFPExt(F32
, X
);
3364 legalizeFlogUnsafe(B
, LogVal
, PromoteSrc
.getReg(0), IsLog10
, Flags
);
3365 B
.buildFPTrunc(Dst
, LogVal
);
3367 legalizeFlogUnsafe(B
, Dst
, X
, IsLog10
, Flags
);
3370 MI
.eraseFromParent();
3374 auto [ScaledInput
, IsScaled
] = getScaledLogInput(B
, X
, Flags
);
3379 B
.buildIntrinsic(Intrinsic::amdgcn_log
, {Ty
}).addUse(X
).setMIFlags(Flags
);
3382 if (ST
.hasFastFMAF32()) {
3383 // c+cc are ln(2)/ln(10) to more than 49 bits
3384 const float c_log10
= 0x1.344134p
-2f
;
3385 const float cc_log10
= 0x1.09f79ep
-26f
;
3387 // c + cc is ln(2) to more than 49 bits
3388 const float c_log
= 0x1.62e42ep
-1f
;
3389 const float cc_log
= 0x1.efa39ep
-25f
;
3391 auto C
= B
.buildFConstant(Ty
, IsLog10
? c_log10
: c_log
);
3392 auto CC
= B
.buildFConstant(Ty
, IsLog10
? cc_log10
: cc_log
);
3394 R
= B
.buildFMul(Ty
, Y
, C
, Flags
).getReg(0);
3395 auto NegR
= B
.buildFNeg(Ty
, R
, Flags
);
3396 auto FMA0
= B
.buildFMA(Ty
, Y
, C
, NegR
, Flags
);
3397 auto FMA1
= B
.buildFMA(Ty
, Y
, CC
, FMA0
, Flags
);
3398 R
= B
.buildFAdd(Ty
, R
, FMA1
, Flags
).getReg(0);
3400 // ch+ct is ln(2)/ln(10) to more than 36 bits
3401 const float ch_log10
= 0x1.344000p
-2f
;
3402 const float ct_log10
= 0x1.3509f6p
-18f
;
3404 // ch + ct is ln(2) to more than 36 bits
3405 const float ch_log
= 0x1.62e000p
-1f
;
3406 const float ct_log
= 0x1.0bfbe8p
-15f
;
3408 auto CH
= B
.buildFConstant(Ty
, IsLog10
? ch_log10
: ch_log
);
3409 auto CT
= B
.buildFConstant(Ty
, IsLog10
? ct_log10
: ct_log
);
3411 auto MaskConst
= B
.buildConstant(Ty
, 0xfffff000);
3412 auto YH
= B
.buildAnd(Ty
, Y
, MaskConst
);
3413 auto YT
= B
.buildFSub(Ty
, Y
, YH
, Flags
);
3414 auto YTCT
= B
.buildFMul(Ty
, YT
, CT
, Flags
);
3417 getMad(B
, Ty
, YH
.getReg(0), CT
.getReg(0), YTCT
.getReg(0), Flags
);
3418 Register Mad1
= getMad(B
, Ty
, YT
.getReg(0), CH
.getReg(0), Mad0
, Flags
);
3419 R
= getMad(B
, Ty
, YH
.getReg(0), CH
.getReg(0), Mad1
, Flags
);
3422 const bool IsFiniteOnly
=
3423 (MI
.getFlag(MachineInstr::FmNoNans
) || TM
.Options
.NoNaNsFPMath
) &&
3424 (MI
.getFlag(MachineInstr::FmNoInfs
) || TM
.Options
.NoInfsFPMath
);
3426 if (!IsFiniteOnly
) {
3427 // Expand isfinite(x) => fabs(x) < inf
3428 auto Inf
= B
.buildFConstant(Ty
, APFloat::getInf(APFloat::IEEEsingle()));
3429 auto Fabs
= B
.buildFAbs(Ty
, Y
);
3431 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), Fabs
, Inf
, Flags
);
3432 R
= B
.buildSelect(Ty
, IsFinite
, R
, Y
, Flags
).getReg(0);
3436 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3438 B
.buildFConstant(Ty
, IsLog10
? 0x1.344136p
+3f
: 0x1.62e430p
+4f
);
3439 auto Shift
= B
.buildSelect(Ty
, IsScaled
, ShiftK
, Zero
, Flags
);
3440 B
.buildFSub(Dst
, R
, Shift
, Flags
);
3442 B
.buildCopy(Dst
, R
);
3445 MI
.eraseFromParent();
3449 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder
&B
, Register Dst
,
3450 Register Src
, bool IsLog10
,
3451 unsigned Flags
) const {
3452 const double Log2BaseInverted
=
3453 IsLog10
? numbers::ln2
/ numbers::ln10
: numbers::ln2
;
3455 LLT Ty
= B
.getMRI()->getType(Dst
);
3457 if (Ty
== LLT::scalar(32)) {
3458 auto [ScaledInput
, IsScaled
] = getScaledLogInput(B
, Src
, Flags
);
3460 auto LogSrc
= B
.buildIntrinsic(Intrinsic::amdgcn_log
, {Ty
})
3463 auto ScaledResultOffset
= B
.buildFConstant(Ty
, -32.0 * Log2BaseInverted
);
3464 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3466 B
.buildSelect(Ty
, IsScaled
, ScaledResultOffset
, Zero
, Flags
);
3467 auto Log2Inv
= B
.buildFConstant(Ty
, Log2BaseInverted
);
3469 if (ST
.hasFastFMAF32())
3470 B
.buildFMA(Dst
, LogSrc
, Log2Inv
, ResultOffset
, Flags
);
3472 auto Mul
= B
.buildFMul(Ty
, LogSrc
, Log2Inv
, Flags
);
3473 B
.buildFAdd(Dst
, Mul
, ResultOffset
, Flags
);
3480 auto Log2Operand
= Ty
== LLT::scalar(16)
3481 ? B
.buildFLog2(Ty
, Src
, Flags
)
3482 : B
.buildIntrinsic(Intrinsic::amdgcn_log
, {Ty
})
3485 auto Log2BaseInvertedOperand
= B
.buildFConstant(Ty
, Log2BaseInverted
);
3486 B
.buildFMul(Dst
, Log2Operand
, Log2BaseInvertedOperand
, Flags
);
3490 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr
&MI
,
3491 MachineIRBuilder
&B
) const {
3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493 // If we have to handle denormals, scale up the input and adjust the result.
3495 Register Dst
= MI
.getOperand(0).getReg();
3496 Register Src
= MI
.getOperand(1).getReg();
3497 unsigned Flags
= MI
.getFlags();
3498 LLT Ty
= B
.getMRI()->getType(Dst
);
3499 const LLT F16
= LLT::scalar(16);
3500 const LLT F32
= LLT::scalar(32);
3503 // Nothing in half is a denormal when promoted to f32.
3504 auto Ext
= B
.buildFPExt(F32
, Src
, Flags
);
3505 auto Log2
= B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, {F32
})
3506 .addUse(Ext
.getReg(0))
3508 B
.buildFPTrunc(Dst
, Log2
, Flags
);
3509 MI
.eraseFromParent();
3515 if (!needsDenormHandlingF32(B
.getMF(), Src
, Flags
)) {
3516 B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, ArrayRef
<Register
>{Dst
})
3519 MI
.eraseFromParent();
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3526 // -nextafter(128.0, -1)
3527 auto RangeCheckConst
= B
.buildFConstant(Ty
, -0x1.f80000p
+6f
);
3528 auto NeedsScaling
= B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), Src
,
3529 RangeCheckConst
, Flags
);
3531 auto SixtyFour
= B
.buildFConstant(Ty
, 0x1.0p
+6f
);
3532 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3533 auto AddOffset
= B
.buildSelect(F32
, NeedsScaling
, SixtyFour
, Zero
, Flags
);
3534 auto AddInput
= B
.buildFAdd(F32
, Src
, AddOffset
, Flags
);
3536 auto Exp2
= B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, {Ty
})
3537 .addUse(AddInput
.getReg(0))
3540 auto TwoExpNeg64
= B
.buildFConstant(Ty
, 0x1.0p
-64f
);
3541 auto One
= B
.buildFConstant(Ty
, 1.0);
3542 auto ResultScale
= B
.buildSelect(F32
, NeedsScaling
, TwoExpNeg64
, One
, Flags
);
3543 B
.buildFMul(Dst
, Exp2
, ResultScale
, Flags
);
3544 MI
.eraseFromParent();
3548 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder
&B
, Register Dst
,
3549 Register X
, unsigned Flags
) const {
3550 LLT Ty
= B
.getMRI()->getType(Dst
);
3551 LLT F32
= LLT::scalar(32);
3553 if (Ty
!= F32
|| !needsDenormHandlingF32(B
.getMF(), X
, Flags
)) {
3554 auto Log2E
= B
.buildFConstant(Ty
, numbers::log2e
);
3555 auto Mul
= B
.buildFMul(Ty
, X
, Log2E
, Flags
);
3558 B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, ArrayRef
<Register
>{Dst
})
3559 .addUse(Mul
.getReg(0))
3562 B
.buildFExp2(Dst
, Mul
.getReg(0), Flags
);
3568 auto Threshold
= B
.buildFConstant(Ty
, -0x1.5d58a0p
+6f
);
3570 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), X
, Threshold
, Flags
);
3571 auto ScaleOffset
= B
.buildFConstant(Ty
, 0x1.0p
+6f
);
3572 auto ScaledX
= B
.buildFAdd(Ty
, X
, ScaleOffset
, Flags
);
3573 auto AdjustedX
= B
.buildSelect(Ty
, NeedsScaling
, ScaledX
, X
, Flags
);
3575 auto Log2E
= B
.buildFConstant(Ty
, numbers::log2e
);
3576 auto ExpInput
= B
.buildFMul(Ty
, AdjustedX
, Log2E
, Flags
);
3578 auto Exp2
= B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, {Ty
})
3579 .addUse(ExpInput
.getReg(0))
3582 auto ResultScaleFactor
= B
.buildFConstant(Ty
, 0x1.969d48p
-93f
);
3583 auto AdjustedResult
= B
.buildFMul(Ty
, Exp2
, ResultScaleFactor
, Flags
);
3584 B
.buildSelect(Dst
, NeedsScaling
, AdjustedResult
, Exp2
, Flags
);
3588 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr
&MI
,
3589 MachineIRBuilder
&B
) const {
3590 Register Dst
= MI
.getOperand(0).getReg();
3591 Register X
= MI
.getOperand(1).getReg();
3592 const unsigned Flags
= MI
.getFlags();
3593 MachineFunction
&MF
= B
.getMF();
3594 MachineRegisterInfo
&MRI
= *B
.getMRI();
3595 LLT Ty
= MRI
.getType(Dst
);
3596 const LLT F16
= LLT::scalar(16);
3597 const LLT F32
= LLT::scalar(32);
3598 const bool IsExp10
= MI
.getOpcode() == TargetOpcode::G_FEXP10
;
3601 // v_exp_f16 (fmul x, log2e)
3602 if (allowApproxFunc(MF
, Flags
)) {
3603 // TODO: Does this really require fast?
3604 legalizeFExpUnsafe(B
, Dst
, X
, Flags
);
3605 MI
.eraseFromParent();
3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3612 // Nothing in half is a denormal when promoted to f32.
3613 auto Ext
= B
.buildFPExt(F32
, X
, Flags
);
3614 Register Lowered
= MRI
.createGenericVirtualRegister(F32
);
3615 legalizeFExpUnsafe(B
, Lowered
, Ext
.getReg(0), Flags
);
3616 B
.buildFPTrunc(Dst
, Lowered
, Flags
);
3617 MI
.eraseFromParent();
3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624 // library behavior. Also, is known-not-daz source sufficient?
3625 if (allowApproxFunc(MF
, Flags
)) {
3626 legalizeFExpUnsafe(B
, Dst
, X
, Flags
);
3627 MI
.eraseFromParent();
3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636 // n = 64*m + j, 0 <= j < 64
3638 // e^x = 2^((64*m + j + f)/64)
3639 // = (2^m) * (2^(j/64)) * 2^(f/64)
3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3645 // e^x = (2^m) * (2^(j/64)) * e^r
3647 // (2^(j/64)) is precomputed
3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3655 const unsigned FlagsNoContract
= Flags
& ~MachineInstr::FmContract
;
3658 if (ST
.hasFastFMAF32()) {
3659 const float c_exp
= numbers::log2ef
;
3660 const float cc_exp
= 0x1.4ae0bep
-26f
; // c+cc are 49 bits
3661 const float c_exp10
= 0x1.a934f0p
+1f
;
3662 const float cc_exp10
= 0x1.2f346ep
-24f
;
3664 auto C
= B
.buildFConstant(Ty
, IsExp10
? c_exp10
: c_exp
);
3665 PH
= B
.buildFMul(Ty
, X
, C
, Flags
).getReg(0);
3666 auto NegPH
= B
.buildFNeg(Ty
, PH
, Flags
);
3667 auto FMA0
= B
.buildFMA(Ty
, X
, C
, NegPH
, Flags
);
3669 auto CC
= B
.buildFConstant(Ty
, IsExp10
? cc_exp10
: cc_exp
);
3670 PL
= B
.buildFMA(Ty
, X
, CC
, FMA0
, Flags
).getReg(0);
3672 const float ch_exp
= 0x1.714000p
+0f
;
3673 const float cl_exp
= 0x1.47652ap
-12f
; // ch + cl are 36 bits
3675 const float ch_exp10
= 0x1.a92000p
+1f
;
3676 const float cl_exp10
= 0x1.4f0978p
-11f
;
3678 auto MaskConst
= B
.buildConstant(Ty
, 0xfffff000);
3679 auto XH
= B
.buildAnd(Ty
, X
, MaskConst
);
3680 auto XL
= B
.buildFSub(Ty
, X
, XH
, Flags
);
3682 auto CH
= B
.buildFConstant(Ty
, IsExp10
? ch_exp10
: ch_exp
);
3683 PH
= B
.buildFMul(Ty
, XH
, CH
, Flags
).getReg(0);
3685 auto CL
= B
.buildFConstant(Ty
, IsExp10
? cl_exp10
: cl_exp
);
3686 auto XLCL
= B
.buildFMul(Ty
, XL
, CL
, Flags
);
3689 getMad(B
, Ty
, XL
.getReg(0), CH
.getReg(0), XLCL
.getReg(0), Flags
);
3690 PL
= getMad(B
, Ty
, XH
.getReg(0), CL
.getReg(0), Mad0
, Flags
);
3693 auto E
= B
.buildIntrinsicRoundeven(Ty
, PH
, Flags
);
3695 // It is unsafe to contract this fsub into the PH multiply.
3696 auto PHSubE
= B
.buildFSub(Ty
, PH
, E
, FlagsNoContract
);
3697 auto A
= B
.buildFAdd(Ty
, PHSubE
, PL
, Flags
);
3698 auto IntE
= B
.buildFPTOSI(LLT::scalar(32), E
);
3700 auto Exp2
= B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, {Ty
})
3701 .addUse(A
.getReg(0))
3703 auto R
= B
.buildFLdexp(Ty
, Exp2
, IntE
, Flags
);
3705 auto UnderflowCheckConst
=
3706 B
.buildFConstant(Ty
, IsExp10
? -0x1.66d3e8p
+5f
: -0x1.9d1da0p
+6f
);
3707 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3709 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), X
, UnderflowCheckConst
);
3711 R
= B
.buildSelect(Ty
, Underflow
, Zero
, R
);
3713 const auto &Options
= MF
.getTarget().Options
;
3715 if (!(Flags
& MachineInstr::FmNoInfs
) && !Options
.NoInfsFPMath
) {
3716 auto OverflowCheckConst
=
3717 B
.buildFConstant(Ty
, IsExp10
? 0x1.344136p
+5f
: 0x1.62e430p
+6f
);
3720 B
.buildFCmp(CmpInst::FCMP_OGT
, LLT::scalar(1), X
, OverflowCheckConst
);
3721 auto Inf
= B
.buildFConstant(Ty
, APFloat::getInf(APFloat::IEEEsingle()));
3722 R
= B
.buildSelect(Ty
, Overflow
, Inf
, R
, Flags
);
3725 B
.buildCopy(Dst
, R
);
3726 MI
.eraseFromParent();
3730 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr
&MI
,
3731 MachineIRBuilder
&B
) const {
3732 Register Dst
= MI
.getOperand(0).getReg();
3733 Register Src0
= MI
.getOperand(1).getReg();
3734 Register Src1
= MI
.getOperand(2).getReg();
3735 unsigned Flags
= MI
.getFlags();
3736 LLT Ty
= B
.getMRI()->getType(Dst
);
3737 const LLT F16
= LLT::float16();
3738 const LLT F32
= LLT::float32();
3741 auto Log
= B
.buildFLog2(F32
, Src0
, Flags
);
3742 auto Mul
= B
.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy
, {F32
})
3743 .addUse(Log
.getReg(0))
3746 B
.buildFExp2(Dst
, Mul
, Flags
);
3747 } else if (Ty
== F16
) {
3748 // There's no f16 fmul_legacy, so we need to convert for it.
3749 auto Log
= B
.buildFLog2(F16
, Src0
, Flags
);
3750 auto Ext0
= B
.buildFPExt(F32
, Log
, Flags
);
3751 auto Ext1
= B
.buildFPExt(F32
, Src1
, Flags
);
3752 auto Mul
= B
.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy
, {F32
})
3753 .addUse(Ext0
.getReg(0))
3754 .addUse(Ext1
.getReg(0))
3756 B
.buildFExp2(Dst
, B
.buildFPTrunc(F16
, Mul
), Flags
);
3760 MI
.eraseFromParent();
3764 // Find a source register, ignoring any possible source modifiers.
3765 static Register
stripAnySourceMods(Register OrigSrc
, MachineRegisterInfo
&MRI
) {
3766 Register ModSrc
= OrigSrc
;
3767 if (MachineInstr
*SrcFNeg
= getOpcodeDef(AMDGPU::G_FNEG
, ModSrc
, MRI
)) {
3768 ModSrc
= SrcFNeg
->getOperand(1).getReg();
3769 if (MachineInstr
*SrcFAbs
= getOpcodeDef(AMDGPU::G_FABS
, ModSrc
, MRI
))
3770 ModSrc
= SrcFAbs
->getOperand(1).getReg();
3771 } else if (MachineInstr
*SrcFAbs
= getOpcodeDef(AMDGPU::G_FABS
, ModSrc
, MRI
))
3772 ModSrc
= SrcFAbs
->getOperand(1).getReg();
3776 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr
&MI
,
3777 MachineRegisterInfo
&MRI
,
3778 MachineIRBuilder
&B
) const {
3780 const LLT S1
= LLT::scalar(1);
3781 const LLT F64
= LLT::float64();
3782 Register Dst
= MI
.getOperand(0).getReg();
3783 Register OrigSrc
= MI
.getOperand(1).getReg();
3784 unsigned Flags
= MI
.getFlags();
3785 assert(ST
.hasFractBug() && MRI
.getType(Dst
) == F64
&&
3786 "this should not have been custom lowered");
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3794 // Convert floor(x) to (x - fract(x))
3796 auto Fract
= B
.buildIntrinsic(Intrinsic::amdgcn_fract
, {F64
})
3800 // Give source modifier matching some assistance before obscuring a foldable
3803 // TODO: We can avoid the neg on the fract? The input sign to fract
3804 // shouldn't matter?
3805 Register ModSrc
= stripAnySourceMods(OrigSrc
, MRI
);
3808 B
.buildFConstant(F64
, llvm::bit_cast
<double>(0x3fefffffffffffff));
3810 Register Min
= MRI
.createGenericVirtualRegister(F64
);
3812 // We don't need to concern ourselves with the snan handling difference, so
3813 // use the one which will directly select.
3814 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
3815 if (MFI
->getMode().IEEE
)
3816 B
.buildFMinNumIEEE(Min
, Fract
, Const
, Flags
);
3818 B
.buildFMinNum(Min
, Fract
, Const
, Flags
);
3820 Register CorrectedFract
= Min
;
3821 if (!MI
.getFlag(MachineInstr::FmNoNans
)) {
3822 auto IsNan
= B
.buildFCmp(CmpInst::FCMP_ORD
, S1
, ModSrc
, ModSrc
, Flags
);
3823 CorrectedFract
= B
.buildSelect(F64
, IsNan
, ModSrc
, Min
, Flags
).getReg(0);
3826 auto NegFract
= B
.buildFNeg(F64
, CorrectedFract
, Flags
);
3827 B
.buildFAdd(Dst
, OrigSrc
, NegFract
, Flags
);
3829 MI
.eraseFromParent();
3833 // Turn an illegal packed v2s16 build vector into bit operations.
3834 // TODO: This should probably be a bitcast action in LegalizerHelper.
3835 bool AMDGPULegalizerInfo::legalizeBuildVector(
3836 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
) const {
3837 Register Dst
= MI
.getOperand(0).getReg();
3838 const LLT S32
= LLT::scalar(32);
3839 const LLT S16
= LLT::scalar(16);
3840 assert(MRI
.getType(Dst
) == LLT::fixed_vector(2, 16));
3842 Register Src0
= MI
.getOperand(1).getReg();
3843 Register Src1
= MI
.getOperand(2).getReg();
3845 if (MI
.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC
) {
3846 assert(MRI
.getType(Src0
) == S32
);
3847 Src0
= B
.buildTrunc(S16
, MI
.getOperand(1).getReg()).getReg(0);
3848 Src1
= B
.buildTrunc(S16
, MI
.getOperand(2).getReg()).getReg(0);
3851 auto Merge
= B
.buildMergeLikeInstr(S32
, {Src0
, Src1
});
3852 B
.buildBitcast(Dst
, Merge
);
3854 MI
.eraseFromParent();
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3860 // Source and accumulation registers must all be 32-bits.
3862 // TODO: When the multiply is uniform, we should produce a code sequence
3863 // that is better suited to instruction selection on the SALU. Instead of
3864 // the outer loop going over parts of the result, the outer loop should go
3865 // over parts of one of the factors. This should result in instruction
3866 // selection that makes full use of S_ADDC_U32 instructions.
3867 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper
&Helper
,
3868 MutableArrayRef
<Register
> Accum
,
3869 ArrayRef
<Register
> Src0
,
3870 ArrayRef
<Register
> Src1
,
3871 bool UsePartialMad64_32
,
3872 bool SeparateOddAlignedProducts
) const {
3873 // Use (possibly empty) vectors of S1 registers to represent the set of
3874 // carries from one pair of positions to the next.
3875 using Carry
= SmallVector
<Register
, 2>;
3877 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
3878 GISelKnownBits
&KB
= *Helper
.getKnownBits();
3880 const LLT S1
= LLT::scalar(1);
3881 const LLT S32
= LLT::scalar(32);
3882 const LLT S64
= LLT::scalar(64);
3887 auto getZero32
= [&]() -> Register
{
3889 Zero32
= B
.buildConstant(S32
, 0).getReg(0);
3892 auto getZero64
= [&]() -> Register
{
3894 Zero64
= B
.buildConstant(S64
, 0).getReg(0);
3898 SmallVector
<bool, 2> Src0KnownZeros
, Src1KnownZeros
;
3899 for (unsigned i
= 0; i
< Src0
.size(); ++i
) {
3900 Src0KnownZeros
.push_back(KB
.getKnownBits(Src0
[i
]).isZero());
3901 Src1KnownZeros
.push_back(KB
.getKnownBits(Src1
[i
]).isZero());
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3907 // Returns the carry-out, which is a single S1 register or null.
3909 [&](Register
&LocalAccum
, const Carry
&CarryIn
) -> Register
{
3910 if (CarryIn
.empty())
3913 bool HaveCarryOut
= true;
3914 Register CarryAccum
;
3915 if (CarryIn
.size() == 1) {
3917 LocalAccum
= B
.buildZExt(S32
, CarryIn
[0]).getReg(0);
3921 CarryAccum
= getZero32();
3923 CarryAccum
= B
.buildZExt(S32
, CarryIn
[0]).getReg(0);
3924 for (unsigned i
= 1; i
+ 1 < CarryIn
.size(); ++i
) {
3926 B
.buildUAdde(S32
, S1
, CarryAccum
, getZero32(), CarryIn
[i
])
3931 LocalAccum
= getZero32();
3932 HaveCarryOut
= false;
3937 B
.buildUAdde(S32
, S1
, CarryAccum
, LocalAccum
, CarryIn
.back());
3938 LocalAccum
= Add
.getReg(0);
3939 return HaveCarryOut
? Add
.getReg(1) : Register();
3942 // Build a multiply-add chain to compute
3944 // LocalAccum + (partial products at DstIndex)
3945 // + (opportunistic subset of CarryIn)
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3952 auto buildMadChain
=
3953 [&](MutableArrayRef
<Register
> LocalAccum
, unsigned DstIndex
, Carry
&CarryIn
)
3955 assert((DstIndex
+ 1 < Accum
.size() && LocalAccum
.size() == 2) ||
3956 (DstIndex
+ 1 >= Accum
.size() && LocalAccum
.size() == 1));
3961 // Use plain 32-bit multiplication for the most significant part of the
3962 // result by default.
3963 if (LocalAccum
.size() == 1 &&
3964 (!UsePartialMad64_32
|| !CarryIn
.empty())) {
3966 // Skip multiplication if one of the operands is 0
3967 unsigned j1
= DstIndex
- j0
;
3968 if (Src0KnownZeros
[j0
] || Src1KnownZeros
[j1
]) {
3972 auto Mul
= B
.buildMul(S32
, Src0
[j0
], Src1
[j1
]);
3973 if (!LocalAccum
[0] || KB
.getKnownBits(LocalAccum
[0]).isZero()) {
3974 LocalAccum
[0] = Mul
.getReg(0);
3976 if (CarryIn
.empty()) {
3977 LocalAccum
[0] = B
.buildAdd(S32
, LocalAccum
[0], Mul
).getReg(0);
3980 B
.buildUAdde(S32
, S1
, LocalAccum
[0], Mul
, CarryIn
.back())
3986 } while (j0
<= DstIndex
&& (!UsePartialMad64_32
|| !CarryIn
.empty()));
3989 // Build full 64-bit multiplies.
3990 if (j0
<= DstIndex
) {
3991 bool HaveSmallAccum
= false;
3994 if (LocalAccum
[0]) {
3995 if (LocalAccum
.size() == 1) {
3996 Tmp
= B
.buildAnyExt(S64
, LocalAccum
[0]).getReg(0);
3997 HaveSmallAccum
= true;
3998 } else if (LocalAccum
[1]) {
3999 Tmp
= B
.buildMergeLikeInstr(S64
, LocalAccum
).getReg(0);
4000 HaveSmallAccum
= false;
4002 Tmp
= B
.buildZExt(S64
, LocalAccum
[0]).getReg(0);
4003 HaveSmallAccum
= true;
4006 assert(LocalAccum
.size() == 1 || !LocalAccum
[1]);
4008 HaveSmallAccum
= true;
4012 unsigned j1
= DstIndex
- j0
;
4013 if (Src0KnownZeros
[j0
] || Src1KnownZeros
[j1
]) {
4017 auto Mad
= B
.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32
, {S64
, S1
},
4018 {Src0
[j0
], Src1
[j1
], Tmp
});
4019 Tmp
= Mad
.getReg(0);
4020 if (!HaveSmallAccum
)
4021 CarryOut
.push_back(Mad
.getReg(1));
4022 HaveSmallAccum
= false;
4025 } while (j0
<= DstIndex
);
4027 auto Unmerge
= B
.buildUnmerge(S32
, Tmp
);
4028 LocalAccum
[0] = Unmerge
.getReg(0);
4029 if (LocalAccum
.size() > 1)
4030 LocalAccum
[1] = Unmerge
.getReg(1);
4036 // Outer multiply loop, iterating over destination parts from least
4037 // significant to most significant parts.
4039 // The columns of the following diagram correspond to the destination parts
4040 // affected by one iteration of the outer loop (ignoring boundary
4043 // Dest index relative to 2 * i: 1 0 -1
4045 // Carries from previous iteration: e o
4046 // Even-aligned partial product sum: E E .
4047 // Odd-aligned partial product sum: O O
4049 // 'o' is OddCarry, 'e' is EvenCarry.
4050 // EE and OO are computed from partial products via buildMadChain and use
4051 // accumulation where possible and appropriate.
4053 Register SeparateOddCarry
;
4057 for (unsigned i
= 0; i
<= Accum
.size() / 2; ++i
) {
4058 Carry OddCarryIn
= std::move(OddCarry
);
4059 Carry EvenCarryIn
= std::move(EvenCarry
);
4063 // Partial products at offset 2 * i.
4064 if (2 * i
< Accum
.size()) {
4065 auto LocalAccum
= Accum
.drop_front(2 * i
).take_front(2);
4066 EvenCarry
= buildMadChain(LocalAccum
, 2 * i
, EvenCarryIn
);
4069 // Partial products at offset 2 * i - 1.
4071 if (!SeparateOddAlignedProducts
) {
4072 auto LocalAccum
= Accum
.drop_front(2 * i
- 1).take_front(2);
4073 OddCarry
= buildMadChain(LocalAccum
, 2 * i
- 1, OddCarryIn
);
4075 bool IsHighest
= 2 * i
>= Accum
.size();
4076 Register SeparateOddOut
[2];
4077 auto LocalAccum
= MutableArrayRef(SeparateOddOut
)
4078 .take_front(IsHighest
? 1 : 2);
4079 OddCarry
= buildMadChain(LocalAccum
, 2 * i
- 1, OddCarryIn
);
4085 Lo
= B
.buildUAddo(S32
, S1
, Accum
[2 * i
- 1], SeparateOddOut
[0]);
4087 Lo
= B
.buildAdd(S32
, Accum
[2 * i
- 1], SeparateOddOut
[0]);
4089 Lo
= B
.buildUAdde(S32
, S1
, Accum
[2 * i
- 1], SeparateOddOut
[0],
4092 Accum
[2 * i
- 1] = Lo
->getOperand(0).getReg();
4095 auto Hi
= B
.buildUAdde(S32
, S1
, Accum
[2 * i
], SeparateOddOut
[1],
4096 Lo
->getOperand(1).getReg());
4097 Accum
[2 * i
] = Hi
.getReg(0);
4098 SeparateOddCarry
= Hi
.getReg(1);
4103 // Add in the carries from the previous iteration
4105 if (Register CarryOut
= mergeCarry(Accum
[2 * i
- 1], OddCarryIn
))
4106 EvenCarryIn
.push_back(CarryOut
);
4108 if (2 * i
< Accum
.size()) {
4109 if (Register CarryOut
= mergeCarry(Accum
[2 * i
], EvenCarryIn
))
4110 OddCarry
.push_back(CarryOut
);
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4118 // TODO: If the multiply is followed by an addition, we should attempt to
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper
&Helper
,
4121 MachineInstr
&MI
) const {
4122 assert(ST
.hasMad64_32());
4123 assert(MI
.getOpcode() == TargetOpcode::G_MUL
);
4125 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
4126 MachineRegisterInfo
&MRI
= *B
.getMRI();
4128 Register DstReg
= MI
.getOperand(0).getReg();
4129 Register Src0
= MI
.getOperand(1).getReg();
4130 Register Src1
= MI
.getOperand(2).getReg();
4132 LLT Ty
= MRI
.getType(DstReg
);
4133 assert(Ty
.isScalar());
4135 unsigned Size
= Ty
.getSizeInBits();
4136 unsigned NumParts
= Size
/ 32;
4137 assert((Size
% 32) == 0);
4138 assert(NumParts
>= 2);
4140 // Whether to use MAD_64_32 for partial products whose high half is
4141 // discarded. This avoids some ADD instructions but risks false dependency
4142 // stalls on some subtargets in some cases.
4143 const bool UsePartialMad64_32
= ST
.getGeneration() < AMDGPUSubtarget::GFX10
;
4145 // Whether to compute odd-aligned partial products separately. This is
4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147 // in an even-aligned VGPR.
4148 const bool SeparateOddAlignedProducts
= ST
.hasFullRate64Ops();
4150 LLT S32
= LLT::scalar(32);
4151 SmallVector
<Register
, 2> Src0Parts
, Src1Parts
;
4152 for (unsigned i
= 0; i
< NumParts
; ++i
) {
4153 Src0Parts
.push_back(MRI
.createGenericVirtualRegister(S32
));
4154 Src1Parts
.push_back(MRI
.createGenericVirtualRegister(S32
));
4156 B
.buildUnmerge(Src0Parts
, Src0
);
4157 B
.buildUnmerge(Src1Parts
, Src1
);
4159 SmallVector
<Register
, 2> AccumRegs(NumParts
);
4160 buildMultiply(Helper
, AccumRegs
, Src0Parts
, Src1Parts
, UsePartialMad64_32
,
4161 SeparateOddAlignedProducts
);
4163 B
.buildMergeLikeInstr(DstReg
, AccumRegs
);
4164 MI
.eraseFromParent();
4168 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170 // case with a single min instruction instead of a compare+select.
4171 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr
&MI
,
4172 MachineRegisterInfo
&MRI
,
4173 MachineIRBuilder
&B
) const {
4174 Register Dst
= MI
.getOperand(0).getReg();
4175 Register Src
= MI
.getOperand(1).getReg();
4176 LLT DstTy
= MRI
.getType(Dst
);
4177 LLT SrcTy
= MRI
.getType(Src
);
4179 unsigned NewOpc
= MI
.getOpcode() == AMDGPU::G_CTLZ
4180 ? AMDGPU::G_AMDGPU_FFBH_U32
4181 : AMDGPU::G_AMDGPU_FFBL_B32
;
4182 auto Tmp
= B
.buildInstr(NewOpc
, {DstTy
}, {Src
});
4183 B
.buildUMin(Dst
, Tmp
, B
.buildConstant(DstTy
, SrcTy
.getSizeInBits()));
4185 MI
.eraseFromParent();
4189 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr
&MI
,
4190 MachineRegisterInfo
&MRI
,
4191 MachineIRBuilder
&B
) const {
4192 Register Dst
= MI
.getOperand(0).getReg();
4193 Register Src
= MI
.getOperand(1).getReg();
4194 LLT SrcTy
= MRI
.getType(Src
);
4195 TypeSize NumBits
= SrcTy
.getSizeInBits();
4197 assert(NumBits
< 32u);
4199 auto ShiftAmt
= B
.buildConstant(S32
, 32u - NumBits
);
4200 auto Extend
= B
.buildAnyExt(S32
, {Src
}).getReg(0u);
4201 auto Shift
= B
.buildShl(S32
, Extend
, ShiftAmt
);
4202 auto Ctlz
= B
.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32
, {S32
}, {Shift
});
4203 B
.buildTrunc(Dst
, Ctlz
);
4204 MI
.eraseFromParent();
4208 // Check that this is a G_XOR x, -1
4209 static bool isNot(const MachineRegisterInfo
&MRI
, const MachineInstr
&MI
) {
4210 if (MI
.getOpcode() != TargetOpcode::G_XOR
)
4212 auto ConstVal
= getIConstantVRegSExtVal(MI
.getOperand(2).getReg(), MRI
);
4213 return ConstVal
&& *ConstVal
== -1;
4216 // Return the use branch instruction, otherwise null if the usage is invalid.
4217 static MachineInstr
*
4218 verifyCFIntrinsic(MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineInstr
*&Br
,
4219 MachineBasicBlock
*&UncondBrTarget
, bool &Negated
) {
4220 Register CondDef
= MI
.getOperand(0).getReg();
4221 if (!MRI
.hasOneNonDBGUse(CondDef
))
4224 MachineBasicBlock
*Parent
= MI
.getParent();
4225 MachineInstr
*UseMI
= &*MRI
.use_instr_nodbg_begin(CondDef
);
4227 if (isNot(MRI
, *UseMI
)) {
4228 Register NegatedCond
= UseMI
->getOperand(0).getReg();
4229 if (!MRI
.hasOneNonDBGUse(NegatedCond
))
4232 // We're deleting the def of this value, so we need to remove it.
4233 eraseInstr(*UseMI
, MRI
);
4235 UseMI
= &*MRI
.use_instr_nodbg_begin(NegatedCond
);
4239 if (UseMI
->getParent() != Parent
|| UseMI
->getOpcode() != AMDGPU::G_BRCOND
)
4242 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243 MachineBasicBlock::iterator Next
= std::next(UseMI
->getIterator());
4244 if (Next
== Parent
->end()) {
4245 MachineFunction::iterator NextMBB
= std::next(Parent
->getIterator());
4246 if (NextMBB
== Parent
->getParent()->end()) // Illegal intrinsic use.
4248 UncondBrTarget
= &*NextMBB
;
4250 if (Next
->getOpcode() != AMDGPU::G_BR
)
4253 UncondBrTarget
= Br
->getOperand(0).getMBB();
4259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg
, MachineIRBuilder
&B
,
4260 const ArgDescriptor
*Arg
,
4261 const TargetRegisterClass
*ArgRC
,
4263 MCRegister SrcReg
= Arg
->getRegister();
4264 assert(Register::isPhysicalRegister(SrcReg
) && "Physical register expected");
4265 assert(DstReg
.isVirtual() && "Virtual register expected");
4267 Register LiveIn
= getFunctionLiveInPhysReg(B
.getMF(), B
.getTII(), SrcReg
,
4268 *ArgRC
, B
.getDebugLoc(), ArgTy
);
4269 if (Arg
->isMasked()) {
4270 // TODO: Should we try to emit this once in the entry block?
4271 const LLT S32
= LLT::scalar(32);
4272 const unsigned Mask
= Arg
->getMask();
4273 const unsigned Shift
= llvm::countr_zero
<unsigned>(Mask
);
4275 Register AndMaskSrc
= LiveIn
;
4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4280 auto ShiftAmt
= B
.buildConstant(S32
, Shift
);
4281 AndMaskSrc
= B
.buildLShr(S32
, LiveIn
, ShiftAmt
).getReg(0);
4284 B
.buildAnd(DstReg
, AndMaskSrc
, B
.buildConstant(S32
, Mask
>> Shift
));
4286 B
.buildCopy(DstReg
, LiveIn
);
4292 bool AMDGPULegalizerInfo::loadInputValue(
4293 Register DstReg
, MachineIRBuilder
&B
,
4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType
) const {
4295 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4296 const ArgDescriptor
*Arg
= nullptr;
4297 const TargetRegisterClass
*ArgRC
;
4300 CallingConv::ID CC
= B
.getMF().getFunction().getCallingConv();
4301 const ArgDescriptor WorkGroupIDX
=
4302 ArgDescriptor::createRegister(AMDGPU::TTMP9
);
4303 // If GridZ is not programmed in an entry function then the hardware will set
4304 // it to all zeros, so there is no need to mask the GridY value in the low
4306 const ArgDescriptor WorkGroupIDY
= ArgDescriptor::createRegister(
4308 AMDGPU::isEntryFunctionCC(CC
) && !MFI
->hasWorkGroupIDZ() ? ~0u : 0xFFFFu
);
4309 const ArgDescriptor WorkGroupIDZ
=
4310 ArgDescriptor::createRegister(AMDGPU::TTMP7
, 0xFFFF0000u
);
4311 if (ST
.hasArchitectedSGPRs() &&
4312 (AMDGPU::isCompute(CC
) || CC
== CallingConv::AMDGPU_Gfx
)) {
4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X
:
4315 Arg
= &WorkGroupIDX
;
4316 ArgRC
= &AMDGPU::SReg_32RegClass
;
4317 ArgTy
= LLT::scalar(32);
4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
:
4320 Arg
= &WorkGroupIDY
;
4321 ArgRC
= &AMDGPU::SReg_32RegClass
;
4322 ArgTy
= LLT::scalar(32);
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
:
4325 Arg
= &WorkGroupIDZ
;
4326 ArgRC
= &AMDGPU::SReg_32RegClass
;
4327 ArgTy
= LLT::scalar(32);
4335 std::tie(Arg
, ArgRC
, ArgTy
) = MFI
->getPreloadedValue(ArgType
);
4338 if (ArgType
== AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
) {
4339 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340 // case the pointer argument may be missing and we use null.
4341 B
.buildConstant(DstReg
, 0);
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4346 // attributes uses the corresponding intrinsic.
4347 B
.buildUndef(DstReg
);
4351 if (!Arg
->isRegister() || !Arg
->getRegister().isValid())
4352 return false; // TODO: Handle these
4353 return loadInputValue(DstReg
, B
, Arg
, ArgRC
, ArgTy
);
4356 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
,
4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType
) const {
4359 if (!loadInputValue(MI
.getOperand(0).getReg(), B
, ArgType
))
4362 MI
.eraseFromParent();
4366 static bool replaceWithConstant(MachineIRBuilder
&B
, MachineInstr
&MI
,
4368 B
.buildConstant(MI
.getOperand(0).getReg(), C
);
4369 MI
.eraseFromParent();
4373 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
,
4375 unsigned Dim
, AMDGPUFunctionArgInfo::PreloadedValue ArgType
) const {
4376 unsigned MaxID
= ST
.getMaxWorkitemID(B
.getMF().getFunction(), Dim
);
4378 return replaceWithConstant(B
, MI
, 0);
4380 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4381 const ArgDescriptor
*Arg
;
4382 const TargetRegisterClass
*ArgRC
;
4384 std::tie(Arg
, ArgRC
, ArgTy
) = MFI
->getPreloadedValue(ArgType
);
4386 Register DstReg
= MI
.getOperand(0).getReg();
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4389 // attributes uses the corresponding intrinsic.
4390 B
.buildUndef(DstReg
);
4391 MI
.eraseFromParent();
4395 if (Arg
->isMasked()) {
4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397 // masking operations anyway.
4399 // TODO: We could assert the top bit is 0 for the source copy.
4400 if (!loadInputValue(DstReg
, B
, ArgType
))
4403 Register TmpReg
= MRI
.createGenericVirtualRegister(LLT::scalar(32));
4404 if (!loadInputValue(TmpReg
, B
, ArgType
))
4406 B
.buildAssertZExt(DstReg
, TmpReg
, llvm::bit_width(MaxID
));
4409 MI
.eraseFromParent();
4413 Register
AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder
&B
,
4414 int64_t Offset
) const {
4415 LLT PtrTy
= LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS
, 64);
4416 Register KernArgReg
= B
.getMRI()->createGenericVirtualRegister(PtrTy
);
4418 // TODO: If we passed in the base kernel offset we could have a better
4419 // alignment than 4, but we don't really need it.
4420 if (!loadInputValue(KernArgReg
, B
,
4421 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
))
4422 llvm_unreachable("failed to find kernarg segment ptr");
4424 auto COffset
= B
.buildConstant(LLT::scalar(64), Offset
);
4425 // TODO: Should get nuw
4426 return B
.buildPtrAdd(PtrTy
, KernArgReg
, COffset
).getReg(0);
4429 /// Legalize a value that's loaded from kernel arguments. This is only used by
4430 /// legacy intrinsics.
4431 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr
&MI
,
4432 MachineIRBuilder
&B
,
4434 Align Alignment
) const {
4435 Register DstReg
= MI
.getOperand(0).getReg();
4437 assert(B
.getMRI()->getType(DstReg
) == LLT::scalar(32) &&
4438 "unexpected kernarg parameter type");
4440 Register Ptr
= getKernargParameterPtr(B
, Offset
);
4441 MachinePointerInfo
PtrInfo(AMDGPUAS::CONSTANT_ADDRESS
);
4442 B
.buildLoad(DstReg
, Ptr
, PtrInfo
, Align(4),
4443 MachineMemOperand::MODereferenceable
|
4444 MachineMemOperand::MOInvariant
);
4445 MI
.eraseFromParent();
4449 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr
&MI
,
4450 MachineRegisterInfo
&MRI
,
4451 MachineIRBuilder
&B
) const {
4452 Register Dst
= MI
.getOperand(0).getReg();
4453 LLT DstTy
= MRI
.getType(Dst
);
4454 LLT S16
= LLT::scalar(16);
4455 LLT S32
= LLT::scalar(32);
4456 LLT S64
= LLT::scalar(64);
4459 return legalizeFDIV16(MI
, MRI
, B
);
4461 return legalizeFDIV32(MI
, MRI
, B
);
4463 return legalizeFDIV64(MI
, MRI
, B
);
4468 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder
&B
,
4473 const LLT S1
= LLT::scalar(1);
4474 const LLT S32
= LLT::scalar(32);
4476 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477 // algorithm used here.
4479 // Initial estimate of inv(y).
4480 auto FloatY
= B
.buildUITOFP(S32
, Y
);
4481 auto RcpIFlag
= B
.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG
, {S32
}, {FloatY
});
4482 auto Scale
= B
.buildFConstant(S32
, llvm::bit_cast
<float>(0x4f7ffffe));
4483 auto ScaledY
= B
.buildFMul(S32
, RcpIFlag
, Scale
);
4484 auto Z
= B
.buildFPTOUI(S32
, ScaledY
);
4486 // One round of UNR.
4487 auto NegY
= B
.buildSub(S32
, B
.buildConstant(S32
, 0), Y
);
4488 auto NegYZ
= B
.buildMul(S32
, NegY
, Z
);
4489 Z
= B
.buildAdd(S32
, Z
, B
.buildUMulH(S32
, Z
, NegYZ
));
4491 // Quotient/remainder estimate.
4492 auto Q
= B
.buildUMulH(S32
, X
, Z
);
4493 auto R
= B
.buildSub(S32
, X
, B
.buildMul(S32
, Q
, Y
));
4495 // First quotient/remainder refinement.
4496 auto One
= B
.buildConstant(S32
, 1);
4497 auto Cond
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, R
, Y
);
4499 Q
= B
.buildSelect(S32
, Cond
, B
.buildAdd(S32
, Q
, One
), Q
);
4500 R
= B
.buildSelect(S32
, Cond
, B
.buildSub(S32
, R
, Y
), R
);
4502 // Second quotient/remainder refinement.
4503 Cond
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, R
, Y
);
4505 B
.buildSelect(DstDivReg
, Cond
, B
.buildAdd(S32
, Q
, One
), Q
);
4508 B
.buildSelect(DstRemReg
, Cond
, B
.buildSub(S32
, R
, Y
), R
);
4511 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4513 // Return lo, hi of result
4515 // %cvt.lo = G_UITOFP Val.lo
4516 // %cvt.hi = G_UITOFP Val.hi
4517 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4519 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4521 // %trunc = G_INTRINSIC_TRUNC %mul2
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524 static std::pair
<Register
, Register
> emitReciprocalU64(MachineIRBuilder
&B
,
4526 const LLT S32
= LLT::scalar(32);
4527 auto Unmerge
= B
.buildUnmerge(S32
, Val
);
4529 auto CvtLo
= B
.buildUITOFP(S32
, Unmerge
.getReg(0));
4530 auto CvtHi
= B
.buildUITOFP(S32
, Unmerge
.getReg(1));
4532 auto Mad
= B
.buildFMAD(
4533 S32
, CvtHi
, // 2**32
4534 B
.buildFConstant(S32
, llvm::bit_cast
<float>(0x4f800000)), CvtLo
);
4536 auto Rcp
= B
.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG
, {S32
}, {Mad
});
4537 auto Mul1
= B
.buildFMul(
4538 S32
, Rcp
, B
.buildFConstant(S32
, llvm::bit_cast
<float>(0x5f7ffffc)));
4541 auto Mul2
= B
.buildFMul(
4542 S32
, Mul1
, B
.buildFConstant(S32
, llvm::bit_cast
<float>(0x2f800000)));
4543 auto Trunc
= B
.buildIntrinsicTrunc(S32
, Mul2
);
4546 auto Mad2
= B
.buildFMAD(
4547 S32
, Trunc
, B
.buildFConstant(S32
, llvm::bit_cast
<float>(0xcf800000)),
4550 auto ResultLo
= B
.buildFPTOUI(S32
, Mad2
);
4551 auto ResultHi
= B
.buildFPTOUI(S32
, Trunc
);
4553 return {ResultLo
.getReg(0), ResultHi
.getReg(0)};
4556 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder
&B
,
4560 Register Denom
) const {
4561 const LLT S32
= LLT::scalar(32);
4562 const LLT S64
= LLT::scalar(64);
4563 const LLT S1
= LLT::scalar(1);
4564 Register RcpLo
, RcpHi
;
4566 std::tie(RcpLo
, RcpHi
) = emitReciprocalU64(B
, Denom
);
4568 auto Rcp
= B
.buildMergeLikeInstr(S64
, {RcpLo
, RcpHi
});
4570 auto Zero64
= B
.buildConstant(S64
, 0);
4571 auto NegDenom
= B
.buildSub(S64
, Zero64
, Denom
);
4573 auto MulLo1
= B
.buildMul(S64
, NegDenom
, Rcp
);
4574 auto MulHi1
= B
.buildUMulH(S64
, Rcp
, MulLo1
);
4576 auto UnmergeMulHi1
= B
.buildUnmerge(S32
, MulHi1
);
4577 Register MulHi1_Lo
= UnmergeMulHi1
.getReg(0);
4578 Register MulHi1_Hi
= UnmergeMulHi1
.getReg(1);
4580 auto Add1_Lo
= B
.buildUAddo(S32
, S1
, RcpLo
, MulHi1_Lo
);
4581 auto Add1_Hi
= B
.buildUAdde(S32
, S1
, RcpHi
, MulHi1_Hi
, Add1_Lo
.getReg(1));
4582 auto Add1
= B
.buildMergeLikeInstr(S64
, {Add1_Lo
, Add1_Hi
});
4584 auto MulLo2
= B
.buildMul(S64
, NegDenom
, Add1
);
4585 auto MulHi2
= B
.buildUMulH(S64
, Add1
, MulLo2
);
4586 auto UnmergeMulHi2
= B
.buildUnmerge(S32
, MulHi2
);
4587 Register MulHi2_Lo
= UnmergeMulHi2
.getReg(0);
4588 Register MulHi2_Hi
= UnmergeMulHi2
.getReg(1);
4590 auto Zero32
= B
.buildConstant(S32
, 0);
4591 auto Add2_Lo
= B
.buildUAddo(S32
, S1
, Add1_Lo
, MulHi2_Lo
);
4592 auto Add2_Hi
= B
.buildUAdde(S32
, S1
, Add1_Hi
, MulHi2_Hi
, Add2_Lo
.getReg(1));
4593 auto Add2
= B
.buildMergeLikeInstr(S64
, {Add2_Lo
, Add2_Hi
});
4595 auto UnmergeNumer
= B
.buildUnmerge(S32
, Numer
);
4596 Register NumerLo
= UnmergeNumer
.getReg(0);
4597 Register NumerHi
= UnmergeNumer
.getReg(1);
4599 auto MulHi3
= B
.buildUMulH(S64
, Numer
, Add2
);
4600 auto Mul3
= B
.buildMul(S64
, Denom
, MulHi3
);
4601 auto UnmergeMul3
= B
.buildUnmerge(S32
, Mul3
);
4602 Register Mul3_Lo
= UnmergeMul3
.getReg(0);
4603 Register Mul3_Hi
= UnmergeMul3
.getReg(1);
4604 auto Sub1_Lo
= B
.buildUSubo(S32
, S1
, NumerLo
, Mul3_Lo
);
4605 auto Sub1_Hi
= B
.buildUSube(S32
, S1
, NumerHi
, Mul3_Hi
, Sub1_Lo
.getReg(1));
4606 auto Sub1_Mi
= B
.buildSub(S32
, NumerHi
, Mul3_Hi
);
4607 auto Sub1
= B
.buildMergeLikeInstr(S64
, {Sub1_Lo
, Sub1_Hi
});
4609 auto UnmergeDenom
= B
.buildUnmerge(S32
, Denom
);
4610 Register DenomLo
= UnmergeDenom
.getReg(0);
4611 Register DenomHi
= UnmergeDenom
.getReg(1);
4613 auto CmpHi
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub1_Hi
, DenomHi
);
4614 auto C1
= B
.buildSExt(S32
, CmpHi
);
4616 auto CmpLo
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub1_Lo
, DenomLo
);
4617 auto C2
= B
.buildSExt(S32
, CmpLo
);
4619 auto CmpEq
= B
.buildICmp(CmpInst::ICMP_EQ
, S1
, Sub1_Hi
, DenomHi
);
4620 auto C3
= B
.buildSelect(S32
, CmpEq
, C2
, C1
);
4622 // TODO: Here and below portions of the code can be enclosed into if/endif.
4623 // Currently control flow is unconditional and we have 4 selects after
4624 // potential endif to substitute PHIs.
4627 auto Sub2_Lo
= B
.buildUSubo(S32
, S1
, Sub1_Lo
, DenomLo
);
4628 auto Sub2_Mi
= B
.buildUSube(S32
, S1
, Sub1_Mi
, DenomHi
, Sub1_Lo
.getReg(1));
4629 auto Sub2_Hi
= B
.buildUSube(S32
, S1
, Sub2_Mi
, Zero32
, Sub2_Lo
.getReg(1));
4630 auto Sub2
= B
.buildMergeLikeInstr(S64
, {Sub2_Lo
, Sub2_Hi
});
4632 auto One64
= B
.buildConstant(S64
, 1);
4633 auto Add3
= B
.buildAdd(S64
, MulHi3
, One64
);
4636 B
.buildSExt(S32
, B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub2_Hi
, DenomHi
));
4638 B
.buildSExt(S32
, B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub2_Lo
, DenomLo
));
4639 auto C6
= B
.buildSelect(
4640 S32
, B
.buildICmp(CmpInst::ICMP_EQ
, S1
, Sub2_Hi
, DenomHi
), C5
, C4
);
4643 auto Add4
= B
.buildAdd(S64
, Add3
, One64
);
4644 auto Sub3_Lo
= B
.buildUSubo(S32
, S1
, Sub2_Lo
, DenomLo
);
4646 auto Sub3_Mi
= B
.buildUSube(S32
, S1
, Sub2_Mi
, DenomHi
, Sub2_Lo
.getReg(1));
4647 auto Sub3_Hi
= B
.buildUSube(S32
, S1
, Sub3_Mi
, Zero32
, Sub3_Lo
.getReg(1));
4648 auto Sub3
= B
.buildMergeLikeInstr(S64
, {Sub3_Lo
, Sub3_Hi
});
4654 auto Sel1
= B
.buildSelect(
4655 S64
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C6
, Zero32
), Add4
, Add3
);
4656 B
.buildSelect(DstDivReg
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C3
, Zero32
),
4661 auto Sel2
= B
.buildSelect(
4662 S64
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C6
, Zero32
), Sub3
, Sub2
);
4663 B
.buildSelect(DstRemReg
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C3
, Zero32
),
4668 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr
&MI
,
4669 MachineRegisterInfo
&MRI
,
4670 MachineIRBuilder
&B
) const {
4671 Register DstDivReg
, DstRemReg
;
4672 switch (MI
.getOpcode()) {
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_UDIV
: {
4676 DstDivReg
= MI
.getOperand(0).getReg();
4679 case AMDGPU::G_UREM
: {
4680 DstRemReg
= MI
.getOperand(0).getReg();
4683 case AMDGPU::G_UDIVREM
: {
4684 DstDivReg
= MI
.getOperand(0).getReg();
4685 DstRemReg
= MI
.getOperand(1).getReg();
4690 const LLT S64
= LLT::scalar(64);
4691 const LLT S32
= LLT::scalar(32);
4692 const unsigned FirstSrcOpIdx
= MI
.getNumExplicitDefs();
4693 Register Num
= MI
.getOperand(FirstSrcOpIdx
).getReg();
4694 Register Den
= MI
.getOperand(FirstSrcOpIdx
+ 1).getReg();
4695 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
4698 legalizeUnsignedDIV_REM32Impl(B
, DstDivReg
, DstRemReg
, Num
, Den
);
4700 legalizeUnsignedDIV_REM64Impl(B
, DstDivReg
, DstRemReg
, Num
, Den
);
4704 MI
.eraseFromParent();
4708 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr
&MI
,
4709 MachineRegisterInfo
&MRI
,
4710 MachineIRBuilder
&B
) const {
4711 const LLT S64
= LLT::scalar(64);
4712 const LLT S32
= LLT::scalar(32);
4714 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
4715 if (Ty
!= S32
&& Ty
!= S64
)
4718 const unsigned FirstSrcOpIdx
= MI
.getNumExplicitDefs();
4719 Register LHS
= MI
.getOperand(FirstSrcOpIdx
).getReg();
4720 Register RHS
= MI
.getOperand(FirstSrcOpIdx
+ 1).getReg();
4722 auto SignBitOffset
= B
.buildConstant(S32
, Ty
.getSizeInBits() - 1);
4723 auto LHSign
= B
.buildAShr(Ty
, LHS
, SignBitOffset
);
4724 auto RHSign
= B
.buildAShr(Ty
, RHS
, SignBitOffset
);
4726 LHS
= B
.buildAdd(Ty
, LHS
, LHSign
).getReg(0);
4727 RHS
= B
.buildAdd(Ty
, RHS
, RHSign
).getReg(0);
4729 LHS
= B
.buildXor(Ty
, LHS
, LHSign
).getReg(0);
4730 RHS
= B
.buildXor(Ty
, RHS
, RHSign
).getReg(0);
4732 Register DstDivReg
, DstRemReg
, TmpDivReg
, TmpRemReg
;
4733 switch (MI
.getOpcode()) {
4735 llvm_unreachable("Unexpected opcode!");
4736 case AMDGPU::G_SDIV
: {
4737 DstDivReg
= MI
.getOperand(0).getReg();
4738 TmpDivReg
= MRI
.createGenericVirtualRegister(Ty
);
4741 case AMDGPU::G_SREM
: {
4742 DstRemReg
= MI
.getOperand(0).getReg();
4743 TmpRemReg
= MRI
.createGenericVirtualRegister(Ty
);
4746 case AMDGPU::G_SDIVREM
: {
4747 DstDivReg
= MI
.getOperand(0).getReg();
4748 DstRemReg
= MI
.getOperand(1).getReg();
4749 TmpDivReg
= MRI
.createGenericVirtualRegister(Ty
);
4750 TmpRemReg
= MRI
.createGenericVirtualRegister(Ty
);
4756 legalizeUnsignedDIV_REM32Impl(B
, TmpDivReg
, TmpRemReg
, LHS
, RHS
);
4758 legalizeUnsignedDIV_REM64Impl(B
, TmpDivReg
, TmpRemReg
, LHS
, RHS
);
4761 auto Sign
= B
.buildXor(Ty
, LHSign
, RHSign
).getReg(0);
4762 auto SignXor
= B
.buildXor(Ty
, TmpDivReg
, Sign
).getReg(0);
4763 B
.buildSub(DstDivReg
, SignXor
, Sign
);
4767 auto Sign
= LHSign
.getReg(0); // Remainder sign is the same as LHS
4768 auto SignXor
= B
.buildXor(Ty
, TmpRemReg
, Sign
).getReg(0);
4769 B
.buildSub(DstRemReg
, SignXor
, Sign
);
4772 MI
.eraseFromParent();
4776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr
&MI
,
4777 MachineRegisterInfo
&MRI
,
4778 MachineIRBuilder
&B
) const {
4779 Register Res
= MI
.getOperand(0).getReg();
4780 Register LHS
= MI
.getOperand(1).getReg();
4781 Register RHS
= MI
.getOperand(2).getReg();
4782 uint16_t Flags
= MI
.getFlags();
4783 LLT ResTy
= MRI
.getType(Res
);
4785 const MachineFunction
&MF
= B
.getMF();
4786 bool AllowInaccurateRcp
= MI
.getFlag(MachineInstr::FmAfn
) ||
4787 MF
.getTarget().Options
.UnsafeFPMath
;
4789 if (auto CLHS
= getConstantFPVRegVal(LHS
, MRI
)) {
4790 if (!AllowInaccurateRcp
&& ResTy
!= LLT::scalar(16))
4793 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4794 // the CI documentation has a worst case error of 1 ulp.
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796 // use it as long as we aren't trying to use denormals.
4798 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4801 if (CLHS
->isExactlyValue(1.0)) {
4802 B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, Res
)
4806 MI
.eraseFromParent();
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS
->isExactlyValue(-1.0)) {
4812 auto FNeg
= B
.buildFNeg(ResTy
, RHS
, Flags
);
4813 B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, Res
)
4814 .addUse(FNeg
.getReg(0))
4817 MI
.eraseFromParent();
4822 // For f16 require afn or arcp.
4823 // For f32 require afn.
4824 if (!AllowInaccurateRcp
&& (ResTy
!= LLT::scalar(16) ||
4825 !MI
.getFlag(MachineInstr::FmArcp
)))
4828 // x / y -> x * (1.0 / y)
4829 auto RCP
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {ResTy
})
4832 B
.buildFMul(Res
, LHS
, RCP
, Flags
);
4834 MI
.eraseFromParent();
4838 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr
&MI
,
4839 MachineRegisterInfo
&MRI
,
4840 MachineIRBuilder
&B
) const {
4841 Register Res
= MI
.getOperand(0).getReg();
4842 Register X
= MI
.getOperand(1).getReg();
4843 Register Y
= MI
.getOperand(2).getReg();
4844 uint16_t Flags
= MI
.getFlags();
4845 LLT ResTy
= MRI
.getType(Res
);
4847 const MachineFunction
&MF
= B
.getMF();
4848 bool AllowInaccurateRcp
= MF
.getTarget().Options
.UnsafeFPMath
||
4849 MI
.getFlag(MachineInstr::FmAfn
);
4851 if (!AllowInaccurateRcp
)
4854 auto NegY
= B
.buildFNeg(ResTy
, Y
);
4855 auto One
= B
.buildFConstant(ResTy
, 1.0);
4857 auto R
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {ResTy
})
4861 auto Tmp0
= B
.buildFMA(ResTy
, NegY
, R
, One
);
4862 R
= B
.buildFMA(ResTy
, Tmp0
, R
, R
);
4864 auto Tmp1
= B
.buildFMA(ResTy
, NegY
, R
, One
);
4865 R
= B
.buildFMA(ResTy
, Tmp1
, R
, R
);
4867 auto Ret
= B
.buildFMul(ResTy
, X
, R
);
4868 auto Tmp2
= B
.buildFMA(ResTy
, NegY
, Ret
, X
);
4870 B
.buildFMA(Res
, Tmp2
, R
, Ret
);
4871 MI
.eraseFromParent();
4875 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr
&MI
,
4876 MachineRegisterInfo
&MRI
,
4877 MachineIRBuilder
&B
) const {
4878 if (legalizeFastUnsafeFDIV(MI
, MRI
, B
))
4881 Register Res
= MI
.getOperand(0).getReg();
4882 Register LHS
= MI
.getOperand(1).getReg();
4883 Register RHS
= MI
.getOperand(2).getReg();
4885 uint16_t Flags
= MI
.getFlags();
4887 LLT S16
= LLT::scalar(16);
4888 LLT S32
= LLT::scalar(32);
4890 auto LHSExt
= B
.buildFPExt(S32
, LHS
, Flags
);
4891 auto RHSExt
= B
.buildFPExt(S32
, RHS
, Flags
);
4893 auto RCP
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S32
})
4894 .addUse(RHSExt
.getReg(0))
4897 auto QUOT
= B
.buildFMul(S32
, LHSExt
, RCP
, Flags
);
4898 auto RDst
= B
.buildFPTrunc(S16
, QUOT
, Flags
);
4900 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, Res
)
4901 .addUse(RDst
.getReg(0))
4906 MI
.eraseFromParent();
4910 static constexpr unsigned SPDenormModeBitField
=
4911 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE
, 4, 2);
4913 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4914 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4915 static void toggleSPDenormMode(bool Enable
, MachineIRBuilder
&B
,
4916 const GCNSubtarget
&ST
,
4917 SIModeRegisterDefaults Mode
) {
4918 // Set SP denorm mode to this value.
4919 unsigned SPDenormMode
=
4920 Enable
? FP_DENORM_FLUSH_NONE
: Mode
.fpDenormModeSPValue();
4922 if (ST
.hasDenormModeInst()) {
4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924 uint32_t DPDenormModeDefault
= Mode
.fpDenormModeDPValue();
4926 uint32_t NewDenormModeValue
= SPDenormMode
| (DPDenormModeDefault
<< 2);
4927 B
.buildInstr(AMDGPU::S_DENORM_MODE
)
4928 .addImm(NewDenormModeValue
);
4931 B
.buildInstr(AMDGPU::S_SETREG_IMM32_B32
)
4932 .addImm(SPDenormMode
)
4933 .addImm(SPDenormModeBitField
);
4937 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr
&MI
,
4938 MachineRegisterInfo
&MRI
,
4939 MachineIRBuilder
&B
) const {
4940 if (legalizeFastUnsafeFDIV(MI
, MRI
, B
))
4943 Register Res
= MI
.getOperand(0).getReg();
4944 Register LHS
= MI
.getOperand(1).getReg();
4945 Register RHS
= MI
.getOperand(2).getReg();
4946 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4947 SIModeRegisterDefaults Mode
= MFI
->getMode();
4949 uint16_t Flags
= MI
.getFlags();
4951 LLT S32
= LLT::scalar(32);
4952 LLT S1
= LLT::scalar(1);
4954 auto One
= B
.buildFConstant(S32
, 1.0f
);
4956 auto DenominatorScaled
=
4957 B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S32
, S1
})
4962 auto NumeratorScaled
=
4963 B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S32
, S1
})
4969 auto ApproxRcp
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S32
})
4970 .addUse(DenominatorScaled
.getReg(0))
4972 auto NegDivScale0
= B
.buildFNeg(S32
, DenominatorScaled
, Flags
);
4974 const bool PreservesDenormals
= Mode
.FP32Denormals
== DenormalMode::getIEEE();
4975 const bool HasDynamicDenormals
=
4976 (Mode
.FP32Denormals
.Input
== DenormalMode::Dynamic
) ||
4977 (Mode
.FP32Denormals
.Output
== DenormalMode::Dynamic
);
4979 Register SavedSPDenormMode
;
4980 if (!PreservesDenormals
) {
4981 if (HasDynamicDenormals
) {
4982 SavedSPDenormMode
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
4983 B
.buildInstr(AMDGPU::S_GETREG_B32
)
4984 .addDef(SavedSPDenormMode
)
4985 .addImm(SPDenormModeBitField
);
4987 toggleSPDenormMode(true, B
, ST
, Mode
);
4990 auto Fma0
= B
.buildFMA(S32
, NegDivScale0
, ApproxRcp
, One
, Flags
);
4991 auto Fma1
= B
.buildFMA(S32
, Fma0
, ApproxRcp
, ApproxRcp
, Flags
);
4992 auto Mul
= B
.buildFMul(S32
, NumeratorScaled
, Fma1
, Flags
);
4993 auto Fma2
= B
.buildFMA(S32
, NegDivScale0
, Mul
, NumeratorScaled
, Flags
);
4994 auto Fma3
= B
.buildFMA(S32
, Fma2
, Fma1
, Mul
, Flags
);
4995 auto Fma4
= B
.buildFMA(S32
, NegDivScale0
, Fma3
, NumeratorScaled
, Flags
);
4997 if (!PreservesDenormals
) {
4998 if (HasDynamicDenormals
) {
4999 assert(SavedSPDenormMode
);
5000 B
.buildInstr(AMDGPU::S_SETREG_B32
)
5001 .addReg(SavedSPDenormMode
)
5002 .addImm(SPDenormModeBitField
);
5004 toggleSPDenormMode(false, B
, ST
, Mode
);
5007 auto Fmas
= B
.buildIntrinsic(Intrinsic::amdgcn_div_fmas
, {S32
})
5008 .addUse(Fma4
.getReg(0))
5009 .addUse(Fma1
.getReg(0))
5010 .addUse(Fma3
.getReg(0))
5011 .addUse(NumeratorScaled
.getReg(1))
5014 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, Res
)
5015 .addUse(Fmas
.getReg(0))
5020 MI
.eraseFromParent();
5024 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr
&MI
,
5025 MachineRegisterInfo
&MRI
,
5026 MachineIRBuilder
&B
) const {
5027 if (legalizeFastUnsafeFDIV64(MI
, MRI
, B
))
5030 Register Res
= MI
.getOperand(0).getReg();
5031 Register LHS
= MI
.getOperand(1).getReg();
5032 Register RHS
= MI
.getOperand(2).getReg();
5034 uint16_t Flags
= MI
.getFlags();
5036 LLT S64
= LLT::scalar(64);
5037 LLT S1
= LLT::scalar(1);
5039 auto One
= B
.buildFConstant(S64
, 1.0);
5041 auto DivScale0
= B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S64
, S1
})
5047 auto NegDivScale0
= B
.buildFNeg(S64
, DivScale0
.getReg(0), Flags
);
5049 auto Rcp
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S64
})
5050 .addUse(DivScale0
.getReg(0))
5053 auto Fma0
= B
.buildFMA(S64
, NegDivScale0
, Rcp
, One
, Flags
);
5054 auto Fma1
= B
.buildFMA(S64
, Rcp
, Fma0
, Rcp
, Flags
);
5055 auto Fma2
= B
.buildFMA(S64
, NegDivScale0
, Fma1
, One
, Flags
);
5057 auto DivScale1
= B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S64
, S1
})
5063 auto Fma3
= B
.buildFMA(S64
, Fma1
, Fma2
, Fma1
, Flags
);
5064 auto Mul
= B
.buildFMul(S64
, DivScale1
.getReg(0), Fma3
, Flags
);
5065 auto Fma4
= B
.buildFMA(S64
, NegDivScale0
, Mul
, DivScale1
.getReg(0), Flags
);
5068 if (!ST
.hasUsableDivScaleConditionOutput()) {
5069 // Workaround a hardware bug on SI where the condition output from div_scale
5072 LLT S32
= LLT::scalar(32);
5074 auto NumUnmerge
= B
.buildUnmerge(S32
, LHS
);
5075 auto DenUnmerge
= B
.buildUnmerge(S32
, RHS
);
5076 auto Scale0Unmerge
= B
.buildUnmerge(S32
, DivScale0
);
5077 auto Scale1Unmerge
= B
.buildUnmerge(S32
, DivScale1
);
5079 auto CmpNum
= B
.buildICmp(ICmpInst::ICMP_EQ
, S1
, NumUnmerge
.getReg(1),
5080 Scale1Unmerge
.getReg(1));
5081 auto CmpDen
= B
.buildICmp(ICmpInst::ICMP_EQ
, S1
, DenUnmerge
.getReg(1),
5082 Scale0Unmerge
.getReg(1));
5083 Scale
= B
.buildXor(S1
, CmpNum
, CmpDen
).getReg(0);
5085 Scale
= DivScale1
.getReg(1);
5088 auto Fmas
= B
.buildIntrinsic(Intrinsic::amdgcn_div_fmas
, {S64
})
5089 .addUse(Fma4
.getReg(0))
5090 .addUse(Fma3
.getReg(0))
5091 .addUse(Mul
.getReg(0))
5095 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, ArrayRef(Res
))
5096 .addUse(Fmas
.getReg(0))
5101 MI
.eraseFromParent();
5105 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr
&MI
,
5106 MachineRegisterInfo
&MRI
,
5107 MachineIRBuilder
&B
) const {
5108 Register Res0
= MI
.getOperand(0).getReg();
5109 Register Res1
= MI
.getOperand(1).getReg();
5110 Register Val
= MI
.getOperand(2).getReg();
5111 uint16_t Flags
= MI
.getFlags();
5113 LLT Ty
= MRI
.getType(Res0
);
5114 LLT InstrExpTy
= Ty
== LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5116 auto Mant
= B
.buildIntrinsic(Intrinsic::amdgcn_frexp_mant
, {Ty
})
5119 auto Exp
= B
.buildIntrinsic(Intrinsic::amdgcn_frexp_exp
, {InstrExpTy
})
5123 if (ST
.hasFractBug()) {
5124 auto Fabs
= B
.buildFAbs(Ty
, Val
);
5125 auto Inf
= B
.buildFConstant(Ty
, APFloat::getInf(getFltSemanticForLLT(Ty
)));
5127 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), Fabs
, Inf
, Flags
);
5128 auto Zero
= B
.buildConstant(InstrExpTy
, 0);
5129 Exp
= B
.buildSelect(InstrExpTy
, IsFinite
, Exp
, Zero
);
5130 Mant
= B
.buildSelect(Ty
, IsFinite
, Mant
, Val
);
5133 B
.buildCopy(Res0
, Mant
);
5134 B
.buildSExtOrTrunc(Res1
, Exp
);
5136 MI
.eraseFromParent();
5140 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr
&MI
,
5141 MachineRegisterInfo
&MRI
,
5142 MachineIRBuilder
&B
) const {
5143 Register Res
= MI
.getOperand(0).getReg();
5144 Register LHS
= MI
.getOperand(2).getReg();
5145 Register RHS
= MI
.getOperand(3).getReg();
5146 uint16_t Flags
= MI
.getFlags();
5148 LLT S32
= LLT::scalar(32);
5149 LLT S1
= LLT::scalar(1);
5151 auto Abs
= B
.buildFAbs(S32
, RHS
, Flags
);
5152 const APFloat
C0Val(1.0f
);
5154 auto C0
= B
.buildFConstant(S32
, 0x1p
+96f
);
5155 auto C1
= B
.buildFConstant(S32
, 0x1p
-32f
);
5156 auto C2
= B
.buildFConstant(S32
, 1.0f
);
5158 auto CmpRes
= B
.buildFCmp(CmpInst::FCMP_OGT
, S1
, Abs
, C0
, Flags
);
5159 auto Sel
= B
.buildSelect(S32
, CmpRes
, C1
, C2
, Flags
);
5161 auto Mul0
= B
.buildFMul(S32
, RHS
, Sel
, Flags
);
5163 auto RCP
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S32
})
5164 .addUse(Mul0
.getReg(0))
5167 auto Mul1
= B
.buildFMul(S32
, LHS
, RCP
, Flags
);
5169 B
.buildFMul(Res
, Sel
, Mul1
, Flags
);
5171 MI
.eraseFromParent();
5175 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr
&MI
,
5176 MachineRegisterInfo
&MRI
,
5177 MachineIRBuilder
&B
) const {
5178 // Bypass the correct expansion a standard promotion through G_FSQRT would
5179 // get. The f32 op is accurate enough for the f16 cas.
5180 unsigned Flags
= MI
.getFlags();
5181 assert(!ST
.has16BitInsts());
5182 const LLT F32
= LLT::scalar(32);
5183 auto Ext
= B
.buildFPExt(F32
, MI
.getOperand(1), Flags
);
5184 auto Log2
= B
.buildIntrinsic(Intrinsic::amdgcn_sqrt
, {F32
})
5185 .addUse(Ext
.getReg(0))
5187 B
.buildFPTrunc(MI
.getOperand(0), Log2
, Flags
);
5188 MI
.eraseFromParent();

bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(1);
  const LLT F32 = LLT::scalar(32);
  const LLT I32 = LLT::scalar(32);

  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
        .addUse(X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
        .addUse(SqrtX.getReg(0))
        .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);

    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeFSQRTF32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeFSQRTF64(MI, MRI, B);
  if (Ty == LLT::scalar(16))
    return legalizeFSQRTF16(MI, MRI, B);
  return false;
}

// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference,
  // since the rsq quieted it (or not), so use the one which will directly
  // select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

// TODO: Fix pointer type handling
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;

  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      Register Src3 = MI.getOperand(5).getReg();
      Register Src4 = MI.getOperand(6).getImm();
      Register Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
          .addUse(Src2)
          .addUse(Src3)
          .addImm(Src4)
          .addImm(Src5)
          .getReg(0);
    }
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  if (Size == 32)
    return true;

  const LLT S32 = LLT::scalar(32);

  if (Size < 32) {
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IsPermLane16)
      Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % 32 != 0)
    return false;

  LLT PartialResTy = S32;
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    switch (EltTy.getSizeInBits()) {
    case 16:
      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
      break;
    case 32:
      PartialResTy = EltTy;
      break;
    default:
      // Handle all other cases via S32 pieces;
      break;
    }
  }

  SmallVector<Register, 2> PartialRes;
  unsigned NumParts = Size / 32;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  MachineInstrBuilder NewHighHalf = Masked;
  std::optional<ValueAndVReg> StrideConst =
      getIConstantVRegValWithLookThrough(Stride, MRI);
  if (!StrideConst || !StrideConst->Value.isZero()) {
    MachineInstrBuilder ShiftedStride;
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  Register NewHighHalfReg = NewHighHalf.getReg(0);
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  return false;
}

bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
AMDGPULegalizerInfo::fixStoreSourceType(
5778 MachineIRBuilder
&B
, Register VData
, bool IsFormat
) const {
5779 MachineRegisterInfo
*MRI
= B
.getMRI();
5780 LLT Ty
= MRI
->getType(VData
);
5782 const LLT S16
= LLT::scalar(16);
5784 // Fixup buffer resources themselves needing to be v4i128.
5785 if (hasBufferRsrcWorkaround(Ty
))
5786 return castBufferRsrcToV4I32(VData
, B
);
5788 // Fixup illegal register types for i8 stores.
5789 if (Ty
== LLT::scalar(8) || Ty
== S16
) {
5790 Register AnyExt
= B
.buildAnyExt(LLT::scalar(32), VData
).getReg(0);
5794 if (Ty
.isVector()) {
5795 if (Ty
.getElementType() == S16
&& Ty
.getNumElements() <= 4) {
5797 return handleD16VData(B
, *MRI
, VData
);

bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)              // vdata
                 .addUse(RSrc)               // rsrc
                 .addUse(VIndex)             // vindex
                 .addUse(VOffset)            // voffset
                 .addUse(SOffset)            // soffset
                 .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg)         // vdata
                 .addUse(RSrc)               // rsrc
                 .addUse(VIndex)             // vindex
                 .addUse(VOffset)            // voffset
                 .addUse(SOffset)            // soffset
                 .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);
}
&MI
,
5913 MachineRegisterInfo
&MRI
,
5914 MachineIRBuilder
&B
,
5916 bool IsTyped
) const {
5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918 MachineMemOperand
*MMO
= *MI
.memoperands_begin();
5919 const LLT MemTy
= MMO
->getMemoryType();
5920 const LLT S32
= LLT::scalar(32);
5922 Register Dst
= MI
.getOperand(0).getReg();
5926 assert(MI
.getNumExplicitDefs() == 1 || MI
.getNumExplicitDefs() == 2);
5927 bool IsTFE
= MI
.getNumExplicitDefs() == 2;
5929 StatusDst
= MI
.getOperand(1).getReg();
5933 castBufferRsrcArgToV4I32(MI
, B
, 2 + OpOffset
);
5934 Register RSrc
= MI
.getOperand(2 + OpOffset
).getReg();
5936 // The typed intrinsics add an immediate after the registers.
5937 const unsigned NumVIndexOps
= IsTyped
? 8 : 7;
5939 // The struct intrinsic variants add one additional operand over raw.
5940 const bool HasVIndex
= MI
.getNumOperands() == NumVIndexOps
+ OpOffset
;
5943 VIndex
= MI
.getOperand(3 + OpOffset
).getReg();
5946 VIndex
= B
.buildConstant(S32
, 0).getReg(0);
5949 Register VOffset
= MI
.getOperand(3 + OpOffset
).getReg();
5950 Register SOffset
= MI
.getOperand(4 + OpOffset
).getReg();
5952 unsigned Format
= 0;
5954 Format
= MI
.getOperand(5 + OpOffset
).getImm();
5958 unsigned AuxiliaryData
= MI
.getOperand(5 + OpOffset
).getImm();
5961 LLT Ty
= MRI
.getType(Dst
);
5962 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5963 // logic doesn't have to handle that case.
5964 if (hasBufferRsrcWorkaround(Ty
)) {
5965 Ty
= castBufferRsrcFromV4I32(MI
, B
, MRI
, 0);
5966 Dst
= MI
.getOperand(0).getReg();
5968 LLT EltTy
= Ty
.getScalarType();
5969 const bool IsD16
= IsFormat
&& (EltTy
.getSizeInBits() == 16);
5970 const bool Unpacked
= ST
.hasUnpackedD16VMem();
5972 std::tie(VOffset
, ImmOffset
) = splitBufferOffsets(B
, VOffset
);
5976 // TODO: Support TFE for typed and narrow loads.
5980 Opc
= IsD16
? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
:
5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT
;
5982 } else if (IsFormat
) {
5986 Opc
= AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16
;
5988 Opc
= IsTFE
? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT
;
5992 switch (MemTy
.getSizeInBits()) {
5994 Opc
= IsTFE
? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
;
5998 Opc
= IsTFE
? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT
;
6002 Opc
= IsTFE
? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD
;
6009 unsigned NumValueDWords
= divideCeil(Ty
.getSizeInBits(), 32);
6010 unsigned NumLoadDWords
= NumValueDWords
+ 1;
6011 LLT LoadTy
= LLT::fixed_vector(NumLoadDWords
, S32
);
6012 Register LoadDstReg
= B
.getMRI()->createGenericVirtualRegister(LoadTy
);
6013 buildBufferLoad(Opc
, LoadDstReg
, RSrc
, VIndex
, VOffset
, SOffset
, ImmOffset
,
6014 Format
, AuxiliaryData
, MMO
, IsTyped
, HasVIndex
, B
);
6015 if (MemTy
.getSizeInBits() < 32) {
6016 Register ExtDst
= B
.getMRI()->createGenericVirtualRegister(S32
);
6017 B
.buildUnmerge({ExtDst
, StatusDst
}, LoadDstReg
);
6018 B
.buildTrunc(Dst
, ExtDst
);
6019 } else if (NumValueDWords
== 1) {
6020 B
.buildUnmerge({Dst
, StatusDst
}, LoadDstReg
);
6022 SmallVector
<Register
, 5> LoadElts
;
6023 for (unsigned I
= 0; I
!= NumValueDWords
; ++I
)
6024 LoadElts
.push_back(B
.getMRI()->createGenericVirtualRegister(S32
));
6025 LoadElts
.push_back(StatusDst
);
6026 B
.buildUnmerge(LoadElts
, LoadDstReg
);
6027 LoadElts
.truncate(NumValueDWords
);
6028 B
.buildMergeLikeInstr(Dst
, LoadElts
);
6030 } else if ((!IsD16
&& MemTy
.getSizeInBits() < 32) ||
6031 (IsD16
&& !Ty
.isVector())) {
6032 Register LoadDstReg
= B
.getMRI()->createGenericVirtualRegister(S32
);
6033 buildBufferLoad(Opc
, LoadDstReg
, RSrc
, VIndex
, VOffset
, SOffset
, ImmOffset
,
6034 Format
, AuxiliaryData
, MMO
, IsTyped
, HasVIndex
, B
);
6035 B
.setInsertPt(B
.getMBB(), ++B
.getInsertPt());
6036 B
.buildTrunc(Dst
, LoadDstReg
);
6037 } else if (Unpacked
&& IsD16
&& Ty
.isVector()) {
6038 LLT UnpackedTy
= Ty
.changeElementSize(32);
6039 Register LoadDstReg
= B
.getMRI()->createGenericVirtualRegister(UnpackedTy
);
6040 buildBufferLoad(Opc
, LoadDstReg
, RSrc
, VIndex
, VOffset
, SOffset
, ImmOffset
,
6041 Format
, AuxiliaryData
, MMO
, IsTyped
, HasVIndex
, B
);
6042 B
.setInsertPt(B
.getMBB(), ++B
.getInsertPt());
6043 // FIXME: G_TRUNC should work, but legalization currently fails
6044 auto Unmerge
= B
.buildUnmerge(S32
, LoadDstReg
);
6045 SmallVector
<Register
, 4> Repack
;
6046 for (unsigned I
= 0, N
= Unmerge
->getNumOperands() - 1; I
!= N
; ++I
)
6047 Repack
.push_back(B
.buildTrunc(EltTy
, Unmerge
.getReg(I
)).getReg(0));
6048 B
.buildMergeLikeInstr(Dst
, Repack
);
6050 buildBufferLoad(Opc
, Dst
, RSrc
, VIndex
, VOffset
, SOffset
, ImmOffset
, Format
,
6051 AuxiliaryData
, MMO
, IsTyped
, HasVIndex
, B
);
6054 MI
.eraseFromParent();
6058 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID
) {
6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap
:
6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap
:
6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap
:
6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap
:
6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP
;
6065 case Intrinsic::amdgcn_raw_buffer_atomic_add
:
6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add
:
6067 case Intrinsic::amdgcn_struct_buffer_atomic_add
:
6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add
:
6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD
;
6070 case Intrinsic::amdgcn_raw_buffer_atomic_sub
:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub
:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub
:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub
:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB
;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin
:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin
:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin
:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin
:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN
;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin
:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin
:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin
:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin
:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN
;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax
:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax
:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax
:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax
:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX
;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax
:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax
:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax
:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax
:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX
;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_and
:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and
:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_and
:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and
:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND
;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_or
:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or
:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_or
:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or
:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR
;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor
:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor
:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor
:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor
:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR
;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc
:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc
:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc
:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc
:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC
;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec
:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec
:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec
:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec
:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC
;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap
:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap
:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap
:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap
:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP
;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd
:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd
:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd
:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd
:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD
;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin
:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin
:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin
:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin
:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN
;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax
:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax
:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax
:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax
:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX
;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32
:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32
:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32
;
6144 llvm_unreachable("unhandled atomic opcode");

bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself.
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
                 .addDef(Dst)
                 .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)                // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset)          // offset(imm)
      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  (void)S32;
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to exposes all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData;
  LLT Ty;

  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);
  }

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  unsigned DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (BaseOpcode->NoReturn)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
    // instructions expect VGPR_32
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    // See also below in the non-a16 branch
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into last VAddr register
      LLT PackedAddrTy =
          LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      PackedRegs.resize(1);
    }

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
    // set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions are using DMask to specify how many bits
  // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
  // DMaskLanes for image atomic has default value '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // ChangeElementCount will be needed for image load where Ty is always scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
&Helper
,
6716 MachineInstr
&MI
) const {
6717 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
6718 GISelChangeObserver
&Observer
= Helper
.Observer
;
6720 Register OrigDst
= MI
.getOperand(0).getReg();
6722 LLT Ty
= B
.getMRI()->getType(OrigDst
);
6723 unsigned Size
= Ty
.getSizeInBits();
6724 MachineFunction
&MF
= B
.getMF();
6726 if (Size
< 32 && ST
.hasScalarSubwordLoads()) {
6727 assert(Size
== 8 || Size
== 16);
6728 Opc
= Size
== 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT
;
6730 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6731 // destination register.
6732 Dst
= B
.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6734 Opc
= AMDGPU::G_AMDGPU_S_BUFFER_LOAD
;
6738 Observer
.changingInstr(MI
);
6740 // Handle needing to s.buffer.load() a p8 value.
6741 if (hasBufferRsrcWorkaround(Ty
)) {
6742 Ty
= castBufferRsrcFromV4I32(MI
, B
, *B
.getMRI(), 0);
6743 B
.setInsertPt(B
.getMBB(), MI
);
6745 if (shouldBitcastLoadStoreType(ST
, Ty
, LLT::scalar(Size
))) {
6746 Ty
= getBitcastRegisterType(Ty
);
6747 Helper
.bitcastDst(MI
, Ty
, 0);
6748 B
.setInsertPt(B
.getMBB(), MI
);
6751 // FIXME: We don't really need this intermediate instruction. The intrinsic
6752 // should be fixed to have a memory operand. Since it's readnone, we're not
6753 // allowed to add one.
6754 MI
.setDesc(B
.getTII().get(Opc
));
6755 MI
.removeOperand(1); // Remove intrinsic ID
6757 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758 const unsigned MemSize
= (Size
+ 7) / 8;
6759 const Align MemAlign
= B
.getDataLayout().getABITypeAlign(
6760 getTypeForLLT(Ty
, MF
.getFunction().getContext()));
6761 MachineMemOperand
*MMO
= MF
.getMachineMemOperand(
6762 MachinePointerInfo(),
6763 MachineMemOperand::MOLoad
| MachineMemOperand::MODereferenceable
|
6764 MachineMemOperand::MOInvariant
,
6766 MI
.addMemOperand(MF
, MMO
);
6767 if (Dst
!= OrigDst
) {
6768 MI
.getOperand(0).setReg(Dst
);
6769 B
.setInsertPt(B
.getMBB(), ++B
.getInsertPt());
6770 B
.buildTrunc(OrigDst
, Dst
);
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6775 // out this needs to be converted to a vector load during RegBankSelect.
6776 if (!isPowerOf2_32(Size
) && (Size
!= 96 || !ST
.hasScalarDwordx3Loads())) {
6778 Helper
.moreElementsVectorDst(MI
, getPow2VectorType(Ty
), 0);
6780 Helper
.widenScalarDst(MI
, getPow2ScalarType(Ty
), 0);
6783 Observer
.changedInstr(MI
);
6787 // TODO: Move to selection
6788 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr
&MI
,
6789 MachineRegisterInfo
&MRI
,
6790 MachineIRBuilder
&B
) const {
6791 if (!ST
.isTrapHandlerEnabled() ||
6792 ST
.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA
)
6793 return legalizeTrapEndpgm(MI
, MRI
, B
);
6795 return ST
.supportsGetDoorbellID() ?
6796 legalizeTrapHsa(MI
, MRI
, B
) : legalizeTrapHsaQueuePtr(MI
, MRI
, B
);
6799 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
) const {
6801 const DebugLoc
&DL
= MI
.getDebugLoc();
6802 MachineBasicBlock
&BB
= B
.getMBB();
6803 MachineFunction
*MF
= BB
.getParent();
6805 if (BB
.succ_empty() && std::next(MI
.getIterator()) == BB
.end()) {
6806 BuildMI(BB
, BB
.end(), DL
, B
.getTII().get(AMDGPU::S_ENDPGM
))
6808 MI
.eraseFromParent();
6812 // We need a block split to make the real endpgm a terminator. We also don't
6813 // want to break phis in successor blocks, so we can't just delete to the
6814 // end of the block.
6815 BB
.splitAt(MI
, false /*UpdateLiveIns*/);
6816 MachineBasicBlock
*TrapBB
= MF
->CreateMachineBasicBlock();
6817 MF
->push_back(TrapBB
);
6818 BuildMI(*TrapBB
, TrapBB
->end(), DL
, B
.getTII().get(AMDGPU::S_ENDPGM
))
6820 BuildMI(BB
, &MI
, DL
, B
.getTII().get(AMDGPU::S_CBRANCH_EXECNZ
))
6823 BB
.addSuccessor(TrapBB
);
6824 MI
.eraseFromParent();
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}
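
// HSA trap lowering for targets that support retrieving the doorbell ID
// directly, so a single S_TRAP with the HSA trap ID suffices.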
bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           MI.getDebugLoc());
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}
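
// Lower llvm.debugtrap: emit S_TRAP with the debug trap ID when an AMDHSA
// trap handler is available, otherwise diagnose a warning and drop the
// instruction.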
bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // If this is the non-HSA path or the trap handler is disabled, report a
  // warning instead of trapping.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}
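
// Lower amdgcn.image.bvh.intersect.ray to G_AMDGPU_INTRIN_BVH_INTERSECT_RAY.
// The ray operands are repacked into the VGPR layout expected by the selected
// MIMG opcode, which depends on the generation (GFX10/GFX11/GFX12), on whether
// the NSA encoding is used, and on whether the ray direction is fp16 (A16).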
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
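
// Lower the rounding-mode-carrying fptrunc-round operation to the target's
// round-upward / round-downward pseudos; other rounding modes fail
// legalization.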
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();
  return true;
}
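
// Lower stacksave: copy the target stack pointer through
// G_AMDGPU_WAVE_ADDRESS into the result register.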
bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}
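
// Lower amdgcn.wave.id by extracting the wave ID field from TTMP8; only valid
// on subtargets with architected SGPRs.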
bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
                                         MachineIRBuilder &B) const {
  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
  if (!ST.hasArchitectedSGPRs())
    return false;
  LLT S32 = LLT::scalar(32);
  Register DstReg = MI.getOperand(0).getReg();
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  return true;
}
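
// Hardware register encodings used to read/write the FP environment:
// MODE bits [22:0] and TRAPSTS bits [4:0].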
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
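
// Lower get.fpenv: read the MODE and TRAPSTS hardware registers with s_getreg
// and merge the two 32-bit results into the 64-bit destination.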
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}
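
// Lower set.fpenv: split the 64-bit source and write the halves back to the
// MODE and TRAPSTS hardware registers with s_setreg.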
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}
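
// Main intrinsic legalization dispatch. Structured control-flow intrinsics
// (amdgcn.if/else/loop) are rewritten together with their G_BRCOND user into
// the SI_IF/SI_ELSE/SI_LOOP pseudos; most other intrinsics are forwarded to
// the dedicated legalize* helpers above.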
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners to
    // match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
    return legalizeLaneOp(Helper, MI, IntrID);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);