//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;
// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
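
// Predicate: the total bit width of the type at TypeIdx is a whole multiple
// of 32 bits.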
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}
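
// Predicate: the type at TypeIdx is a 16-bit element vector with more than
// two elements.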
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
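
// Mutation: widen the vector at TypeIdx by a single element, keeping the
// element type.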
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}
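
// Mutation: split the vector at TypeIdx so each resulting piece is at most
// 64 bits wide.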
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}
// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }

    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
  };
}
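
// Scalar form used when rewriting buffer resource (address space 8) pointers:
// a single s128, or a vector of s128 for vectors of resources.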
static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}
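
// Register form used for buffer resource (address space 8) pointers:
// <4 x s32>, or 4N x s32 for a vector of N resources.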
static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}
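
// Compute the 32-bit-register friendly type a value of type Ty is bitcast to:
// a plain scalar when it fits in 32 bits, otherwise an N x s32 vector.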
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
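
// Mutation: bitcast the type at TypeIdx to its 32-bit register form (see
// getBitcastRegisterType above).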
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}
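
// Predicate: the type at TypeIdx is a vector whose total size is strictly
// below Size bits.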
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
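
// True if Size is a multiple of 32 bits that still fits in the largest
// register tuple (MaxRegisterSize).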
static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}
static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}
static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}
// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}
static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
static const LLT S192 = LLT::scalar(192);
static const LLT S224 = LLT::scalar(224);
static const LLT S256 = LLT::scalar(256);
static const LLT S512 = LLT::scalar(512);
static const LLT S1024 = LLT::scalar(1024);
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

static const LLT V2S8 = LLT::fixed_vector(2, 8);
static const LLT V2S16 = LLT::fixed_vector(2, 16);
static const LLT V4S16 = LLT::fixed_vector(4, 16);
static const LLT V6S16 = LLT::fixed_vector(6, 16);
static const LLT V8S16 = LLT::fixed_vector(8, 16);
static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);

static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
static const LLT V2BF16 = V2F16; // FIXME

static const LLT V2S32 = LLT::fixed_vector(2, 32);
static const LLT V3S32 = LLT::fixed_vector(3, 32);
static const LLT V4S32 = LLT::fixed_vector(4, 32);
static const LLT V5S32 = LLT::fixed_vector(5, 32);
static const LLT V6S32 = LLT::fixed_vector(6, 32);
static const LLT V7S32 = LLT::fixed_vector(7, 32);
static const LLT V8S32 = LLT::fixed_vector(8, 32);
static const LLT V9S32 = LLT::fixed_vector(9, 32);
static const LLT V10S32 = LLT::fixed_vector(10, 32);
static const LLT V11S32 = LLT::fixed_vector(11, 32);
static const LLT V12S32 = LLT::fixed_vector(12, 32);
static const LLT V16S32 = LLT::fixed_vector(16, 32);
static const LLT V32S32 = LLT::fixed_vector(32, 32);

static const LLT V2S64 = LLT::fixed_vector(2, 64);
static const LLT V3S64 = LLT::fixed_vector(3, 64);
static const LLT V4S64 = LLT::fixed_vector(4, 64);
static const LLT V5S64 = LLT::fixed_vector(5, 64);
static const LLT V6S64 = LLT::fixed_vector(6, 64);
static const LLT V7S64 = LLT::fixed_vector(7, 64);
static const LLT V8S64 = LLT::fixed_vector(8, 64);
static const LLT V16S64 = LLT::fixed_vector(16, 64);

static const LLT V2S128 = LLT::fixed_vector(2, 128);
static const LLT V4S128 = LLT::fixed_vector(4, 128);
static std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

static std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

static std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
                                                   V6S64, V7S64, V8S64, V16S64};
// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(LLT Ty) {
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
         is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
}
static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
  return [TypeIdx](const LegalityQuery &Query) {
    return isRegisterClassType(Query.Types[TypeIdx]);
  };
}
// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
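
// Check whether the combination of result type, memory type, address space,
// and alignment described by Query is something the selector can handle
// directly for a load or store.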
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only care
// about the size. Return true in cases where we will workaround this for now by
// bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    return true;
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
}
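
// Combined legality check for G_LOAD/G_STORE: the value must be a register
// type of legal size, and must not need the buffer-resource or bitcast
// workarounds.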
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);
  return VectorTy;
}
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}
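
// Rewrite operand Idx of MI in place so a buffer resource pointer argument is
// passed in its <4 x s32> register form.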
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}
&ST_
,
657 const GCNTargetMachine
&TM
)
659 using namespace TargetOpcode
;
661 auto GetAddrSpacePtr
= [&TM
](unsigned AS
) {
662 return LLT::pointer(AS
, TM
.getPointerSizeInBits(AS
));
665 const LLT GlobalPtr
= GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS
);
666 const LLT ConstantPtr
= GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS
);
667 const LLT Constant32Ptr
= GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT
);
668 const LLT LocalPtr
= GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS
);
669 const LLT RegionPtr
= GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS
);
670 const LLT FlatPtr
= GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS
);
671 const LLT PrivatePtr
= GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS
);
672 const LLT BufferFatPtr
= GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER
);
673 const LLT RsrcPtr
= GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE
);
674 const LLT BufferStridedPtr
=
675 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER
);
677 const LLT CodePtr
= FlatPtr
;
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalFor(AddrSpaces128)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
.hasVOP3PInsts() && ST
.hasAddNoCarry() && ST
.hasIntClamp()) {
723 // Full set of gfx9 features.
724 if (ST
.hasScalarAddSub64()) {
725 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
726 .legalFor({S64
, S32
, S16
, V2S16
})
727 .clampMaxNumElementsStrict(0, S16
, 2)
730 .widenScalarToNextMultipleOf(0, 32)
733 getActionDefinitionsBuilder({G_ADD
, G_SUB
})
734 .legalFor({S32
, S16
, V2S16
})
735 .clampMaxNumElementsStrict(0, S16
, 2)
738 .widenScalarToNextMultipleOf(0, 32)
742 if (ST
.hasScalarSMulU64()) {
743 getActionDefinitionsBuilder(G_MUL
)
744 .legalFor({S64
, S32
, S16
, V2S16
})
745 .clampMaxNumElementsStrict(0, S16
, 2)
748 .widenScalarToNextMultipleOf(0, 32)
751 getActionDefinitionsBuilder(G_MUL
)
752 .legalFor({S32
, S16
, V2S16
})
753 .clampMaxNumElementsStrict(0, S16
, 2)
756 .widenScalarToNextMultipleOf(0, 32)
759 assert(ST
.hasMad64_32());
761 getActionDefinitionsBuilder({G_UADDSAT
, G_USUBSAT
, G_SADDSAT
, G_SSUBSAT
})
762 .legalFor({S32
, S16
, V2S16
}) // Clamp modifier
763 .minScalarOrElt(0, S16
)
764 .clampMaxNumElementsStrict(0, S16
, 2)
766 .widenScalarToNextPow2(0, 32)
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);
  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);
  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
    .lower();
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterClassType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .clampNumElements(0, V16S32, V32S32)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_STACKSAVE)
    .customFor({PrivatePtr});
  getActionDefinitionsBuilder(G_STACKRESTORE)
    .legalFor({PrivatePtr});

  getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
      G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S16})
      .customFor({S32, S64})
      .scalarize(0)
      .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
      .scalarize(0)
      .maxScalarIf(typeIs(0, S16), 1, S16)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .customFor({S32, S64, S16})
      .scalarize(0)
      .unsupported();

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .minScalar(0, S32)
      .clampScalar(1, S32, S32)
      .lower();
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);
  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);
  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();
= getActionDefinitionsBuilder(G_FREM
);
1084 if (ST
.has16BitInsts()) {
1085 FRem
.customFor({S16
, S32
, S64
});
1087 FRem
.minScalar(0, S32
)
1088 .customFor({S32
, S64
});
1092 // TODO: Do we need to clamp maximum bitwidth?
1093 getActionDefinitionsBuilder(G_TRUNC
)
1094 .legalIf(isScalar(0))
1095 .legalFor({{V2S16
, V2S32
}})
1096 .clampMaxNumElements(0, S16
, 2)
1097 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1098 // situations (like an invalid implicit use), we don't want to infinite loop
1099 // in the legalizer.
1100 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);
  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();
, G_LLROUND
})
1137 .clampScalar(0, S16
, S64
)
1141 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND
)
1142 .legalFor({S16
, S32
})
1146 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1147 getActionDefinitionsBuilder({G_INTRINSIC_ROUND
, G_FRINT
, G_FNEARBYINT
})
1151 getActionDefinitionsBuilder({G_INTRINSIC_LRINT
, G_INTRINSIC_LLRINT
})
1152 .clampScalar(0, S16
, S64
)
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_PTR_ADD)
    .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
  auto &FCmpBuilder =
    getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

  FCmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();
  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)
    .lower();

  auto &LogOps =
      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);
  // If no 16 bit instr is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();
1279 // instructions expect. The hardware produces -1, but these produce the
1281 getActionDefinitionsBuilder({G_CTLZ
, G_CTTZ
})
1283 .clampScalar(0, S32
, S32
)
1284 .clampScalar(1, S32
, S64
)
1285 .widenScalarToNextPow2(0, 32)
1286 .widenScalarToNextPow2(1, 32)
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
    .legalFor({{S32, S32}, {S32, S64}})
    .customIf(scalarNarrowerThan(1, 32))
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);
  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::pair(
            1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
      return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
    });
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::pair(
            0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
      return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
    });
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1430 unsigned GlobalAlign16
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1431 unsigned GlobalAlign8
= ST
.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1433 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1435 // TODO: Unsupported flat for SI.
1437 for (unsigned Op
: {G_LOAD
, G_STORE
}) {
1438 const bool IsStore
= Op
== G_STORE
;
1440 auto &Actions
= getActionDefinitionsBuilder(Op
);
1441 // Explicitly list some common cases.
1442 // TODO: Does this help compile time at all?
1443 Actions
.legalForTypesWithMemDesc({{S32
, GlobalPtr
, S32
, GlobalAlign32
},
1444 {V2S32
, GlobalPtr
, V2S32
, GlobalAlign32
},
1445 {V4S32
, GlobalPtr
, V4S32
, GlobalAlign32
},
1446 {S64
, GlobalPtr
, S64
, GlobalAlign32
},
1447 {V2S64
, GlobalPtr
, V2S64
, GlobalAlign32
},
1448 {V2S16
, GlobalPtr
, V2S16
, GlobalAlign32
},
1449 {S32
, GlobalPtr
, S8
, GlobalAlign8
},
1450 {S32
, GlobalPtr
, S16
, GlobalAlign16
},
1452 {S32
, LocalPtr
, S32
, 32},
1453 {S64
, LocalPtr
, S64
, 32},
1454 {V2S32
, LocalPtr
, V2S32
, 32},
1455 {S32
, LocalPtr
, S8
, 8},
1456 {S32
, LocalPtr
, S16
, 16},
1457 {V2S16
, LocalPtr
, S32
, 32},
1459 {S32
, PrivatePtr
, S32
, 32},
1460 {S32
, PrivatePtr
, S8
, 8},
1461 {S32
, PrivatePtr
, S16
, 16},
1462 {V2S16
, PrivatePtr
, S32
, 32},
1464 {S32
, ConstantPtr
, S32
, GlobalAlign32
},
1465 {V2S32
, ConstantPtr
, V2S32
, GlobalAlign32
},
1466 {V4S32
, ConstantPtr
, V4S32
, GlobalAlign32
},
1467 {S64
, ConstantPtr
, S64
, GlobalAlign32
},
1468 {V2S32
, ConstantPtr
, V2S32
, GlobalAlign32
}});
1470 [=](const LegalityQuery
&Query
) -> bool {
1471 return isLoadStoreLegal(ST
, Query
);
    // The custom pointers (fat pointers, buffer resources) don't work with load
    // and store at this level. Fat pointers should have been lowered to
    // intrinsics before the translation to MIR.
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
    // ptrtoint. This is needed to account for the fact that we can't have i128
    // as a register class for SelectionDAG reasons.
    Actions.customIf([=](const LegalityQuery &Query) -> bool {
      return hasBufferRsrcWorkaround(Query.Types[0]);
    });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bit.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));
    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
              if (MemSize > MaxSize)
                return std::pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0,
                                 LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = llvm::bit_floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bit.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }
  // TODO: v2bf16 operations, and fat buffer pointer support.
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  // Most of the legalization work here is done by AtomicExpand. We could
  // probably use a simpler legality rule that just assumes anything is OK.
  auto &AtomicFMinFMax =
    getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
      .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          const bool isLegalVecType =
              !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
          // Address space 8 pointers are 128-bit wide values, but the logic
          // below will try to bitcast them to 2N x s64, which will fail.
          // Therefore, as an intermediate step, wrap extracts/insertions from a
          // ptrtoint-ing the vector and scalar arguments (or inttoptring the
          // extraction result) in order to produce a vector operation that can
          // be handled by the logic below.
          if (EltTy.isPointer() && EltSize > 64)
            return true;
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32 &&
                  isLegalVecType;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      .moreElementsIf(
        isIllegalRegisterType(VecTypeIdx),
        moreElementsToNextExistingRegClass(VecTypeIdx))
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
1832 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT
)
1833 .unsupportedIf([=](const LegalityQuery
&Query
) {
1834 const LLT
&EltTy
= Query
.Types
[1].getElementType();
1835 return Query
.Types
[0] != EltTy
;
1838 for (unsigned Op
: {G_EXTRACT
, G_INSERT
}) {
1839 unsigned BigTyIdx
= Op
== G_EXTRACT
? 1 : 0;
1840 unsigned LitTyIdx
= Op
== G_EXTRACT
? 0 : 1;
1842 // FIXME: Doesn't handle extract of illegal sizes.
1843 getActionDefinitionsBuilder(Op
)
1844 .lowerIf(all(typeIs(LitTyIdx
, S16
), sizeIs(BigTyIdx
, 32)))
1845 .lowerIf([=](const LegalityQuery
&Query
) {
1846 // Sub-vector(or single element) insert and extract.
1847 // TODO: verify immediate offset here since lower only works with
1849 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1850 return BigTy
.isVector();
1852 // FIXME: Multiples of 16 should not be legal.
1853 .legalIf([=](const LegalityQuery
&Query
) {
1854 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1855 const LLT LitTy
= Query
.Types
[LitTyIdx
];
1856 return (BigTy
.getSizeInBits() % 32 == 0) &&
1857 (LitTy
.getSizeInBits() % 16 == 0);
1860 [=](const LegalityQuery
&Query
) {
1861 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1862 return (BigTy
.getScalarSizeInBits() < 16);
1864 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx
, 16))
1866 [=](const LegalityQuery
&Query
) {
1867 const LLT LitTy
= Query
.Types
[LitTyIdx
];
1868 return (LitTy
.getScalarSizeInBits() < 16);
1870 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx
, 16))
1871 .moreElementsIf(isSmallOddVector(BigTyIdx
), oneMoreElement(BigTyIdx
))
1872 .widenScalarToNextPow2(BigTyIdx
, 32);
1876 auto &BuildVector
= getActionDefinitionsBuilder(G_BUILD_VECTOR
)
1877 .legalForCartesianProduct(AllS32Vectors
, {S32
})
1878 .legalForCartesianProduct(AllS64Vectors
, {S64
})
1879 .clampNumElements(0, V16S32
, V32S32
)
1880 .clampNumElements(0, V2S64
, V16S64
)
1881 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16
))
1883 isIllegalRegisterType(0),
1884 moreElementsToNextExistingRegClass(0));
1886 if (ST
.hasScalarPackInsts()) {
1888 // FIXME: Should probably widen s1 vectors straight to s32
1889 .minScalarOrElt(0, S16
)
1892 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC
)
1893 .legalFor({V2S16
, S32
})
1896 BuildVector
.customFor({V2S16
, S16
});
1897 BuildVector
.minScalarOrElt(0, S32
);
1899 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC
)
1900 .customFor({V2S16
, S32
})
1904 BuildVector
.legalIf(isRegisterType(0));
1906 // FIXME: Clamp maximum size
1907 getActionDefinitionsBuilder(G_CONCAT_VECTORS
)
1908 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1909 .clampMaxNumElements(0, S32
, 32)
1910 .clampMaxNumElements(1, S16
, 2) // TODO: Make 4?
1911 .clampMaxNumElements(0, S16
, 64);
1913 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR
).lower();
1916 for (unsigned Op
: {G_MERGE_VALUES
, G_UNMERGE_VALUES
}) {
1917 unsigned BigTyIdx
= Op
== G_MERGE_VALUES
? 0 : 1;
1918 unsigned LitTyIdx
= Op
== G_MERGE_VALUES
? 1 : 0;
1920 auto notValidElt
= [=](const LegalityQuery
&Query
, unsigned TypeIdx
) {
1921 const LLT Ty
= Query
.Types
[TypeIdx
];
1922 if (Ty
.isVector()) {
1923 const LLT
&EltTy
= Ty
.getElementType();
1924 if (EltTy
.getSizeInBits() < 8 || EltTy
.getSizeInBits() > 512)
1926 if (!llvm::has_single_bit
<uint32_t>(EltTy
.getSizeInBits()))
1932 auto &Builder
= getActionDefinitionsBuilder(Op
)
1933 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1934 .lowerFor({{S16
, V2S16
}})
1935 .lowerIf([=](const LegalityQuery
&Query
) {
1936 const LLT BigTy
= Query
.Types
[BigTyIdx
];
1937 return BigTy
.getSizeInBits() == 32;
1939 // Try to widen to s16 first for small types.
1940 // TODO: Only do this on targets with legal s16 shifts
1941 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx
, 16), LitTyIdx
, S16
)
1942 .widenScalarToNextPow2(LitTyIdx
, /*Min*/ 16)
1943 .moreElementsIf(isSmallOddVector(BigTyIdx
), oneMoreElement(BigTyIdx
))
1944 .fewerElementsIf(all(typeIs(0, S16
), vectorWiderThan(1, 32),
1945 elementTypeIs(1, S16
)),
1947 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1948 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1950 .clampScalar(LitTyIdx
, S32
, S512
)
1951 .widenScalarToNextPow2(LitTyIdx
, /*Min*/ 32)
1952 // Break up vectors with weird elements into scalars
1954 [=](const LegalityQuery
&Query
) { return notValidElt(Query
, LitTyIdx
); },
1957 [=](const LegalityQuery
&Query
) { return notValidElt(Query
, BigTyIdx
); },
1959 .clampScalar(BigTyIdx
, S32
, MaxScalar
);
1961 if (Op
== G_MERGE_VALUES
) {
1962 Builder
.widenScalarIf(
1963 // TODO: Use 16-bit shifts if legal for 8-bit values?
1964 [=](const LegalityQuery
&Query
) {
1965 const LLT Ty
= Query
.Types
[LitTyIdx
];
1966 return Ty
.getSizeInBits() < 32;
1968 changeTo(LitTyIdx
, S32
));
1971 Builder
.widenScalarIf(
1972 [=](const LegalityQuery
&Query
) {
1973 const LLT Ty
= Query
.Types
[BigTyIdx
];
1974 return Ty
.getSizeInBits() % 16 != 0;
1976 [=](const LegalityQuery
&Query
) {
1977 // Pick the next power of 2, or a multiple of 64 over 128.
1978 // Whichever is smaller.
1979 const LLT
&Ty
= Query
.Types
[BigTyIdx
];
1980 unsigned NewSizeInBits
= 1 << Log2_32_Ceil(Ty
.getSizeInBits() + 1);
1981 if (NewSizeInBits
>= 256) {
1982 unsigned RoundedTo
= alignTo
<64>(Ty
.getSizeInBits() + 1);
1983 if (RoundedTo
< NewSizeInBits
)
1984 NewSizeInBits
= RoundedTo
;
1986 return std::pair(BigTyIdx
, LLT::scalar(NewSizeInBits
));
1988 // Any vectors left are the wrong size. Scalarize them.
1993 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1995 auto &SextInReg
= getActionDefinitionsBuilder(G_SEXT_INREG
)
1996 .legalFor({{S32
}, {S64
}});
1998 if (ST
.hasVOP3PInsts()) {
1999 SextInReg
.lowerFor({{V2S16
}})
2000 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2001 // get more vector shift opportunities, since we'll get those when
2003 .clampMaxNumElementsStrict(0, S16
, 2);
2004 } else if (ST
.has16BitInsts()) {
2005 SextInReg
.lowerFor({{S32
}, {S64
}, {S16
}});
2007 // Prefer to promote to s32 before lowering if we don't have 16-bit
2008 // shifts. This avoid a lot of intermediate truncate and extend operations.
2009 SextInReg
.lowerFor({{S32
}, {S64
}});
2014 .clampScalar(0, S32
, S64
)
2017 getActionDefinitionsBuilder({G_ROTR
, G_ROTL
})
2021 // TODO: Only Try to form v2s16 with legal packed instructions.
2022 getActionDefinitionsBuilder(G_FSHR
)
2023 .legalFor({{S32
, S32
}})
2024 .lowerFor({{V2S16
, V2S16
}})
2025 .clampMaxNumElementsStrict(0, S16
, 2)
2029 if (ST
.hasVOP3PInsts()) {
2030 getActionDefinitionsBuilder(G_FSHL
)
2031 .lowerFor({{V2S16
, V2S16
}})
2032 .clampMaxNumElementsStrict(0, S16
, 2)
2036 getActionDefinitionsBuilder(G_FSHL
)
2041 getActionDefinitionsBuilder(G_READCYCLECOUNTER
)
2044 getActionDefinitionsBuilder(G_READSTEADYCOUNTER
).legalFor({S64
});
2046 getActionDefinitionsBuilder(G_FENCE
)
2049 getActionDefinitionsBuilder({G_SMULO
, G_UMULO
})
2054 getActionDefinitionsBuilder({G_SBFX
, G_UBFX
})
2055 .legalFor({{S32
, S32
}, {S64
, S32
}})
2056 .clampScalar(1, S32
, S32
)
2057 .clampScalar(0, S32
, S64
)
2058 .widenScalarToNextPow2(0)
2061 getActionDefinitionsBuilder(
2062 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2065 G_ATOMIC_CMPXCHG_WITH_SUCCESS
, G_ATOMICRMW_NAND
, G_ATOMICRMW_FSUB
,
2066 G_READ_REGISTER
, G_WRITE_REGISTER
,
2071 if (ST
.hasIEEEMinMax()) {
2072 getActionDefinitionsBuilder({G_FMINIMUM
, G_FMAXIMUM
})
2073 .legalFor(FPTypesPK16
)
2074 .clampMaxNumElements(0, S16
, 2)
2078 getActionDefinitionsBuilder({G_FMINIMUM
, G_FMAXIMUM
}).lower();
2081 getActionDefinitionsBuilder({G_MEMCPY
, G_MEMCPY_INLINE
, G_MEMMOVE
, G_MEMSET
})
2084 getActionDefinitionsBuilder({G_TRAP
, G_DEBUGTRAP
}).custom();
2086 getActionDefinitionsBuilder({G_VASTART
, G_VAARG
, G_BRJT
, G_JUMP_TABLE
,
2087 G_INDEXED_LOAD
, G_INDEXED_SEXTLOAD
,
2088 G_INDEXED_ZEXTLOAD
, G_INDEXED_STORE
})
2091 getActionDefinitionsBuilder(G_PREFETCH
).alwaysLegal();
2093 getLegacyLegalizerInfo().computeTables();
2094 verify(*ST
.getInstrInfo());
bool AMDGPULegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    return legalizeFroundeven(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  case TargetOpcode::G_GET_FPENV:
    return legalizeGetFPEnv(MI, MRI, B);
  case TargetOpcode::G_SET_FPENV:
    return legalizeSetFPEnv(MI, MRI, B);
  case TargetOpcode::G_TRAP:
    return legalizeTrap(MI, MRI, B);
  case TargetOpcode::G_DEBUGTRAP:
    return legalizeDebugTrap(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  MachineInstr *Def = MRI.getVRegDef(Val);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
    return true;
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  }
  default:
    return false;
  }
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  // MI can either be a G_ADDRSPACE_CAST or a
  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // function.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
      Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
      if (!ApertureReg.isValid())
        return Register();

      // Coerce the type of the low half of the result so we can use
      // merge_values.
      Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

      // TODO: Should we allow mismatched types but matching sizes in merges to
      // avoid the ptrtoint?
      return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
    };

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      castLocalOrPrivateToFlat(Dst);
      MI.eraseFromParent();
      return true;
    }

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
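
  // The add/subtract of 0x1.0p+52 below relies on round-to-nearest-even: for
  // |x| < 2^52 adding the constant pushes the fraction bits below the binary
  // point out of the significand, so subtracting it again yields x rounded to
  // the nearest (even) integer. The copysign keeps the trick working for
  // negative inputs, and the compare against 0x1.fffffffffffffp+51 skips
  // values that are already too large to carry a fractional part.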
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}
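
// extractF64Exponent below relies on the IEEE-754 binary64 layout: 1 sign bit,
// 11 exponent bits (bias 1023) and 52 fraction bits, so the exponent field
// lives entirely in the high 32-bit half of the value.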
static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);
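
  // For a 32-bit result the 64-bit input is first normalized: shift left so
  // the leading significant bit sits at a fixed position, convert the high
  // 32 bits (folding any non-zero low bits into a sticky bit so rounding
  // stays correct), then scale the result back by the shift amount with
  // ldexp. The signed path derives the shift limit from the sign bits using
  // amdgcn.sffbh (roughly a signed leading-bit count) rather than a plain
  // count-leading-zeros.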
  auto One = B.buildConstant(S32, 1);

  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
  return true;
}
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
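  //
  // For example, with val = 2^33 + 5: tf = 2^33 + 5, hif = floor(tf * 2^-32)
  // = 2, lof = tf - 2 * 2^32 = 5, so the result pair is hi = 2, lo = 5, i.e.
  // the 64-bit integer 0x0000000200000005.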
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
               Sign);
  } else
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
                                                 MI.getOperand(3));
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
                  .setMIFlags(Flags)
                  .getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
                                .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset, GAFlags + 1);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
// Emit a ABS32_LO / ABS32_HI relocation stub.
void AMDGPULegalizerInfo::buildAbsGlobalAddress(
    Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
    MachineRegisterInfo &MRI) const {
  bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;

  LLT S32 = LLT::scalar(32);

  // Use the destination directly, if and only if we store the lower address
  // part only and we don't have a register class being set.
  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        ? DstReg
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  // Write the lower half.
  B.buildInstr(AMDGPU::S_MOV_B32)
      .addDef(AddrLo)
      .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);

  // If required, write the upper half as well.
  if (RequiresHighHalf) {
    assert(PtrTy.getSizeInBits() == 64 &&
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(S32);
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)
        .addDef(AddrHi)
        .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);

    // Use the destination directly, if and only if we don't have a register
    // class being set.
    Register AddrDst = !MRI.getRegClassOrNull(DstReg)
                           ? DstReg
                           : MRI.createGenericVirtualRegister(LLT::scalar(64));

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    // If we created a new register for the destination, cast the result into
    // the final output.
    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    // If we created a new register for the destination, cast the result into
    // the final output.
    B.buildCast(DstReg, AddrLo);
  }
}
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildTrap();
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  if (hasBufferRsrcWorkaround(ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
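  // For example, a <3 x s32> constant-address load (96 bits of memory) would
  // be rounded up to a 128-bit load here when the alignment allows it; the
  // extra lane is then dropped again by the trunc/extract/unmerge below.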
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
                                        MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  if (hasBufferRsrcWorkaround(DataTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcArgToV4I32(MI, B, 0);
    Observer.changedInstr(MI);
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::float32() &&
      MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
    return true;
  if (Ty == LLT::float16() &&
      MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);
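
  // Note: the hardware CMPSWAP instructions take the new value and the compare
  // value as one contiguous data operand, so the two scalars are packed into a
  // 2-element vector before being handed to G_AMDGPU_ATOMIC_CMPXCHG.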
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}
/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
                                       Register Src) {
  const MachineInstr *DefMI = MRI.getVRegDef(Src);
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      break;
    }

    break;
  }
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
      return true;
    break;
  }
  case TargetOpcode::G_FPEXT: {
    return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
  }
  default:
    return false;
  }

  return false;
}
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
  if (Flags & MachineInstr::FmAfn)
    return true;
  const auto &Options = MF.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
                                   unsigned Flags) {
  return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
         MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
             DenormalMode::PreserveSign;
}
std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(32);
  auto SmallestNormal = B.buildFConstant(
      F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
  auto ScaleFactor =
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
}
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.
  //
  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
                  .setMIFlags(Flags);

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset =
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
                       Register Z, unsigned Flags) {
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
}
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  MachineFunction &MF = B.getMF();

  const LLT F32 = LLT::scalar(32);
  const LLT F16 = LLT::scalar(16);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
    if (Ty == F16 && !ST.has16BitInsts()) {
      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
      B.buildFPTrunc(Dst, LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
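
  // The constants below split ln(2) (or ln(2)/ln(10)) into a head and a tail
  // so the product Y * ln(2) is effectively carried in extra precision: the
  // FMA path recovers the rounding error of Y*C and folds in the Y*CC tail,
  // while the non-FMA path instead splits Y itself into high/low halves and
  // combines them with the CH/CT head/tail constants.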
3398 if (ST
.hasFastFMAF32()) {
3399 // c+cc are ln(2)/ln(10) to more than 49 bits
3400 const float c_log10
= 0x1.344134p
-2f
;
3401 const float cc_log10
= 0x1.09f79ep
-26f
;
3403 // c + cc is ln(2) to more than 49 bits
3404 const float c_log
= 0x1.62e42ep
-1f
;
3405 const float cc_log
= 0x1.efa39ep
-25f
;
3407 auto C
= B
.buildFConstant(Ty
, IsLog10
? c_log10
: c_log
);
3408 auto CC
= B
.buildFConstant(Ty
, IsLog10
? cc_log10
: cc_log
);
3410 R
= B
.buildFMul(Ty
, Y
, C
, Flags
).getReg(0);
3411 auto NegR
= B
.buildFNeg(Ty
, R
, Flags
);
3412 auto FMA0
= B
.buildFMA(Ty
, Y
, C
, NegR
, Flags
);
3413 auto FMA1
= B
.buildFMA(Ty
, Y
, CC
, FMA0
, Flags
);
3414 R
= B
.buildFAdd(Ty
, R
, FMA1
, Flags
).getReg(0);
3416 // ch+ct is ln(2)/ln(10) to more than 36 bits
3417 const float ch_log10
= 0x1.344000p
-2f
;
3418 const float ct_log10
= 0x1.3509f6p
-18f
;
3420 // ch + ct is ln(2) to more than 36 bits
3421 const float ch_log
= 0x1.62e000p
-1f
;
3422 const float ct_log
= 0x1.0bfbe8p
-15f
;
3424 auto CH
= B
.buildFConstant(Ty
, IsLog10
? ch_log10
: ch_log
);
3425 auto CT
= B
.buildFConstant(Ty
, IsLog10
? ct_log10
: ct_log
);
3427 auto MaskConst
= B
.buildConstant(Ty
, 0xfffff000);
3428 auto YH
= B
.buildAnd(Ty
, Y
, MaskConst
);
3429 auto YT
= B
.buildFSub(Ty
, Y
, YH
, Flags
);
3430 auto YTCT
= B
.buildFMul(Ty
, YT
, CT
, Flags
);
3433 getMad(B
, Ty
, YH
.getReg(0), CT
.getReg(0), YTCT
.getReg(0), Flags
);
3434 Register Mad1
= getMad(B
, Ty
, YT
.getReg(0), CH
.getReg(0), Mad0
, Flags
);
3435 R
= getMad(B
, Ty
, YH
.getReg(0), CH
.getReg(0), Mad1
, Flags
);
3438 const bool IsFiniteOnly
=
3439 (MI
.getFlag(MachineInstr::FmNoNans
) || TM
.Options
.NoNaNsFPMath
) &&
3440 (MI
.getFlag(MachineInstr::FmNoInfs
) || TM
.Options
.NoInfsFPMath
);
3442 if (!IsFiniteOnly
) {
3443 // Expand isfinite(x) => fabs(x) < inf
3444 auto Inf
= B
.buildFConstant(Ty
, APFloat::getInf(APFloat::IEEEsingle()));
3445 auto Fabs
= B
.buildFAbs(Ty
, Y
);
3447 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), Fabs
, Inf
, Flags
);
3448 R
= B
.buildSelect(Ty
, IsFinite
, R
, Y
, Flags
).getReg(0);
3452 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3454 B
.buildFConstant(Ty
, IsLog10
? 0x1.344136p
+3f
: 0x1.62e430p
+4f
);
3455 auto Shift
= B
.buildSelect(Ty
, IsScaled
, ShiftK
, Zero
, Flags
);
3456 B
.buildFSub(Dst
, R
, Shift
, Flags
);
3458 B
.buildCopy(Dst
, R
);
3461 MI
.eraseFromParent();
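
// A rough sketch of the extended-precision evaluation above: the constant
// K = ln(2) (or ln(2)/ln(10)) is kept as a pair c + cc, with c rounded to f32
// and cc holding the residual, so that
//   R  = y*c
//   R += fma(y, c, -R) + y*cc
// recovers y*K to roughly 49 significant bits even though each individual f32
// operation only carries 24.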

bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register Src, bool IsLog10,
                                             unsigned Flags) const {
  const double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  LLT Ty = B.getMRI()->getType(Dst);

  if (Ty == LLT::scalar(32)) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
    if (ScaledInput) {
      auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                        .addUse(ScaledInput)
                        .setMIFlags(Flags);
      auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
      auto Zero = B.buildFConstant(Ty, 0.0);
      auto ResultOffset =
          B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
      auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      if (ST.hasFastFMAF32())
        B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      else {
        auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
        B.buildFAdd(Dst, Mul, ResultOffset, Flags);
      }

      return true;
    }
  }

  auto Log2Operand = Ty == LLT::scalar(16)
                         ? B.buildFLog2(Ty, Src, Flags)
                         : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                               .addUse(Src)
                               .setMIFlags(Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  return true;
}
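
// The lowering above relies on log10(x) = log2(x) * ln(2)/ln(10) and
// ln(x) = log2(x) * ln(2); Log2BaseInverted is exactly that conversion factor.
// The -32.0 * Log2BaseInverted offset undoes the 2^32 scaling that
// getScaledLogInput applies when the input may be denormal.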

bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);

  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);

  auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
                                  RangeCheckConst, Flags);
  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))
                  .setMIFlags(Flags);

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}
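
// Worked example for the scaled path above: exp2(x) for x below roughly -126
// lands in the f32 denormal range, so the input is shifted up by 64 and the
// result multiplied back down, using the exact identity
//   exp2(x) = exp2(x + 64) * 2^-64,
// which keeps the hardware v_exp_f32 out of the denormal range.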

bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  LLT F32 = LLT::scalar(32);

  if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
    auto Log2E = B.buildFConstant(Ty, numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

    if (Ty == F32) {
      B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
          .addUse(Mul.getReg(0))
          .setMIFlags(Flags);
    } else {
      B.buildFExp2(Dst, Mul.getReg(0), Flags);
    }

    return true;
  }

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto Log2E = B.buildFConstant(Ty, numbers::log2e);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))
                  .setMIFlags(Flags);

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  return true;
}
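
// Both paths above use e^x = exp2(x * log2(e)). On the scaled path the
// threshold -0x1.5d58a0p+6 is roughly ln(2^-126) (about -87.3), i.e. the point
// below which e^x becomes denormal in f32, and the 0x1.969d48p-93 factor is
// approximately e^-64, undoing the +64 added to the input.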
3604 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr
&MI
,
3605 MachineIRBuilder
&B
) const {
3606 Register Dst
= MI
.getOperand(0).getReg();
3607 Register X
= MI
.getOperand(1).getReg();
3608 const unsigned Flags
= MI
.getFlags();
3609 MachineFunction
&MF
= B
.getMF();
3610 MachineRegisterInfo
&MRI
= *B
.getMRI();
3611 LLT Ty
= MRI
.getType(Dst
);
3612 const LLT F16
= LLT::scalar(16);
3613 const LLT F32
= LLT::scalar(32);
3614 const bool IsExp10
= MI
.getOpcode() == TargetOpcode::G_FEXP10
;
3617 // v_exp_f16 (fmul x, log2e)
3618 if (allowApproxFunc(MF
, Flags
)) {
3619 // TODO: Does this really require fast?
3620 legalizeFExpUnsafe(B
, Dst
, X
, Flags
);
3621 MI
.eraseFromParent();
3626 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3628 // Nothing in half is a denormal when promoted to f32.
3629 auto Ext
= B
.buildFPExt(F32
, X
, Flags
);
3630 Register Lowered
= MRI
.createGenericVirtualRegister(F32
);
3631 legalizeFExpUnsafe(B
, Lowered
, Ext
.getReg(0), Flags
);
3632 B
.buildFPTrunc(Dst
, Lowered
, Flags
);
3633 MI
.eraseFromParent();
3639 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3640 // library behavior. Also, is known-not-daz source sufficient?
3641 if (allowApproxFunc(MF
, Flags
)) {
3642 legalizeFExpUnsafe(B
, Dst
, X
, Flags
);
3643 MI
.eraseFromParent();
3649 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3651 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3652 // n = 64*m + j, 0 <= j < 64
3654 // e^x = 2^((64*m + j + f)/64)
3655 // = (2^m) * (2^(j/64)) * 2^(f/64)
3656 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3658 // f = x*(64/ln(2)) - n
3659 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3661 // e^x = (2^m) * (2^(j/64)) * e^r
3663 // (2^(j/64)) is precomputed
3665 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3668 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3670 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3671 const unsigned FlagsNoContract
= Flags
& ~MachineInstr::FmContract
;
3674 if (ST
.hasFastFMAF32()) {
3675 const float c_exp
= numbers::log2ef
;
3676 const float cc_exp
= 0x1.4ae0bep
-26f
; // c+cc are 49 bits
3677 const float c_exp10
= 0x1.a934f0p
+1f
;
3678 const float cc_exp10
= 0x1.2f346ep
-24f
;
3680 auto C
= B
.buildFConstant(Ty
, IsExp10
? c_exp10
: c_exp
);
3681 PH
= B
.buildFMul(Ty
, X
, C
, Flags
).getReg(0);
3682 auto NegPH
= B
.buildFNeg(Ty
, PH
, Flags
);
3683 auto FMA0
= B
.buildFMA(Ty
, X
, C
, NegPH
, Flags
);
3685 auto CC
= B
.buildFConstant(Ty
, IsExp10
? cc_exp10
: cc_exp
);
3686 PL
= B
.buildFMA(Ty
, X
, CC
, FMA0
, Flags
).getReg(0);
3688 const float ch_exp
= 0x1.714000p
+0f
;
3689 const float cl_exp
= 0x1.47652ap
-12f
; // ch + cl are 36 bits
3691 const float ch_exp10
= 0x1.a92000p
+1f
;
3692 const float cl_exp10
= 0x1.4f0978p
-11f
;
3694 auto MaskConst
= B
.buildConstant(Ty
, 0xfffff000);
3695 auto XH
= B
.buildAnd(Ty
, X
, MaskConst
);
3696 auto XL
= B
.buildFSub(Ty
, X
, XH
, Flags
);
3698 auto CH
= B
.buildFConstant(Ty
, IsExp10
? ch_exp10
: ch_exp
);
3699 PH
= B
.buildFMul(Ty
, XH
, CH
, Flags
).getReg(0);
3701 auto CL
= B
.buildFConstant(Ty
, IsExp10
? cl_exp10
: cl_exp
);
3702 auto XLCL
= B
.buildFMul(Ty
, XL
, CL
, Flags
);
3705 getMad(B
, Ty
, XL
.getReg(0), CH
.getReg(0), XLCL
.getReg(0), Flags
);
3706 PL
= getMad(B
, Ty
, XH
.getReg(0), CL
.getReg(0), Mad0
, Flags
);
3709 auto E
= B
.buildIntrinsicRoundeven(Ty
, PH
, Flags
);
3711 // It is unsafe to contract this fsub into the PH multiply.
3712 auto PHSubE
= B
.buildFSub(Ty
, PH
, E
, FlagsNoContract
);
3713 auto A
= B
.buildFAdd(Ty
, PHSubE
, PL
, Flags
);
3714 auto IntE
= B
.buildFPTOSI(LLT::scalar(32), E
);
3716 auto Exp2
= B
.buildIntrinsic(Intrinsic::amdgcn_exp2
, {Ty
})
3717 .addUse(A
.getReg(0))
3719 auto R
= B
.buildFLdexp(Ty
, Exp2
, IntE
, Flags
);
3721 auto UnderflowCheckConst
=
3722 B
.buildFConstant(Ty
, IsExp10
? -0x1.66d3e8p
+5f
: -0x1.9d1da0p
+6f
);
3723 auto Zero
= B
.buildFConstant(Ty
, 0.0);
3725 B
.buildFCmp(CmpInst::FCMP_OLT
, LLT::scalar(1), X
, UnderflowCheckConst
);
3727 R
= B
.buildSelect(Ty
, Underflow
, Zero
, R
);
3729 const auto &Options
= MF
.getTarget().Options
;
3731 if (!(Flags
& MachineInstr::FmNoInfs
) && !Options
.NoInfsFPMath
) {
3732 auto OverflowCheckConst
=
3733 B
.buildFConstant(Ty
, IsExp10
? 0x1.344136p
+5f
: 0x1.62e430p
+6f
);
3736 B
.buildFCmp(CmpInst::FCMP_OGT
, LLT::scalar(1), X
, OverflowCheckConst
);
3737 auto Inf
= B
.buildFConstant(Ty
, APFloat::getInf(APFloat::IEEEsingle()));
3738 R
= B
.buildSelect(Ty
, Overflow
, Inf
, R
, Flags
);
3741 B
.buildCopy(Dst
, R
);
3742 MI
.eraseFromParent();

bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT F16 = LLT::float16();
  const LLT F32 = LLT::float32();

  if (Ty == F32) {
    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
                   .addUse(Src1)
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}
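
// pow is lowered above as exp2(y * log2(x)). The multiply goes through
// amdgcn_fmul_legacy, whose DX9-style semantics treat 0 * anything (including
// an infinite or nan log2 result) as 0, which appears to be what lets cases
// like pow(x, 0) == 1 come out right without extra fixups.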

// Find a source register, ignoring any possible source modifiers.
static Register
stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT F64 = LLT::float64();
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const =
      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    assert(MRI.getType(Src0) == S32);
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
  }

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}
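
// The packing above relies on a merge of two s16 registers producing an s32
// whose low 16 bits are element 0 and whose high 16 bits are element 1, which
// bitcasts directly to the packed <2 x s16> register layout.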
3874 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3876 // Source and accumulation registers must all be 32-bits.
3878 // TODO: When the multiply is uniform, we should produce a code sequence
3879 // that is better suited to instruction selection on the SALU. Instead of
3880 // the outer loop going over parts of the result, the outer loop should go
3881 // over parts of one of the factors. This should result in instruction
3882 // selection that makes full use of S_ADDC_U32 instructions.
3883 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper
&Helper
,
3884 MutableArrayRef
<Register
> Accum
,
3885 ArrayRef
<Register
> Src0
,
3886 ArrayRef
<Register
> Src1
,
3887 bool UsePartialMad64_32
,
3888 bool SeparateOddAlignedProducts
) const {
3889 // Use (possibly empty) vectors of S1 registers to represent the set of
3890 // carries from one pair of positions to the next.
3891 using Carry
= SmallVector
<Register
, 2>;
3893 MachineIRBuilder
&B
= Helper
.MIRBuilder
;
3894 GISelKnownBits
&KB
= *Helper
.getKnownBits();
3896 const LLT S1
= LLT::scalar(1);
3897 const LLT S32
= LLT::scalar(32);
3898 const LLT S64
= LLT::scalar(64);
3903 auto getZero32
= [&]() -> Register
{
3905 Zero32
= B
.buildConstant(S32
, 0).getReg(0);
3908 auto getZero64
= [&]() -> Register
{
3910 Zero64
= B
.buildConstant(S64
, 0).getReg(0);
3914 SmallVector
<bool, 2> Src0KnownZeros
, Src1KnownZeros
;
3915 for (unsigned i
= 0; i
< Src0
.size(); ++i
) {
3916 Src0KnownZeros
.push_back(KB
.getKnownBits(Src0
[i
]).isZero());
3917 Src1KnownZeros
.push_back(KB
.getKnownBits(Src1
[i
]).isZero());
3920 // Merge the given carries into the 32-bit LocalAccum, which is modified
3923 // Returns the carry-out, which is a single S1 register or null.
3925 [&](Register
&LocalAccum
, const Carry
&CarryIn
) -> Register
{
3926 if (CarryIn
.empty())
3929 bool HaveCarryOut
= true;
3930 Register CarryAccum
;
3931 if (CarryIn
.size() == 1) {
3933 LocalAccum
= B
.buildZExt(S32
, CarryIn
[0]).getReg(0);
3937 CarryAccum
= getZero32();
3939 CarryAccum
= B
.buildZExt(S32
, CarryIn
[0]).getReg(0);
3940 for (unsigned i
= 1; i
+ 1 < CarryIn
.size(); ++i
) {
3942 B
.buildUAdde(S32
, S1
, CarryAccum
, getZero32(), CarryIn
[i
])
3947 LocalAccum
= getZero32();
3948 HaveCarryOut
= false;
3953 B
.buildUAdde(S32
, S1
, CarryAccum
, LocalAccum
, CarryIn
.back());
3954 LocalAccum
= Add
.getReg(0);
3955 return HaveCarryOut
? Add
.getReg(1) : Register();
3958 // Build a multiply-add chain to compute
3960 // LocalAccum + (partial products at DstIndex)
3961 // + (opportunistic subset of CarryIn)
3963 // LocalAccum is an array of one or two 32-bit registers that are updated
3964 // in-place. The incoming registers may be null.
3966 // In some edge cases, carry-ins can be consumed "for free". In that case,
3967 // the consumed carry bits are removed from CarryIn in-place.
3968 auto buildMadChain
=
3969 [&](MutableArrayRef
<Register
> LocalAccum
, unsigned DstIndex
, Carry
&CarryIn
)
3971 assert((DstIndex
+ 1 < Accum
.size() && LocalAccum
.size() == 2) ||
3972 (DstIndex
+ 1 >= Accum
.size() && LocalAccum
.size() == 1));
3977 // Use plain 32-bit multiplication for the most significant part of the
3978 // result by default.
3979 if (LocalAccum
.size() == 1 &&
3980 (!UsePartialMad64_32
|| !CarryIn
.empty())) {
3982 // Skip multiplication if one of the operands is 0
3983 unsigned j1
= DstIndex
- j0
;
3984 if (Src0KnownZeros
[j0
] || Src1KnownZeros
[j1
]) {
3988 auto Mul
= B
.buildMul(S32
, Src0
[j0
], Src1
[j1
]);
3989 if (!LocalAccum
[0] || KB
.getKnownBits(LocalAccum
[0]).isZero()) {
3990 LocalAccum
[0] = Mul
.getReg(0);
3992 if (CarryIn
.empty()) {
3993 LocalAccum
[0] = B
.buildAdd(S32
, LocalAccum
[0], Mul
).getReg(0);
3996 B
.buildUAdde(S32
, S1
, LocalAccum
[0], Mul
, CarryIn
.back())
4002 } while (j0
<= DstIndex
&& (!UsePartialMad64_32
|| !CarryIn
.empty()));
4005 // Build full 64-bit multiplies.
4006 if (j0
<= DstIndex
) {
4007 bool HaveSmallAccum
= false;
4010 if (LocalAccum
[0]) {
4011 if (LocalAccum
.size() == 1) {
4012 Tmp
= B
.buildAnyExt(S64
, LocalAccum
[0]).getReg(0);
4013 HaveSmallAccum
= true;
4014 } else if (LocalAccum
[1]) {
4015 Tmp
= B
.buildMergeLikeInstr(S64
, LocalAccum
).getReg(0);
4016 HaveSmallAccum
= false;
4018 Tmp
= B
.buildZExt(S64
, LocalAccum
[0]).getReg(0);
4019 HaveSmallAccum
= true;
4022 assert(LocalAccum
.size() == 1 || !LocalAccum
[1]);
4024 HaveSmallAccum
= true;
4028 unsigned j1
= DstIndex
- j0
;
4029 if (Src0KnownZeros
[j0
] || Src1KnownZeros
[j1
]) {
4033 auto Mad
= B
.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32
, {S64
, S1
},
4034 {Src0
[j0
], Src1
[j1
], Tmp
});
4035 Tmp
= Mad
.getReg(0);
4036 if (!HaveSmallAccum
)
4037 CarryOut
.push_back(Mad
.getReg(1));
4038 HaveSmallAccum
= false;
4041 } while (j0
<= DstIndex
);
4043 auto Unmerge
= B
.buildUnmerge(S32
, Tmp
);
4044 LocalAccum
[0] = Unmerge
.getReg(0);
4045 if (LocalAccum
.size() > 1)
4046 LocalAccum
[1] = Unmerge
.getReg(1);
4052 // Outer multiply loop, iterating over destination parts from least
4053 // significant to most significant parts.
4055 // The columns of the following diagram correspond to the destination parts
4056 // affected by one iteration of the outer loop (ignoring boundary
4059 // Dest index relative to 2 * i: 1 0 -1
4061 // Carries from previous iteration: e o
4062 // Even-aligned partial product sum: E E .
4063 // Odd-aligned partial product sum: O O
4065 // 'o' is OddCarry, 'e' is EvenCarry.
4066 // EE and OO are computed from partial products via buildMadChain and use
4067 // accumulation where possible and appropriate.
4069 Register SeparateOddCarry
;
4073 for (unsigned i
= 0; i
<= Accum
.size() / 2; ++i
) {
4074 Carry OddCarryIn
= std::move(OddCarry
);
4075 Carry EvenCarryIn
= std::move(EvenCarry
);
4079 // Partial products at offset 2 * i.
4080 if (2 * i
< Accum
.size()) {
4081 auto LocalAccum
= Accum
.drop_front(2 * i
).take_front(2);
4082 EvenCarry
= buildMadChain(LocalAccum
, 2 * i
, EvenCarryIn
);
4085 // Partial products at offset 2 * i - 1.
4087 if (!SeparateOddAlignedProducts
) {
4088 auto LocalAccum
= Accum
.drop_front(2 * i
- 1).take_front(2);
4089 OddCarry
= buildMadChain(LocalAccum
, 2 * i
- 1, OddCarryIn
);
4091 bool IsHighest
= 2 * i
>= Accum
.size();
4092 Register SeparateOddOut
[2];
4093 auto LocalAccum
= MutableArrayRef(SeparateOddOut
)
4094 .take_front(IsHighest
? 1 : 2);
4095 OddCarry
= buildMadChain(LocalAccum
, 2 * i
- 1, OddCarryIn
);
4101 Lo
= B
.buildUAddo(S32
, S1
, Accum
[2 * i
- 1], SeparateOddOut
[0]);
4103 Lo
= B
.buildAdd(S32
, Accum
[2 * i
- 1], SeparateOddOut
[0]);
4105 Lo
= B
.buildUAdde(S32
, S1
, Accum
[2 * i
- 1], SeparateOddOut
[0],
4108 Accum
[2 * i
- 1] = Lo
->getOperand(0).getReg();
4111 auto Hi
= B
.buildUAdde(S32
, S1
, Accum
[2 * i
], SeparateOddOut
[1],
4112 Lo
->getOperand(1).getReg());
4113 Accum
[2 * i
] = Hi
.getReg(0);
4114 SeparateOddCarry
= Hi
.getReg(1);
4119 // Add in the carries from the previous iteration
4121 if (Register CarryOut
= mergeCarry(Accum
[2 * i
- 1], OddCarryIn
))
4122 EvenCarryIn
.push_back(CarryOut
);
4124 if (2 * i
< Accum
.size()) {
4125 if (Register CarryOut
= mergeCarry(Accum
[2 * i
], EvenCarryIn
))
4126 OddCarry
.push_back(CarryOut
);

// Custom narrowing of wide multiplies using wide multiply-add instructions.
//
// TODO: If the multiply is followed by an addition, we should attempt to
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
                                      MachineInstr &MI) const {
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  assert(Ty.isScalar());

  unsigned Size = Ty.getSizeInBits();
  unsigned NumParts = Size / 32;
  assert((Size % 32) == 0);
  assert(NumParts >= 2);

  // Whether to use MAD_64_32 for partial products whose high half is
  // discarded. This avoids some ADD instructions but risks false dependency
  // stalls on some subtargets in some cases.
  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;

  // Whether to compute odd-aligned partial products separately. This is
  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
  // in an even-aligned VGPR.
  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  LLT S32 = LLT::scalar(32);
  SmallVector<Register, 2> Src0Parts, Src1Parts;
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  }
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  SmallVector<Register, 2> AccumRegs(NumParts);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}
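
// Example of the zero-input fixup: ffbh/ffbl return -1 (all ones) when the
// source is 0, so interpreting that as unsigned and taking umin with the bit
// width yields the bit width itself, which matches the G_CTLZ/G_CTTZ result
// for a zero input.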

bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();

  const LLT S32 = LLT::scalar(32);

  assert(NumBits < 32u);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
  return true;
}
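
// For a sub-32-bit source this works because shifting the any-extended value
// left by (32 - NumBits) puts its most significant bit at bit 31, so the
// 32-bit FFBH count equals the leading-zero count of the original
// NumBits-wide value (e.g. an s16 source is shifted up by 16).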

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}
4232 // Return the use branch instruction, otherwise null if the usage is invalid.
4233 static MachineInstr
*
4234 verifyCFIntrinsic(MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineInstr
*&Br
,
4235 MachineBasicBlock
*&UncondBrTarget
, bool &Negated
) {
4236 Register CondDef
= MI
.getOperand(0).getReg();
4237 if (!MRI
.hasOneNonDBGUse(CondDef
))
4240 MachineBasicBlock
*Parent
= MI
.getParent();
4241 MachineInstr
*UseMI
= &*MRI
.use_instr_nodbg_begin(CondDef
);
4243 if (isNot(MRI
, *UseMI
)) {
4244 Register NegatedCond
= UseMI
->getOperand(0).getReg();
4245 if (!MRI
.hasOneNonDBGUse(NegatedCond
))
4248 // We're deleting the def of this value, so we need to remove it.
4249 eraseInstr(*UseMI
, MRI
);
4251 UseMI
= &*MRI
.use_instr_nodbg_begin(NegatedCond
);
4255 if (UseMI
->getParent() != Parent
|| UseMI
->getOpcode() != AMDGPU::G_BRCOND
)
4258 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4259 MachineBasicBlock::iterator Next
= std::next(UseMI
->getIterator());
4260 if (Next
== Parent
->end()) {
4261 MachineFunction::iterator NextMBB
= std::next(Parent
->getIterator());
4262 if (NextMBB
== Parent
->getParent()->end()) // Illegal intrinsic use.
4264 UncondBrTarget
= &*NextMBB
;
4266 if (Next
->getOpcode() != AMDGPU::G_BR
)
4269 UncondBrTarget
= Br
->getOperand(0).getMBB();
4275 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg
, MachineIRBuilder
&B
,
4276 const ArgDescriptor
*Arg
,
4277 const TargetRegisterClass
*ArgRC
,
4279 MCRegister SrcReg
= Arg
->getRegister();
4280 assert(Register::isPhysicalRegister(SrcReg
) && "Physical register expected");
4281 assert(DstReg
.isVirtual() && "Virtual register expected");
4283 Register LiveIn
= getFunctionLiveInPhysReg(B
.getMF(), B
.getTII(), SrcReg
,
4284 *ArgRC
, B
.getDebugLoc(), ArgTy
);
4285 if (Arg
->isMasked()) {
4286 // TODO: Should we try to emit this once in the entry block?
4287 const LLT S32
= LLT::scalar(32);
4288 const unsigned Mask
= Arg
->getMask();
4289 const unsigned Shift
= llvm::countr_zero
<unsigned>(Mask
);
4291 Register AndMaskSrc
= LiveIn
;
4293 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4296 auto ShiftAmt
= B
.buildConstant(S32
, Shift
);
4297 AndMaskSrc
= B
.buildLShr(S32
, LiveIn
, ShiftAmt
).getReg(0);
4300 B
.buildAnd(DstReg
, AndMaskSrc
, B
.buildConstant(S32
, Mask
>> Shift
));
4302 B
.buildCopy(DstReg
, LiveIn
);
4308 bool AMDGPULegalizerInfo::loadInputValue(
4309 Register DstReg
, MachineIRBuilder
&B
,
4310 AMDGPUFunctionArgInfo::PreloadedValue ArgType
) const {
4311 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4312 const ArgDescriptor
*Arg
= nullptr;
4313 const TargetRegisterClass
*ArgRC
;
4316 CallingConv::ID CC
= B
.getMF().getFunction().getCallingConv();
4317 const ArgDescriptor WorkGroupIDX
=
4318 ArgDescriptor::createRegister(AMDGPU::TTMP9
);
4319 // If GridZ is not programmed in an entry function then the hardware will set
4320 // it to all zeros, so there is no need to mask the GridY value in the low
4322 const ArgDescriptor WorkGroupIDY
= ArgDescriptor::createRegister(
4324 AMDGPU::isEntryFunctionCC(CC
) && !MFI
->hasWorkGroupIDZ() ? ~0u : 0xFFFFu
);
4325 const ArgDescriptor WorkGroupIDZ
=
4326 ArgDescriptor::createRegister(AMDGPU::TTMP7
, 0xFFFF0000u
);
4327 if (ST
.hasArchitectedSGPRs() &&
4328 (AMDGPU::isCompute(CC
) || CC
== CallingConv::AMDGPU_Gfx
)) {
4330 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X
:
4331 Arg
= &WorkGroupIDX
;
4332 ArgRC
= &AMDGPU::SReg_32RegClass
;
4333 ArgTy
= LLT::scalar(32);
4335 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y
:
4336 Arg
= &WorkGroupIDY
;
4337 ArgRC
= &AMDGPU::SReg_32RegClass
;
4338 ArgTy
= LLT::scalar(32);
4340 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
:
4341 Arg
= &WorkGroupIDZ
;
4342 ArgRC
= &AMDGPU::SReg_32RegClass
;
4343 ArgTy
= LLT::scalar(32);
4351 std::tie(Arg
, ArgRC
, ArgTy
) = MFI
->getPreloadedValue(ArgType
);
4354 if (ArgType
== AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
) {
4355 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4356 // case the pointer argument may be missing and we use null.
4357 B
.buildConstant(DstReg
, 0);
4361 // It's undefined behavior if a function marked with the amdgpu-no-*
4362 // attributes uses the corresponding intrinsic.
4363 B
.buildUndef(DstReg
);
4367 if (!Arg
->isRegister() || !Arg
->getRegister().isValid())
4368 return false; // TODO: Handle these
4369 return loadInputValue(DstReg
, B
, Arg
, ArgRC
, ArgTy
);

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}
4389 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4390 MachineInstr
&MI
, MachineRegisterInfo
&MRI
, MachineIRBuilder
&B
,
4391 unsigned Dim
, AMDGPUFunctionArgInfo::PreloadedValue ArgType
) const {
4392 unsigned MaxID
= ST
.getMaxWorkitemID(B
.getMF().getFunction(), Dim
);
4394 return replaceWithConstant(B
, MI
, 0);
4396 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4397 const ArgDescriptor
*Arg
;
4398 const TargetRegisterClass
*ArgRC
;
4400 std::tie(Arg
, ArgRC
, ArgTy
) = MFI
->getPreloadedValue(ArgType
);
4402 Register DstReg
= MI
.getOperand(0).getReg();
4404 // It's undefined behavior if a function marked with the amdgpu-no-*
4405 // attributes uses the corresponding intrinsic.
4406 B
.buildUndef(DstReg
);
4407 MI
.eraseFromParent();
4411 if (Arg
->isMasked()) {
4412 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4413 // masking operations anyway.
4415 // TODO: We could assert the top bit is 0 for the source copy.
4416 if (!loadInputValue(DstReg
, B
, ArgType
))
4419 Register TmpReg
= MRI
.createGenericVirtualRegister(LLT::scalar(32));
4420 if (!loadInputValue(TmpReg
, B
, ArgType
))
4422 B
.buildAssertZExt(DstReg
, TmpReg
, llvm::bit_width(MaxID
));
4425 MI
.eraseFromParent();

Register
AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                            int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(KernArgReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  // TODO: Should get nuw
  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}

/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(0).getReg();

  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
              MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
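
// Sketch of the math above: Z starts out as roughly 2^32 / y, formed from the
// f32 reciprocal of y scaled by ~2^32 (0x4f7ffffe). The "UNR" step
// Z += umulh(Z, -y*Z) corresponds to one Newton-Raphson refinement of that
// fixed-point reciprocal, so umulh(x, Z) is a quotient estimate that the two
// compare/select refinements then correct by at most one each.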

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      S32, CvtHi, // 2**32
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
      Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
4572 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder
&B
,
4576 Register Denom
) const {
4577 const LLT S32
= LLT::scalar(32);
4578 const LLT S64
= LLT::scalar(64);
4579 const LLT S1
= LLT::scalar(1);
4580 Register RcpLo
, RcpHi
;
4582 std::tie(RcpLo
, RcpHi
) = emitReciprocalU64(B
, Denom
);
4584 auto Rcp
= B
.buildMergeLikeInstr(S64
, {RcpLo
, RcpHi
});
4586 auto Zero64
= B
.buildConstant(S64
, 0);
4587 auto NegDenom
= B
.buildSub(S64
, Zero64
, Denom
);
4589 auto MulLo1
= B
.buildMul(S64
, NegDenom
, Rcp
);
4590 auto MulHi1
= B
.buildUMulH(S64
, Rcp
, MulLo1
);
4592 auto UnmergeMulHi1
= B
.buildUnmerge(S32
, MulHi1
);
4593 Register MulHi1_Lo
= UnmergeMulHi1
.getReg(0);
4594 Register MulHi1_Hi
= UnmergeMulHi1
.getReg(1);
4596 auto Add1_Lo
= B
.buildUAddo(S32
, S1
, RcpLo
, MulHi1_Lo
);
4597 auto Add1_Hi
= B
.buildUAdde(S32
, S1
, RcpHi
, MulHi1_Hi
, Add1_Lo
.getReg(1));
4598 auto Add1
= B
.buildMergeLikeInstr(S64
, {Add1_Lo
, Add1_Hi
});
4600 auto MulLo2
= B
.buildMul(S64
, NegDenom
, Add1
);
4601 auto MulHi2
= B
.buildUMulH(S64
, Add1
, MulLo2
);
4602 auto UnmergeMulHi2
= B
.buildUnmerge(S32
, MulHi2
);
4603 Register MulHi2_Lo
= UnmergeMulHi2
.getReg(0);
4604 Register MulHi2_Hi
= UnmergeMulHi2
.getReg(1);
4606 auto Zero32
= B
.buildConstant(S32
, 0);
4607 auto Add2_Lo
= B
.buildUAddo(S32
, S1
, Add1_Lo
, MulHi2_Lo
);
4608 auto Add2_Hi
= B
.buildUAdde(S32
, S1
, Add1_Hi
, MulHi2_Hi
, Add2_Lo
.getReg(1));
4609 auto Add2
= B
.buildMergeLikeInstr(S64
, {Add2_Lo
, Add2_Hi
});
4611 auto UnmergeNumer
= B
.buildUnmerge(S32
, Numer
);
4612 Register NumerLo
= UnmergeNumer
.getReg(0);
4613 Register NumerHi
= UnmergeNumer
.getReg(1);
4615 auto MulHi3
= B
.buildUMulH(S64
, Numer
, Add2
);
4616 auto Mul3
= B
.buildMul(S64
, Denom
, MulHi3
);
4617 auto UnmergeMul3
= B
.buildUnmerge(S32
, Mul3
);
4618 Register Mul3_Lo
= UnmergeMul3
.getReg(0);
4619 Register Mul3_Hi
= UnmergeMul3
.getReg(1);
4620 auto Sub1_Lo
= B
.buildUSubo(S32
, S1
, NumerLo
, Mul3_Lo
);
4621 auto Sub1_Hi
= B
.buildUSube(S32
, S1
, NumerHi
, Mul3_Hi
, Sub1_Lo
.getReg(1));
4622 auto Sub1_Mi
= B
.buildSub(S32
, NumerHi
, Mul3_Hi
);
4623 auto Sub1
= B
.buildMergeLikeInstr(S64
, {Sub1_Lo
, Sub1_Hi
});
4625 auto UnmergeDenom
= B
.buildUnmerge(S32
, Denom
);
4626 Register DenomLo
= UnmergeDenom
.getReg(0);
4627 Register DenomHi
= UnmergeDenom
.getReg(1);
4629 auto CmpHi
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub1_Hi
, DenomHi
);
4630 auto C1
= B
.buildSExt(S32
, CmpHi
);
4632 auto CmpLo
= B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub1_Lo
, DenomLo
);
4633 auto C2
= B
.buildSExt(S32
, CmpLo
);
4635 auto CmpEq
= B
.buildICmp(CmpInst::ICMP_EQ
, S1
, Sub1_Hi
, DenomHi
);
4636 auto C3
= B
.buildSelect(S32
, CmpEq
, C2
, C1
);
4638 // TODO: Here and below portions of the code can be enclosed into if/endif.
4639 // Currently control flow is unconditional and we have 4 selects after
4640 // potential endif to substitute PHIs.
4643 auto Sub2_Lo
= B
.buildUSubo(S32
, S1
, Sub1_Lo
, DenomLo
);
4644 auto Sub2_Mi
= B
.buildUSube(S32
, S1
, Sub1_Mi
, DenomHi
, Sub1_Lo
.getReg(1));
4645 auto Sub2_Hi
= B
.buildUSube(S32
, S1
, Sub2_Mi
, Zero32
, Sub2_Lo
.getReg(1));
4646 auto Sub2
= B
.buildMergeLikeInstr(S64
, {Sub2_Lo
, Sub2_Hi
});
4648 auto One64
= B
.buildConstant(S64
, 1);
4649 auto Add3
= B
.buildAdd(S64
, MulHi3
, One64
);
4652 B
.buildSExt(S32
, B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub2_Hi
, DenomHi
));
4654 B
.buildSExt(S32
, B
.buildICmp(CmpInst::ICMP_UGE
, S1
, Sub2_Lo
, DenomLo
));
4655 auto C6
= B
.buildSelect(
4656 S32
, B
.buildICmp(CmpInst::ICMP_EQ
, S1
, Sub2_Hi
, DenomHi
), C5
, C4
);
4659 auto Add4
= B
.buildAdd(S64
, Add3
, One64
);
4660 auto Sub3_Lo
= B
.buildUSubo(S32
, S1
, Sub2_Lo
, DenomLo
);
4662 auto Sub3_Mi
= B
.buildUSube(S32
, S1
, Sub2_Mi
, DenomHi
, Sub2_Lo
.getReg(1));
4663 auto Sub3_Hi
= B
.buildUSube(S32
, S1
, Sub3_Mi
, Zero32
, Sub3_Lo
.getReg(1));
4664 auto Sub3
= B
.buildMergeLikeInstr(S64
, {Sub3_Lo
, Sub3_Hi
});
4670 auto Sel1
= B
.buildSelect(
4671 S64
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C6
, Zero32
), Add4
, Add3
);
4672 B
.buildSelect(DstDivReg
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C3
, Zero32
),
4677 auto Sel2
= B
.buildSelect(
4678 S64
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C6
, Zero32
), Sub3
, Sub2
);
4679 B
.buildSelect(DstRemReg
, B
.buildICmp(CmpInst::ICMP_NE
, S1
, C3
, Zero32
),

bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}
4724 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr
&MI
,
4725 MachineRegisterInfo
&MRI
,
4726 MachineIRBuilder
&B
) const {
4727 const LLT S64
= LLT::scalar(64);
4728 const LLT S32
= LLT::scalar(32);
4730 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
4731 if (Ty
!= S32
&& Ty
!= S64
)
4734 const unsigned FirstSrcOpIdx
= MI
.getNumExplicitDefs();
4735 Register LHS
= MI
.getOperand(FirstSrcOpIdx
).getReg();
4736 Register RHS
= MI
.getOperand(FirstSrcOpIdx
+ 1).getReg();
4738 auto SignBitOffset
= B
.buildConstant(S32
, Ty
.getSizeInBits() - 1);
4739 auto LHSign
= B
.buildAShr(Ty
, LHS
, SignBitOffset
);
4740 auto RHSign
= B
.buildAShr(Ty
, RHS
, SignBitOffset
);
4742 LHS
= B
.buildAdd(Ty
, LHS
, LHSign
).getReg(0);
4743 RHS
= B
.buildAdd(Ty
, RHS
, RHSign
).getReg(0);
4745 LHS
= B
.buildXor(Ty
, LHS
, LHSign
).getReg(0);
4746 RHS
= B
.buildXor(Ty
, RHS
, RHSign
).getReg(0);
4748 Register DstDivReg
, DstRemReg
, TmpDivReg
, TmpRemReg
;
4749 switch (MI
.getOpcode()) {
4751 llvm_unreachable("Unexpected opcode!");
4752 case AMDGPU::G_SDIV
: {
4753 DstDivReg
= MI
.getOperand(0).getReg();
4754 TmpDivReg
= MRI
.createGenericVirtualRegister(Ty
);
4757 case AMDGPU::G_SREM
: {
4758 DstRemReg
= MI
.getOperand(0).getReg();
4759 TmpRemReg
= MRI
.createGenericVirtualRegister(Ty
);
4762 case AMDGPU::G_SDIVREM
: {
4763 DstDivReg
= MI
.getOperand(0).getReg();
4764 DstRemReg
= MI
.getOperand(1).getReg();
4765 TmpDivReg
= MRI
.createGenericVirtualRegister(Ty
);
4766 TmpRemReg
= MRI
.createGenericVirtualRegister(Ty
);
4772 legalizeUnsignedDIV_REM32Impl(B
, TmpDivReg
, TmpRemReg
, LHS
, RHS
);
4774 legalizeUnsignedDIV_REM64Impl(B
, TmpDivReg
, TmpRemReg
, LHS
, RHS
);
4777 auto Sign
= B
.buildXor(Ty
, LHSign
, RHSign
).getReg(0);
4778 auto SignXor
= B
.buildXor(Ty
, TmpDivReg
, Sign
).getReg(0);
4779 B
.buildSub(DstDivReg
, SignXor
, Sign
);
4783 auto Sign
= LHSign
.getReg(0); // Remainder sign is the same as LHS
4784 auto SignXor
= B
.buildXor(Ty
, TmpRemReg
, Sign
).getReg(0);
4785 B
.buildSub(DstRemReg
, SignXor
, Sign
);
4788 MI
.eraseFromParent();
4792 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr
&MI
,
4793 MachineRegisterInfo
&MRI
,
4794 MachineIRBuilder
&B
) const {
4795 Register Res
= MI
.getOperand(0).getReg();
4796 Register LHS
= MI
.getOperand(1).getReg();
4797 Register RHS
= MI
.getOperand(2).getReg();
4798 uint16_t Flags
= MI
.getFlags();
4799 LLT ResTy
= MRI
.getType(Res
);
4801 const MachineFunction
&MF
= B
.getMF();
4802 bool AllowInaccurateRcp
= MI
.getFlag(MachineInstr::FmAfn
) ||
4803 MF
.getTarget().Options
.UnsafeFPMath
;
4805 if (const auto *CLHS
= getConstantFPVRegVal(LHS
, MRI
)) {
4806 if (!AllowInaccurateRcp
&& ResTy
!= LLT::scalar(16))
4809 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4810 // the CI documentation has a worst case error of 1 ulp.
4811 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4812 // use it as long as we aren't trying to use denormals.
4814 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4817 if (CLHS
->isExactlyValue(1.0)) {
4818 B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, Res
)
4822 MI
.eraseFromParent();
4826 // -1 / x -> RCP( FNEG(x) )
4827 if (CLHS
->isExactlyValue(-1.0)) {
4828 auto FNeg
= B
.buildFNeg(ResTy
, RHS
, Flags
);
4829 B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, Res
)
4830 .addUse(FNeg
.getReg(0))
4833 MI
.eraseFromParent();
4838 // For f16 require afn or arcp.
4839 // For f32 require afn.
4840 if (!AllowInaccurateRcp
&& (ResTy
!= LLT::scalar(16) ||
4841 !MI
.getFlag(MachineInstr::FmArcp
)))
4844 // x / y -> x * (1.0 / y)
4845 auto RCP
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {ResTy
})
4848 B
.buildFMul(Res
, LHS
, RCP
, Flags
);
4850 MI
.eraseFromParent();
4854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr
&MI
,
4855 MachineRegisterInfo
&MRI
,
4856 MachineIRBuilder
&B
) const {
4857 Register Res
= MI
.getOperand(0).getReg();
4858 Register X
= MI
.getOperand(1).getReg();
4859 Register Y
= MI
.getOperand(2).getReg();
4860 uint16_t Flags
= MI
.getFlags();
4861 LLT ResTy
= MRI
.getType(Res
);
4863 const MachineFunction
&MF
= B
.getMF();
4864 bool AllowInaccurateRcp
= MF
.getTarget().Options
.UnsafeFPMath
||
4865 MI
.getFlag(MachineInstr::FmAfn
);
4867 if (!AllowInaccurateRcp
)
4870 auto NegY
= B
.buildFNeg(ResTy
, Y
);
4871 auto One
= B
.buildFConstant(ResTy
, 1.0);
4873 auto R
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {ResTy
})
4877 auto Tmp0
= B
.buildFMA(ResTy
, NegY
, R
, One
);
4878 R
= B
.buildFMA(ResTy
, Tmp0
, R
, R
);
4880 auto Tmp1
= B
.buildFMA(ResTy
, NegY
, R
, One
);
4881 R
= B
.buildFMA(ResTy
, Tmp1
, R
, R
);
4883 auto Ret
= B
.buildFMul(ResTy
, X
, R
);
4884 auto Tmp2
= B
.buildFMA(ResTy
, NegY
, Ret
, X
);
4886 B
.buildFMA(Res
, Tmp2
, R
, Ret
);
4887 MI
.eraseFromParent();
4891 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr
&MI
,
4892 MachineRegisterInfo
&MRI
,
4893 MachineIRBuilder
&B
) const {
4894 if (legalizeFastUnsafeFDIV(MI
, MRI
, B
))
4897 Register Res
= MI
.getOperand(0).getReg();
4898 Register LHS
= MI
.getOperand(1).getReg();
4899 Register RHS
= MI
.getOperand(2).getReg();
4901 uint16_t Flags
= MI
.getFlags();
4903 LLT S16
= LLT::scalar(16);
4904 LLT S32
= LLT::scalar(32);
4906 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4907 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4908 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4909 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4910 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4911 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
4912 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4913 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4914 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4915 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4916 // q16.u = opx(V_CVT_F16_F32, q32.u);
4917 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4919 auto LHSExt
= B
.buildFPExt(S32
, LHS
, Flags
);
4920 auto RHSExt
= B
.buildFPExt(S32
, RHS
, Flags
);
4921 auto NegRHSExt
= B
.buildFNeg(S32
, RHSExt
);
4922 auto Rcp
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S32
})
4923 .addUse(RHSExt
.getReg(0))
4925 auto Quot
= B
.buildFMul(S32
, LHSExt
, Rcp
, Flags
);
4926 MachineInstrBuilder Err
;
4927 if (ST
.hasMadMacF32Insts()) {
4928 Err
= B
.buildFMAD(S32
, NegRHSExt
, Quot
, LHSExt
, Flags
);
4929 Quot
= B
.buildFMAD(S32
, Err
, Rcp
, Quot
, Flags
);
4930 Err
= B
.buildFMAD(S32
, NegRHSExt
, Quot
, LHSExt
, Flags
);
4932 Err
= B
.buildFMA(S32
, NegRHSExt
, Quot
, LHSExt
, Flags
);
4933 Quot
= B
.buildFMA(S32
, Err
, Rcp
, Quot
, Flags
);
4934 Err
= B
.buildFMA(S32
, NegRHSExt
, Quot
, LHSExt
, Flags
);
4936 auto Tmp
= B
.buildFMul(S32
, Err
, Rcp
, Flags
);
4937 Tmp
= B
.buildAnd(S32
, Tmp
, B
.buildConstant(S32
, 0xff800000));
4938 Quot
= B
.buildFAdd(S32
, Tmp
, Quot
, Flags
);
4939 auto RDst
= B
.buildFPTrunc(S16
, Quot
, Flags
);
4940 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, Res
)
4941 .addUse(RDst
.getReg(0))
4946 MI
.eraseFromParent();

static constexpr unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
}
4977 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr
&MI
,
4978 MachineRegisterInfo
&MRI
,
4979 MachineIRBuilder
&B
) const {
4980 if (legalizeFastUnsafeFDIV(MI
, MRI
, B
))
4983 Register Res
= MI
.getOperand(0).getReg();
4984 Register LHS
= MI
.getOperand(1).getReg();
4985 Register RHS
= MI
.getOperand(2).getReg();
4986 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
4987 SIModeRegisterDefaults Mode
= MFI
->getMode();
4989 uint16_t Flags
= MI
.getFlags();
4991 LLT S32
= LLT::scalar(32);
4992 LLT S1
= LLT::scalar(1);
4994 auto One
= B
.buildFConstant(S32
, 1.0f
);
4996 auto DenominatorScaled
=
4997 B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S32
, S1
})
5002 auto NumeratorScaled
=
5003 B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S32
, S1
})
5009 auto ApproxRcp
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S32
})
5010 .addUse(DenominatorScaled
.getReg(0))
5012 auto NegDivScale0
= B
.buildFNeg(S32
, DenominatorScaled
, Flags
);
5014 const bool PreservesDenormals
= Mode
.FP32Denormals
== DenormalMode::getIEEE();
5015 const bool HasDynamicDenormals
=
5016 (Mode
.FP32Denormals
.Input
== DenormalMode::Dynamic
) ||
5017 (Mode
.FP32Denormals
.Output
== DenormalMode::Dynamic
);
5019 Register SavedSPDenormMode
;
5020 if (!PreservesDenormals
) {
5021 if (HasDynamicDenormals
) {
5022 SavedSPDenormMode
= MRI
.createVirtualRegister(&AMDGPU::SReg_32RegClass
);
5023 B
.buildInstr(AMDGPU::S_GETREG_B32
)
5024 .addDef(SavedSPDenormMode
)
5025 .addImm(SPDenormModeBitField
);
5027 toggleSPDenormMode(true, B
, ST
, Mode
);
5030 auto Fma0
= B
.buildFMA(S32
, NegDivScale0
, ApproxRcp
, One
, Flags
);
5031 auto Fma1
= B
.buildFMA(S32
, Fma0
, ApproxRcp
, ApproxRcp
, Flags
);
5032 auto Mul
= B
.buildFMul(S32
, NumeratorScaled
, Fma1
, Flags
);
5033 auto Fma2
= B
.buildFMA(S32
, NegDivScale0
, Mul
, NumeratorScaled
, Flags
);
5034 auto Fma3
= B
.buildFMA(S32
, Fma2
, Fma1
, Mul
, Flags
);
5035 auto Fma4
= B
.buildFMA(S32
, NegDivScale0
, Fma3
, NumeratorScaled
, Flags
);
5037 if (!PreservesDenormals
) {
5038 if (HasDynamicDenormals
) {
5039 assert(SavedSPDenormMode
);
5040 B
.buildInstr(AMDGPU::S_SETREG_B32
)
5041 .addReg(SavedSPDenormMode
)
5042 .addImm(SPDenormModeBitField
);
5044 toggleSPDenormMode(false, B
, ST
, Mode
);
5047 auto Fmas
= B
.buildIntrinsic(Intrinsic::amdgcn_div_fmas
, {S32
})
5048 .addUse(Fma4
.getReg(0))
5049 .addUse(Fma1
.getReg(0))
5050 .addUse(Fma3
.getReg(0))
5051 .addUse(NumeratorScaled
.getReg(1))
5054 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, Res
)
5055 .addUse(Fmas
.getReg(0))
5060 MI
.eraseFromParent();
5064 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr
&MI
,
5065 MachineRegisterInfo
&MRI
,
5066 MachineIRBuilder
&B
) const {
5067 if (legalizeFastUnsafeFDIV64(MI
, MRI
, B
))
5070 Register Res
= MI
.getOperand(0).getReg();
5071 Register LHS
= MI
.getOperand(1).getReg();
5072 Register RHS
= MI
.getOperand(2).getReg();
5074 uint16_t Flags
= MI
.getFlags();
5076 LLT S64
= LLT::scalar(64);
5077 LLT S1
= LLT::scalar(1);
5079 auto One
= B
.buildFConstant(S64
, 1.0);
5081 auto DivScale0
= B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S64
, S1
})
5087 auto NegDivScale0
= B
.buildFNeg(S64
, DivScale0
.getReg(0), Flags
);
5089 auto Rcp
= B
.buildIntrinsic(Intrinsic::amdgcn_rcp
, {S64
})
5090 .addUse(DivScale0
.getReg(0))
5093 auto Fma0
= B
.buildFMA(S64
, NegDivScale0
, Rcp
, One
, Flags
);
5094 auto Fma1
= B
.buildFMA(S64
, Rcp
, Fma0
, Rcp
, Flags
);
5095 auto Fma2
= B
.buildFMA(S64
, NegDivScale0
, Fma1
, One
, Flags
);
5097 auto DivScale1
= B
.buildIntrinsic(Intrinsic::amdgcn_div_scale
, {S64
, S1
})
5103 auto Fma3
= B
.buildFMA(S64
, Fma1
, Fma2
, Fma1
, Flags
);
5104 auto Mul
= B
.buildFMul(S64
, DivScale1
.getReg(0), Fma3
, Flags
);
5105 auto Fma4
= B
.buildFMA(S64
, NegDivScale0
, Mul
, DivScale1
.getReg(0), Flags
);
5108 if (!ST
.hasUsableDivScaleConditionOutput()) {
5109 // Workaround a hardware bug on SI where the condition output from div_scale
5112 LLT S32
= LLT::scalar(32);
5114 auto NumUnmerge
= B
.buildUnmerge(S32
, LHS
);
5115 auto DenUnmerge
= B
.buildUnmerge(S32
, RHS
);
5116 auto Scale0Unmerge
= B
.buildUnmerge(S32
, DivScale0
);
5117 auto Scale1Unmerge
= B
.buildUnmerge(S32
, DivScale1
);
5119 auto CmpNum
= B
.buildICmp(ICmpInst::ICMP_EQ
, S1
, NumUnmerge
.getReg(1),
5120 Scale1Unmerge
.getReg(1));
5121 auto CmpDen
= B
.buildICmp(ICmpInst::ICMP_EQ
, S1
, DenUnmerge
.getReg(1),
5122 Scale0Unmerge
.getReg(1));
5123 Scale
= B
.buildXor(S1
, CmpNum
, CmpDen
).getReg(0);
5125 Scale
= DivScale1
.getReg(1);
5128 auto Fmas
= B
.buildIntrinsic(Intrinsic::amdgcn_div_fmas
, {S64
})
5129 .addUse(Fma4
.getReg(0))
5130 .addUse(Fma3
.getReg(0))
5131 .addUse(Mul
.getReg(0))
5135 B
.buildIntrinsic(Intrinsic::amdgcn_div_fixup
, ArrayRef(Res
))
5136 .addUse(Fmas
.getReg(0))
5141 MI
.eraseFromParent();

bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(0).getReg();
  Register Res1 = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Res0);
  LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
                  .addUse(Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
                 .addUse(Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // Bypass the correct expansion a standard promotion through G_FSQRT would
  // get. The f32 op is accurate enough for the f16 case.
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());
  const LLT F32 = LLT::scalar(32);
  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
                  .setMIFlags(Flags);
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(1);
  const LLT F32 = LLT::scalar(32);
  const LLT I32 = LLT::scalar(32);

  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
        .addUse(X)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
        .addUse(SqrtX.getReg(0))
        .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);

    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
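
// Scaling rationale for the f32 path above: inputs smaller than 0x1.0p-96 are
// multiplied by 0x1.0p+32 before the square root and the result is scaled
// back by 0x1.0p-16, since sqrt(x * 2^32) == sqrt(x) * 2^16.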
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
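
// Scaling rationale for the f64 path above: inputs below 0x1.0p-767 are
// scaled with ldexp(x, 256) before the Goldschmidt iteration and the result
// is rescaled with ldexp(r, -128), since sqrt(x * 2^256) == sqrt(x) * 2^128.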
bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty == LLT::scalar(32))
    return legalizeFSQRTF32(MI, MRI, B);
  if (Ty == LLT::scalar(64))
    return legalizeFSQRTF64(MI, MRI, B);
  if (Ty == LLT::scalar(16))
    return legalizeFSQRTF16(MI, MRI, B);
  return false;
}
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the rsq result (quieted or not) with whichever min/max will directly
  // select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}
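
// The clamp above saturates the rsq result to +/- the largest finite value of
// the type (APFloat::getLargest), using the IEEE or non-IEEE min/max variants
// depending on the function's floating-point mode.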
// TODO: Fix pointer type handling
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

  auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
                                      Register Src2, LLT VT) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      Register Src3 = MI.getOperand(5).getReg();
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
          .addUse(Src2)
          .addUse(Src3)
          .addImm(Src4)
          .addImm(Src5)
          .getReg(0);
    }
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
          .getReg(0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();
    }
  }

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
    SplitSize = 64;

  if (Size == SplitSize) {
    // Already legal.
    return true;
  }

  if (Size < 32) {
    const LLT S32 = LLT::scalar(32);
    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();
    return true;
  }

  if (Size % SplitSize != 0)
    return false;

  LLT PartialResTy = LLT::scalar(SplitSize);
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
    }
    // Handle all other cases via S32/S64 pieces;
  }

  SmallVector<Register, 4> PartialRes;
  unsigned NumParts = Size / SplitSize;
  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
  MachineInstrBuilder Src1Parts, Src2Parts;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
  }

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
  return true;
}
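
// Example of the splitting above: a 64-bit readlane is unmerged into two
// 32-bit pieces, a lane op is emitted for each piece, and the results are
// re-merged into the original 64-bit destination. Sub-32-bit values are
// any-extended to 32 bits and truncated back afterwards.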
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  MachineInstrBuilder NewHighHalf = Masked;
  std::optional<ValueAndVReg> StrideConst =
      getIConstantVRegValWithLookThrough(Stride, MRI);
  if (!StrideConst || !StrideConst->Value.isZero()) {
    MachineInstrBuilder ShiftedStride;
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  Register NewHighHalfReg = NewHighHalf.getReg(0);
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  return true;
}
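
// Illustrative example (hypothetical values): for a raw buffer with stride 0,
// only the AND with 0x0000ffff is emitted; for a constant stride of 64 the
// high half becomes (hi16(pointer) | (64 << 16)) before the final merge with
// the number-of-records and flags words.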
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  return KnownSize.has_value();
}
bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}
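
// Worked example (assuming a 4095-byte immediate limit from
// getMaxMUBUFImmOffset): an incoming constant offset of 4100 splits into
// ImmOffset = 4 and an Overflow of 4096 that is materialized or added into
// the 32-bit voffset register, which improves the chance of CSE across
// similar accesses.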
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder
&B
,
5834 Register VData
, LLT MemTy
,
5835 bool IsFormat
) const {
5836 MachineRegisterInfo
*MRI
= B
.getMRI();
5837 LLT Ty
= MRI
->getType(VData
);
5839 const LLT S16
= LLT::scalar(16);
5841 // Fixup buffer resources themselves needing to be v4i128.
5842 if (hasBufferRsrcWorkaround(Ty
))
5843 return castBufferRsrcToV4I32(VData
, B
);
5845 if (shouldBitcastLoadStoreType(ST
, Ty
, MemTy
)) {
5846 Ty
= getBitcastRegisterType(Ty
);
5847 VData
= B
.buildBitcast(Ty
, VData
).getReg(0);
5849 // Fixup illegal register types for i8 stores.
5850 if (Ty
== LLT::scalar(8) || Ty
== S16
) {
5851 Register AnyExt
= B
.buildAnyExt(LLT::scalar(32), VData
).getReg(0);
5855 if (Ty
.isVector()) {
5856 if (Ty
.getElementType() == S16
&& Ty
.getNumElements() <= 4) {
5858 return handleD16VData(B
, *MRI
, VData
);
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              LegalizerHelper &Helper,
                                              bool IsTyped,
                                              bool IsFormat) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();
  LLT MemTy = MMO->getMemoryType();

  VData = fixStoreSourceType(B, VData, MemTy, IsFormat);

  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)      // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
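
// The generic G_AMDGPU_BUFFER_STORE* instruction built above carries its
// operands in the same order that buildBufferLoad below consumes: vdata,
// rsrc, vindex, voffset, soffset, immediate offset, optional format,
// cachepolicy/idxen immediates, and the memory operand.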
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg) // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);
}
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             LegalizerHelper &Helper,
                                             bool IsFormat,
                                             bool IsTyped) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Observer.changingInstr(MI);
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
    Ty = getBitcastRegisterType(Ty);
    Observer.changingInstr(MI);
    Helper.bitcastDst(MI, Ty, 0);
    Observer.changedInstr(MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself.
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
                 .addDef(Dst)
                 .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
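
// Example: four consecutive s32 vaddr operands starting at DimIdx are rebuilt
// as a single <4 x s32> build_vector placed in the first operand, and the
// three remaining register operands are replaced with $noreg.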
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to exposes all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData;
  LLT Ty;

  if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);
  }

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 =
      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
  else if (BaseOpcode->NoReturn)
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector() && !IsAtomicPacked16Bit)
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {
    // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
    // instructions expect VGPR_32
    SmallVector<Register, 4> PackedRegs;

    packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);

    // See also below in the non-a16 branch
    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {
      // Pack registers that would go over NSAMaxSize into last VAddr register
      LLT PackedAddrTy =
          LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);
      PackedRegs.resize(1);
    }

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
      MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
      if (!SrcOp.isReg()) {
        assert(SrcOp.isImm() && SrcOp.getImm() == 0);
        continue;
      }

      assert(SrcOp.getReg() != AMDGPU::NoRegister);

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
      else
        SrcOp.setReg(AMDGPU::NoRegister);
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // Partial NSA is allowed on GFX11+ where the final register is a contiguous
    // set of the remaining addresses.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
      convertImageAddrToPacked(B, MI,
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
    }
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->NoReturn) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  // Image atomic instructions are using DMask to specify how many bits
  // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
  // DMaskLanes for image atomic has default value '0'.
  // We must be sure that atomic variants (especially packed) will not be
  // truncated from v2s16 or v4s16 to s16 type.
  //
  // ChangeElementCount will be needed for image load where Ty is always scalar.
  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      DMaskLanes == 0
          ? Ty
          : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

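// Lower llvm.amdgcn.s.buffer.prefetch.data to G_AMDGPU_S_BUFFER_PREFETCH,
// dropping the intrinsic ID operand and rewriting a buffer resource argument
// back to v4i32 form where required.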
bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
                                                  MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0); // Remove intrinsic ID
  castBufferRsrcArgToV4I32(MI, B, 0);
  Observer.changedInstr(MI);
  return true;
}

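// Trap lowering is dispatched on the trap handler ABI: without an enabled
// AMDHSA trap handler the trap degenerates to endpgm; otherwise s_trap is
// used, passing the queue pointer explicitly when the doorbell ID cannot be
// queried directly.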
// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  return ST.supportsGetDoorbellID() ?
      legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}

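// Endpgm-based trap lowering. If the trap already sits at the end of a block
// with no successors it becomes a plain s_endpgm; otherwise the block is
// split and execution branches to a dedicated s_endpgm block so terminator
// placement and successor phis are preserved.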
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &BB = B.getMBB();
  MachineFunction *MF = BB.getParent();

  if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
    BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
        .addImm(0);
    MI.eraseFromParent();
    return true;
  }

  // We need a block split to make the real endpgm a terminator. We also don't
  // want to break phis in successor blocks, so we can't just delete to the
  // end of the block.
  BB.splitAt(MI, false /*UpdateLiveIns*/);
  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  MF->push_back(TrapBB);
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
      .addImm(0);
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(TrapBB);

  BB.addSuccessor(TrapBB);
  MI.eraseFromParent();
  return true;
}

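// HSA trap lowering for subtargets that cannot read the doorbell ID directly:
// materialize the queue pointer (from the implicit kernarg segment on code
// object v5+, or from the preloaded QUEUE_PTR argument otherwise), place it
// in SGPR0_SGPR1, and issue s_trap with the HSA trap ID.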
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));

    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

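// HSA trap lowering when the doorbell ID is available: a bare s_trap with the
// HSA trap ID, or a simulated trap sequence on subtargets where s_trap is
// treated as a nop under PRIV=1.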
bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          MachineIRBuilder &B) const {
  // We need to simulate the 's_trap 2' instruction on targets that run in
  // PRIV=1 (where it is treated as a nop).
  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
                                           MI.getDebugLoc());
    MI.eraseFromParent();
    return true;
  }

  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

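// Lower llvm.debugtrap. Without an enabled AMDHSA trap handler this only
// emits a diagnostic warning and deletes the instruction; otherwise it emits
// s_trap with the debug trap ID.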
bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Is non-HSA path or trap-handler disabled? Then, report a warning
  // accordingly
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

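// Lower llvm.amdgcn.image.bvh.intersect.ray to G_AMDGPU_INTRIN_BVH_INTERSECT_RAY.
// The ray operands are repacked to match the selected MIMG encoding: NSA forms
// on gfx11+ take packed v3s32 lanes (with a16 ray directions interleaved into
// v2s16 pairs), while non-NSA forms take a single merged vector operand.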
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

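// Lower llvm.stacksave: emit G_AMDGPU_WAVE_ADDRESS of the target stack
// pointer register as the saved value.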
bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
                                            MachineIRBuilder &B) const {
  const SITargetLowering *TLI = ST.getTargetLowering();
  Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
  Register DstReg = MI.getOperand(0).getReg();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
  return true;
}

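// Lower llvm.amdgcn.wave.id. Only implemented for subtargets with architected
// SGPRs, where the wave ID within the workgroup is extracted from TTMP8[29:25].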
bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
                                         MachineIRBuilder &B) const {
  // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
  if (!ST.hasArchitectedSGPRs())
    return false;
  LLT S32 = LLT::scalar(32);
  Register DstReg = MI.getOperand(0).getReg();
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
  return true;
}

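// Hardware register fields used by the FP environment lowering below: the low
// 23 bits of the MODE register and the low 5 bits of TRAPSTS, read and written
// via llvm.amdgcn.s.getreg / llvm.amdgcn.s.setreg and packed into the s64
// fpenv value as {mode, trapsts}.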
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);

bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto ModeReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvModeBitField);
  auto TrapReg =
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                       /*HasSideEffects=*/true, /*isConvergent=*/false)
          .addImm(FPEnvTrapBitField);
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();
  return true;
}

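// Lower llvm.set.fpenv for s64 operands: split the value back into the mode
// and trap words and write them with llvm.amdgcn.s.setreg, mirroring the
// llvm.get.fpenv lowering above.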
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register Src = MI.getOperand(0).getReg();
  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvModeBitField))
      .addReg(Unmerge.getReg(0));
  B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                   /*HasSideEffects=*/true, /*isConvergent=*/false)
      .addImm(static_cast<int16_t>(FPEnvTrapBitField))
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  return true;
}

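// Main intrinsic legalization dispatch. The structured control flow intrinsics
// (amdgcn.if/else/loop) are rewritten into SI_IF/SI_ELSE/SI_LOOP pseudos
// against the G_BRCOND that consumes them; most other intrinsics forward to
// the dedicated legalize* helpers above, and image intrinsics are routed
// through their ImageDimIntrinsicInfo table entry.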
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return legalizeWaveID(MI, B);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  // TODO: Could insert G_ASSERT_ZEXT from s16
  case Intrinsic::r600_read_local_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, Helper, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, Helper, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
    return legalizeBufferLoad(MI, Helper, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, Helper, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, Helper, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    Register Index = MI.getOperand(5).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT S32 = LLT::scalar(32);
    if (MRI.getType(Index) != S32)
      MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
    return true;
  }
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to workaround the inability of tablegen match combiners to
    // match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);