llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
37 #define DEBUG_TYPE "amdgpu-legalinfo"
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
53 static constexpr unsigned MaxRegisterSize = 1024;
55 // Round the number of elements to the next power of two elements
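// (e.g. <3 x s16> becomes <4 x s16>, and <5 x s32> becomes <8 x s32>).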
56 static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
62 // Round the number of bits to the next power of two bits
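// (e.g. s48 becomes s64, and s96 becomes s128).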
63 static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
69 /// \returns true if this is an odd sized vector which should widen by adding an
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71 /// excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
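// Reduce the number of elements so each resulting piece fits in 64 bits; the
// mutation returns the per-piece type, e.g. <6 x s32> (192 bits) becomes
// <2 x s32> pieces.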
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
122 // Increase the number of vector elements to reach the next multiple of 32-bit
123 // type.
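// e.g. <3 x s16> (48 bits) is widened to <4 x s16> (64 bits).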
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
133 assert(EltSize < 32);
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
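// Buffer resource pointers (address space 8) are 128 bits wide; the helpers
// below model them either as s128 scalars or as <4 x s32> register values.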
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
176 static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
226 static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
230 static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
235 static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
250 return true;
253 // Any combination of 32 or 64-bit elements up to the maximum register size, and
254 // multiples of v2s16.
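// e.g. s32, s64, v2s16, v4s16 and v2s32 qualify; v3s16 (48 bits) does not.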
255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
282 static const LLT S1 = LLT::scalar(1);
283 static const LLT S8 = LLT::scalar(8);
284 static const LLT S16 = LLT::scalar(16);
285 static const LLT S32 = LLT::scalar(32);
286 static const LLT F32 = LLT::float32();
287 static const LLT S64 = LLT::scalar(64);
288 static const LLT F64 = LLT::float64();
289 static const LLT S96 = LLT::scalar(96);
290 static const LLT S128 = LLT::scalar(128);
291 static const LLT S160 = LLT::scalar(160);
292 static const LLT S224 = LLT::scalar(224);
293 static const LLT S256 = LLT::scalar(256);
294 static const LLT S512 = LLT::scalar(512);
295 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
297 static const LLT V2S8 = LLT::fixed_vector(2, 8);
298 static const LLT V2S16 = LLT::fixed_vector(2, 16);
299 static const LLT V4S16 = LLT::fixed_vector(4, 16);
300 static const LLT V6S16 = LLT::fixed_vector(6, 16);
301 static const LLT V8S16 = LLT::fixed_vector(8, 16);
302 static const LLT V10S16 = LLT::fixed_vector(10, 16);
303 static const LLT V12S16 = LLT::fixed_vector(12, 16);
304 static const LLT V16S16 = LLT::fixed_vector(16, 16);
306 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307 static const LLT V2BF16 = V2F16; // FIXME
309 static const LLT V2S32 = LLT::fixed_vector(2, 32);
310 static const LLT V3S32 = LLT::fixed_vector(3, 32);
311 static const LLT V4S32 = LLT::fixed_vector(4, 32);
312 static const LLT V5S32 = LLT::fixed_vector(5, 32);
313 static const LLT V6S32 = LLT::fixed_vector(6, 32);
314 static const LLT V7S32 = LLT::fixed_vector(7, 32);
315 static const LLT V8S32 = LLT::fixed_vector(8, 32);
316 static const LLT V9S32 = LLT::fixed_vector(9, 32);
317 static const LLT V10S32 = LLT::fixed_vector(10, 32);
318 static const LLT V11S32 = LLT::fixed_vector(11, 32);
319 static const LLT V12S32 = LLT::fixed_vector(12, 32);
320 static const LLT V16S32 = LLT::fixed_vector(16, 32);
321 static const LLT V32S32 = LLT::fixed_vector(32, 32);
323 static const LLT V2S64 = LLT::fixed_vector(2, 64);
324 static const LLT V3S64 = LLT::fixed_vector(3, 64);
325 static const LLT V4S64 = LLT::fixed_vector(4, 64);
326 static const LLT V5S64 = LLT::fixed_vector(5, 64);
327 static const LLT V6S64 = LLT::fixed_vector(6, 64);
328 static const LLT V7S64 = LLT::fixed_vector(7, 64);
329 static const LLT V8S64 = LLT::fixed_vector(8, 64);
330 static const LLT V16S64 = LLT::fixed_vector(16, 64);
332 static const LLT V2S128 = LLT::fixed_vector(2, 128);
333 static const LLT V4S128 = LLT::fixed_vector(4, 128);
335 static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
338 static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
341 static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
345 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
348 // Checks whether a type is in the list of legal register types.
349 static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
353 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
357 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
363 // If we have a truncating store or an extending load with a data size larger
364 // than 32-bits, we need to reduce to a 32-bit type.
365 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374 // handle some operations by just promoting the register during
375 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
376 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
403 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425 #if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429 #endif
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
459 assert(RegSize >= MemSize);
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
468 return true;
471 // The newer buffer intrinsic forms take their resource arguments as
472 // pointers in address space 8, aka s128 values. However, in order to not break
473 // SelectionDAG, the underlying operations have to continue to take v4i32
474 // arguments. Therefore, we convert resource pointers (or vectors of them)
475 // to integer values here.
476 static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
483 return false;
486 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
487 // work around this. Eventually it should ignore the type for loads and only care
488 // about the size. Return true in cases where we will work around this for now by
489 // bitcasting.
490 static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
503 if (Ty.isPointerVector())
504 return true;
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
510 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
516 /// Return true if a load or store of the type should be lowered with a bitcast
517 /// to a different type.
518 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(Ty.getElementType());
534 /// Return true if we should legalize a load by widening an odd sized memory
535 /// access up to the alignment. Note this is the case where the memory access
536 /// itself changes, not the size of the result register.
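/// e.g. an s24 load from a 32-bit-aligned address can be widened to a full
/// 32-bit load, provided the target reports the resulting access as fast.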
537 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 RoundedSize, AddrSpace, Align(AlignInBits / 8),
567 MachineMemOperand::MOLoad, &Fast) &&
568 Fast;
571 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
581 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
582 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
584 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
588 const LLT PointerTy = MRI.getType(MO.getReg());
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(PointerTy))
592 return PointerTy;
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
617 return VectorTy;
620 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621 /// the form in which the value must be in order to be passed to the low-level
622 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623 /// needed in order to account for the fact that we can't define a register
624 /// class for s128 without breaking SelectionDAG.
625 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
644 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
655 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
676 const LLT CodePtr = FlatPtr;
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
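// Within each getActionDefinitionsBuilder() chain below, rules are tried in
// order and the first rule matching a query determines the action.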
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(G_PHI)
708 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709 .legalFor(AllS32Vectors)
710 .legalFor(AllS64Vectors)
711 .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
716 .widenScalarToNextPow2(0, 32)
717 .clampMaxNumElements(0, S32, 16)
718 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719 .scalarize(0);
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
729 .widenScalarToNextMultipleOf(0, 32)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
737 .widenScalarToNextMultipleOf(0, 32)
738 .maxScalar(0, S32);
741 if (ST.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(G_MUL)
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
747 .widenScalarToNextMultipleOf(0, 32)
748 .custom();
749 } else {
750 getActionDefinitionsBuilder(G_MUL)
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
755 .widenScalarToNextMultipleOf(0, 32)
756 .custom();
758 assert(ST.hasMad64_32());
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .widenScalarToNextPow2(0, 32)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
771 .widenScalarToNextMultipleOf(0, 32)
772 .maxScalar(0, S32)
773 .scalarize(0);
775 getActionDefinitionsBuilder(G_MUL)
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
789 .widenScalarToNextPow2(0, 16)
790 .lower();
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
838 getActionDefinitionsBuilder(
839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
842 .widenScalarToNextPow2(0, 32)
843 .scalarize(0);
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
855 Mulh
856 .scalarize(0)
857 .lower();
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
864 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
865 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
866 .widenScalarToNextPow2(0)
867 .scalarize(0);
869 getActionDefinitionsBuilder(
870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
875 getActionDefinitionsBuilder(G_BITCAST)
876 // Don't worry about the size constraint.
877 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
878 .lower();
880 getActionDefinitionsBuilder(G_CONSTANT)
881 .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
885 .widenScalarToNextPow2(0);
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897 .clampScalarOrElt(0, S32, MaxScalar)
898 .widenScalarToNextPow2(0, 32)
899 .clampMaxNumElements(0, S32, 16);
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
949 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
950 .clampMaxNumElements(0, S16, 2)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
980 .clampMaxNumElementsStrict(0, S16, 2)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
984 if (ST.has16BitInsts()) {
985 getActionDefinitionsBuilder(G_FSQRT)
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
990 getActionDefinitionsBuilder(G_FFLOOR)
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1002 getActionDefinitionsBuilder(G_FFREXP)
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1007 getActionDefinitionsBuilder(G_FSQRT)
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1013 if (ST.hasFractBug()) {
1014 getActionDefinitionsBuilder(G_FFLOOR)
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1020 getActionDefinitionsBuilder(G_FFLOOR)
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1033 getActionDefinitionsBuilder(G_FFREXP)
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1041 getActionDefinitionsBuilder(G_FPTRUNC)
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1046 getActionDefinitionsBuilder(G_FPEXT)
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1088 FRem.scalarize(0);
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(G_TRUNC)
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1098 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1099 .alwaysLegal();
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1118 .widenScalarToNextPow2(1);
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1134 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1135 .customFor({S16, S32})
1136 .scalarize(0)
1137 .lower();
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141 .scalarize(0)
1142 .lower();
1144 if (ST.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147 .legalFor({S16, S32, S64})
1148 .clampScalar(0, S16, S64)
1149 .scalarize(0);
1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151 getActionDefinitionsBuilder(
1152 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153 .legalFor({S32, S64})
1154 .clampScalar(0, S32, S64)
1155 .scalarize(0);
1156 } else {
1157 getActionDefinitionsBuilder(
1158 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor({S32})
1160 .customFor({S64})
1161 .clampScalar(0, S32, S64)
1162 .scalarize(0);
1165 getActionDefinitionsBuilder(G_PTR_ADD)
1166 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167 .legalIf(all(isPointer(0), sameSize(0, 1)))
1168 .scalarize(0)
1169 .scalarSameSizeAs(1, 0);
1171 getActionDefinitionsBuilder(G_PTRMASK)
1172 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1173 .scalarSameSizeAs(1, 0)
1174 .scalarize(0);
1176 auto &CmpBuilder =
1177 getActionDefinitionsBuilder(G_ICMP)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187 // bank.
1188 .legalForCartesianProduct(
1189 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190 .legalForCartesianProduct(
1191 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192 if (ST.has16BitInsts()) {
1193 CmpBuilder.legalFor({{S1, S16}});
1196 CmpBuilder
1197 .widenScalarToNextPow2(1)
1198 .clampScalar(1, S32, S64)
1199 .scalarize(0)
1200 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1202 auto &FCmpBuilder =
1203 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1204 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1206 if (ST.hasSALUFloatInsts())
1207 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1209 FCmpBuilder
1210 .widenScalarToNextPow2(1)
1211 .clampScalar(1, S32, S64)
1212 .scalarize(0);
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1216 if (ST.has16BitInsts())
1217 ExpOps.customFor({{S32}, {S16}});
1218 else
1219 ExpOps.customFor({S32});
1220 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1221 .scalarize(0);
1223 getActionDefinitionsBuilder(G_FPOWI)
1224 .clampScalar(0, MinScalarFPTy, S32)
1225 .lower();
1227 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1228 Log2Ops.customFor({S32});
1229 if (ST.has16BitInsts())
1230 Log2Ops.legalFor({S16});
1231 else
1232 Log2Ops.customFor({S16});
1233 Log2Ops.scalarize(0)
1234 .lower();
1236 auto &LogOps =
1237 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238 LogOps.customFor({S32, S16});
1239 LogOps.clampScalar(0, MinScalarFPTy, S32)
1240 .scalarize(0);
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(G_CTPOP)
1244 .legalFor({{S32, S32}, {S32, S64}})
1245 .clampScalar(0, S32, S32)
1246 .widenScalarToNextPow2(1, 32)
1247 .clampScalar(1, S32, S64)
1248 .scalarize(0)
1249 .widenScalarToNextPow2(0, 32);
1251 // If no 16 bit instr is available, lower into different instructions.
1252 if (ST.has16BitInsts())
1253 getActionDefinitionsBuilder(G_IS_FPCLASS)
1254 .legalForCartesianProduct({S1}, FPTypes16)
1255 .widenScalarToNextPow2(1)
1256 .scalarize(0)
1257 .lower();
1258 else
1259 getActionDefinitionsBuilder(G_IS_FPCLASS)
1260 .legalForCartesianProduct({S1}, FPTypesBase)
1261 .lowerFor({S1, S16})
1262 .widenScalarToNextPow2(1)
1263 .scalarize(0)
1264 .lower();
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1268 // bitwidth.
1269 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1270 .scalarize(0)
1271 .clampScalar(0, S32, S32)
1272 .clampScalar(1, S32, S64)
1273 .widenScalarToNextPow2(0, 32)
1274 .widenScalarToNextPow2(1, 32)
1275 .custom();
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1279 .legalFor({{S32, S32}, {S32, S64}})
1280 .customIf(scalarNarrowerThan(1, 32))
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .scalarize(0)
1284 .widenScalarToNextPow2(0, 32)
1285 .widenScalarToNextPow2(1, 32);
1287 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1288 .legalFor({{S32, S32}, {S32, S64}})
1289 .clampScalar(0, S32, S32)
1290 .clampScalar(1, S32, S64)
1291 .scalarize(0)
1292 .widenScalarToNextPow2(0, 32)
1293 .widenScalarToNextPow2(1, 32);
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296 // RegBankSelect.
1297 getActionDefinitionsBuilder(G_BITREVERSE)
1298 .legalFor({S32, S64})
1299 .clampScalar(0, S32, S64)
1300 .scalarize(0)
1301 .widenScalarToNextPow2(0);
1303 if (ST.has16BitInsts()) {
1304 getActionDefinitionsBuilder(G_BSWAP)
1305 .legalFor({S16, S32, V2S16})
1306 .clampMaxNumElementsStrict(0, S16, 2)
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1308 // narrowScalar limitation.
1309 .widenScalarToNextPow2(0)
1310 .clampScalar(0, S16, S32)
1311 .scalarize(0);
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315 .legalFor({S32, S16, V2S16})
1316 .clampMaxNumElements(0, S16, 2)
1317 .minScalar(0, S16)
1318 .widenScalarToNextPow2(0)
1319 .scalarize(0)
1320 .lower();
1321 } else {
1322 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323 .legalFor({S32, S16})
1324 .widenScalarToNextPow2(0)
1325 .minScalar(0, S16)
1326 .scalarize(0)
1327 .lower();
1329 } else {
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(G_BSWAP)
1332 .legalFor({S32})
1333 .lowerIf(scalarNarrowerThan(0, 32))
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1335 // narrowScalar limitation.
1336 .widenScalarToNextPow2(0)
1337 .maxScalar(0, S32)
1338 .scalarize(0)
1339 .lower();
1341 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342 .legalFor({S32})
1343 .minScalar(0, S32)
1344 .widenScalarToNextPow2(0)
1345 .scalarize(0)
1346 .lower();
1349 getActionDefinitionsBuilder(G_INTTOPTR)
1350 // List the common cases
1351 .legalForCartesianProduct(AddrSpaces64, {S64})
1352 .legalForCartesianProduct(AddrSpaces32, {S32})
1353 .scalarize(0)
1354 // Accept any address space as long as the size matches
1355 .legalIf(sameSize(0, 1))
1356 .widenScalarIf(smallerThan(1, 0),
1357 [](const LegalityQuery &Query) {
1358 return std::pair(
1359 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1361 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1362 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1365 getActionDefinitionsBuilder(G_PTRTOINT)
1366 // List the common cases
1367 .legalForCartesianProduct(AddrSpaces64, {S64})
1368 .legalForCartesianProduct(AddrSpaces32, {S32})
1369 .scalarize(0)
1370 // Accept any address space as long as the size matches
1371 .legalIf(sameSize(0, 1))
1372 .widenScalarIf(smallerThan(0, 1),
1373 [](const LegalityQuery &Query) {
1374 return std::pair(
1375 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1377 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1378 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1381 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1382 .scalarize(0)
1383 .custom();
1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386 bool IsLoad) -> bool {
1387 const LLT DstTy = Query.Types[0];
1389 // Split vector extloads.
1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393 return true;
1395 const LLT PtrTy = Query.Types[1];
1396 unsigned AS = PtrTy.getAddressSpace();
1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398 Query.MMODescrs[0].Ordering !=
1399 AtomicOrdering::NotAtomic))
1400 return true;
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs = (MemSize + 31) / 32;
1405 if (NumRegs == 3) {
1406 if (!ST.hasDwordx3LoadStores())
1407 return true;
1408 } else {
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(NumRegs))
1411 return true;
1414 return false;
1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
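// An alignment of 0 in the memory-descriptor lists below means any alignment
// is accepted.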
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422 // LDS
1423 // TODO: Unsupported flat for SI.
1425 for (unsigned Op : {G_LOAD, G_STORE}) {
1426 const bool IsStore = Op == G_STORE;
1428 auto &Actions = getActionDefinitionsBuilder(Op);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
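// Each entry is {result type, pointer type, memory type, minimum alignment in
// bits}.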
1431 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1432 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1433 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1434 {S64, GlobalPtr, S64, GlobalAlign32},
1435 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1436 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1437 {S32, GlobalPtr, S8, GlobalAlign8},
1438 {S32, GlobalPtr, S16, GlobalAlign16},
1440 {S32, LocalPtr, S32, 32},
1441 {S64, LocalPtr, S64, 32},
1442 {V2S32, LocalPtr, V2S32, 32},
1443 {S32, LocalPtr, S8, 8},
1444 {S32, LocalPtr, S16, 16},
1445 {V2S16, LocalPtr, S32, 32},
1447 {S32, PrivatePtr, S32, 32},
1448 {S32, PrivatePtr, S8, 8},
1449 {S32, PrivatePtr, S16, 16},
1450 {V2S16, PrivatePtr, S32, 32},
1452 {S32, ConstantPtr, S32, GlobalAlign32},
1453 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1454 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1455 {S64, ConstantPtr, S64, GlobalAlign32},
1456 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1457 Actions.legalIf(
1458 [=](const LegalityQuery &Query) -> bool {
1459 return isLoadStoreLegal(ST, Query);
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions.unsupportedIf(
1466 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1472 return hasBufferRsrcWorkaround(Query.Types[0]);
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions.customIf(typeIs(1, Constant32Ptr));
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1484 // parts anyway.
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1488 Actions.bitcastIf(
1489 [=](const LegalityQuery &Query) -> bool {
1490 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1491 Query.MMODescrs[0].MemoryTy);
1492 }, bitcastToRegisterType(0));
1494 if (!IsStore) {
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1498 return shouldWidenLoad(ST, Query, G_LOAD);
1502 // FIXME: load/store narrowing should be moved to lower action
1503 Actions
1504 .narrowScalarIf(
1505 [=](const LegalityQuery &Query) -> bool {
1506 return !Query.Types[0].isVector() &&
1507 needToSplitMemOp(Query, Op == G_LOAD);
1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510 const LLT DstTy = Query.Types[0];
1511 const LLT PtrTy = Query.Types[1];
1513 const unsigned DstSize = DstTy.getSizeInBits();
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1516 // Split extloads.
1517 if (DstSize > MemSize)
1518 return std::pair(0, LLT::scalar(MemSize));
1520 unsigned MaxSize = maxSizeForAddrSpace(
1521 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1522 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523 if (MemSize > MaxSize)
1524 return std::pair(0, LLT::scalar(MaxSize));
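// Otherwise narrow the access down to the known alignment, in bits.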
1526 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527 return std::pair(0, LLT::scalar(Align));
1529 .fewerElementsIf(
1530 [=](const LegalityQuery &Query) -> bool {
1531 return Query.Types[0].isVector() &&
1532 needToSplitMemOp(Query, Op == G_LOAD);
1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535 const LLT DstTy = Query.Types[0];
1536 const LLT PtrTy = Query.Types[1];
1538 LLT EltTy = DstTy.getElementType();
1539 unsigned MaxSize = maxSizeForAddrSpace(
1540 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1541 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1543 // FIXME: Handle widened to power of 2 results better. This ends
1544 // up scalarizing.
1545 // FIXME: 3 element stores scalarized on SI
1547 // Split if it's too large for the address space.
1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549 if (MemSize > MaxSize) {
1550 unsigned NumElts = DstTy.getNumElements();
1551 unsigned EltSize = EltTy.getSizeInBits();
1553 if (MaxSize % EltSize == 0) {
1554 return std::pair(
1555 0, LLT::scalarOrVector(
1556 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1559 unsigned NumPieces = MemSize / MaxSize;
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces == 1 || NumPieces >= NumElts ||
1564 NumElts % NumPieces != 0)
1565 return std::pair(0, EltTy);
1567 return std::pair(0,
1568 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy.getSizeInBits() > MemSize)
1573 return std::pair(0, EltTy);
1575 unsigned EltSize = EltTy.getSizeInBits();
1576 unsigned DstSize = DstTy.getSizeInBits();
1577 if (!isPowerOf2_32(DstSize)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize = llvm::bit_floor(DstSize);
1582 return std::pair(
1583 0, LLT::scalarOrVector(
1584 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy);
1590 .minScalar(0, S32)
1591 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1592 .widenScalarToNextPow2(0)
1593 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1594 .lower();
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1599 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1600 {S32, GlobalPtr, S16, 2 * 8},
1601 {S32, LocalPtr, S8, 8},
1602 {S32, LocalPtr, S16, 16},
1603 {S32, PrivatePtr, S8, 8},
1604 {S32, PrivatePtr, S16, 16},
1605 {S32, ConstantPtr, S8, 8},
1606 {S32, ConstantPtr, S16, 2 * 8}})
1607 .legalIf(
1608 [=](const LegalityQuery &Query) -> bool {
1609 return isLoadStoreLegal(ST, Query);
1612 if (ST.hasFlatAddressSpace()) {
1613 ExtLoads.legalForTypesWithMemDesc(
1614 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1624 ExtLoads.clampScalar(0, S32, S32)
1625 .widenScalarToNextPow2(0)
1626 .lower();
1628 auto &Atomics = getActionDefinitionsBuilder(
1629 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1634 {S64, GlobalPtr}, {S64, LocalPtr},
1635 {S32, RegionPtr}, {S64, RegionPtr}});
1636 if (ST.hasFlatAddressSpace()) {
1637 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1642 if (ST.hasLDSFPAtomicAddF32()) {
1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1644 if (ST.hasLdsAtomicAddF64())
1645 Atomic.legalFor({{S64, LocalPtr}});
1646 if (ST.hasAtomicDsPkAdd16Insts())
1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1649 if (ST.hasAtomicFaddInsts())
1650 Atomic.legalFor({{S32, GlobalPtr}});
1651 if (ST.hasFlatAtomicFaddF32Inst())
1652 Atomic.legalFor({{S32, FlatPtr}});
1654 if (ST.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656 // the IR in most situations
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor({
1659 {S32, GlobalPtr},
1660 {S64, GlobalPtr},
1661 {S64, FlatPtr}
1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668 if (ST.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic.legalFor({{V2BF16, GlobalPtr}});
1670 if (ST.hasAtomicFlatPkAdd16Insts())
1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1674 // Most of the legalization work here is done by AtomicExpand. We could
1675 // probably use a simpler legality rule that just assumes anything is OK.
1676 auto &AtomicFMinFMax =
1677 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1680 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1682 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685 AtomicFMinFMax.legalFor({F32, FlatPtr});
1686 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687 AtomicFMinFMax.legalFor({F64, FlatPtr});
1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1690 // demarshalling.
1691 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1692 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1693 {S32, FlatPtr}, {S64, FlatPtr}})
1694 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1695 {S32, RegionPtr}, {S64, RegionPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1698 // Condition should be s32 for scalar, s1 for vector.
1699 getActionDefinitionsBuilder(G_SELECT)
1700 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701 LocalPtr, FlatPtr, PrivatePtr,
1702 LLT::fixed_vector(2, LocalPtr),
1703 LLT::fixed_vector(2, PrivatePtr)},
1704 {S1, S32})
1705 .clampScalar(0, S16, S64)
1706 .scalarize(1)
1707 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1708 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1709 .clampMaxNumElements(0, S32, 2)
1710 .clampMaxNumElements(0, LocalPtr, 2)
1711 .clampMaxNumElements(0, PrivatePtr, 2)
1712 .scalarize(0)
1713 .widenScalarToNextPow2(0)
1714 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717 // be more flexible with the shift amount type.
1718 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1719 .legalFor({{S32, S32}, {S64, S32}});
1720 if (ST.has16BitInsts()) {
1721 if (ST.hasVOP3PInsts()) {
1722 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1723 .clampMaxNumElements(0, S16, 2);
1724 } else
1725 Shifts.legalFor({{S16, S16}});
1727 // TODO: Support 16-bit shift amounts for all types
1728 Shifts.widenScalarIf(
1729 [=](const LegalityQuery &Query) {
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1732 const LLT ValTy = Query.Types[0];
1733 const LLT AmountTy = Query.Types[1];
1734 return ValTy.getSizeInBits() <= 16 &&
1735 AmountTy.getSizeInBits() < 16;
1736 }, changeTo(1, S16));
1737 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1738 Shifts.clampScalar(1, S32, S32);
1739 Shifts.widenScalarToNextPow2(0, 16);
1740 Shifts.clampScalar(0, S16, S64);
1742 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1743 .minScalar(0, S16)
1744 .scalarize(0)
1745 .lower();
1746 } else {
1747 // Make sure we legalize the shift amount type first, as the general
1748 // expansion for the shifted type will produce much worse code if it hasn't
1749 // been truncated already.
1750 Shifts.clampScalar(1, S32, S32);
1751 Shifts.widenScalarToNextPow2(0, 32);
1752 Shifts.clampScalar(0, S32, S64);
1754 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755 .minScalar(0, S32)
1756 .scalarize(0)
1757 .lower();
1759 Shifts.scalarize(0);
1761 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1762 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1763 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1764 unsigned IdxTypeIdx = 2;
1766 getActionDefinitionsBuilder(Op)
1767 .customIf([=](const LegalityQuery &Query) {
1768 const LLT EltTy = Query.Types[EltTypeIdx];
1769 const LLT VecTy = Query.Types[VecTypeIdx];
1770 const LLT IdxTy = Query.Types[IdxTypeIdx];
1771 const unsigned EltSize = EltTy.getSizeInBits();
1772 const bool isLegalVecType =
1773 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1775 // below will try to bitcast them to 2N x s64, which will fail.
1776 // Therefore, as an intermediate step, wrap the extract/insert by
1777 // ptrtoint-ing the vector and scalar arguments (and inttoptr-ing the
1778 // extraction result) in order to produce a vector operation that can
1779 // be handled by the logic below.
1780 if (EltTy.isPointer() && EltSize > 64)
1781 return true;
1782 return (EltSize == 32 || EltSize == 64) &&
1783 VecTy.getSizeInBits() % 32 == 0 &&
1784 VecTy.getSizeInBits() <= MaxRegisterSize &&
1785 IdxTy.getSizeInBits() == 32 &&
1786 isLegalVecType;
1788 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1789 bitcastToVectorElement32(VecTypeIdx))
1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791 .bitcastIf(
1792 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1793 [=](const LegalityQuery &Query) {
1794 // For > 64-bit element types, try to turn this into a 64-bit
1795 // element vector since we may be able to do better indexing
1796 // if this is scalar. If not, fall back to 32.
1797 const LLT EltTy = Query.Types[EltTypeIdx];
1798 const LLT VecTy = Query.Types[VecTypeIdx];
1799 const unsigned DstEltSize = EltTy.getSizeInBits();
1800 const unsigned VecSize = VecTy.getSizeInBits();
1802 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803 return std::pair(
1804 VecTypeIdx,
1805 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1807 .clampScalar(EltTypeIdx, S32, S64)
1808 .clampScalar(VecTypeIdx, S32, S64)
1809 .clampScalar(IdxTypeIdx, S32, S32)
1810 .clampMaxNumElements(VecTypeIdx, S32, 32)
1811 // TODO: Clamp elements for 64-bit vectors?
1812 .moreElementsIf(
1813 isIllegalRegisterType(VecTypeIdx),
1814 moreElementsToNextExistingRegClass(VecTypeIdx))
1815 // It should only be necessary with variable indexes.
1816 // As a last resort, lower to the stack
1817 .lower();
1820 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1821 .unsupportedIf([=](const LegalityQuery &Query) {
1822 const LLT &EltTy = Query.Types[1].getElementType();
1823 return Query.Types[0] != EltTy;
1826 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1827 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1828 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1830 // FIXME: Doesn't handle extract of illegal sizes.
1831 getActionDefinitionsBuilder(Op)
1832 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1833 .lowerIf([=](const LegalityQuery &Query) {
1834 // Sub-vector (or single-element) insert and extract.
1835 // TODO: verify immediate offset here since lower only works with
1836 // whole elements.
1837 const LLT BigTy = Query.Types[BigTyIdx];
1838 return BigTy.isVector();
1840 // FIXME: Multiples of 16 should not be legal.
1841 .legalIf([=](const LegalityQuery &Query) {
1842 const LLT BigTy = Query.Types[BigTyIdx];
1843 const LLT LitTy = Query.Types[LitTyIdx];
1844 return (BigTy.getSizeInBits() % 32 == 0) &&
1845 (LitTy.getSizeInBits() % 16 == 0);
1847 .widenScalarIf(
1848 [=](const LegalityQuery &Query) {
1849 const LLT BigTy = Query.Types[BigTyIdx];
1850 return (BigTy.getScalarSizeInBits() < 16);
1852 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1853 .widenScalarIf(
1854 [=](const LegalityQuery &Query) {
1855 const LLT LitTy = Query.Types[LitTyIdx];
1856 return (LitTy.getScalarSizeInBits() < 16);
1858 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860 .widenScalarToNextPow2(BigTyIdx, 32);
1864 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1865 .legalForCartesianProduct(AllS32Vectors, {S32})
1866 .legalForCartesianProduct(AllS64Vectors, {S64})
1867 .clampNumElements(0, V16S32, V32S32)
1868 .clampNumElements(0, V2S64, V16S64)
1869 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1870 .moreElementsIf(
1871 isIllegalRegisterType(0),
1872 moreElementsToNextExistingRegClass(0));
1874 if (ST.hasScalarPackInsts()) {
1875 BuildVector
1876 // FIXME: Should probably widen s1 vectors straight to s32
1877 .minScalarOrElt(0, S16)
1878 .minScalar(1, S16);
1880 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1881 .legalFor({V2S16, S32})
1882 .lower();
1883 } else {
1884 BuildVector.customFor({V2S16, S16});
1885 BuildVector.minScalarOrElt(0, S32);
1887 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1888 .customFor({V2S16, S32})
1889 .lower();
1892 BuildVector.legalIf(isRegisterType(0));
1894 // FIXME: Clamp maximum size
1895 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1896 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1897 .clampMaxNumElements(0, S32, 32)
1898 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1899 .clampMaxNumElements(0, S16, 64);
1901 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1903 // Merge/Unmerge
1904 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1905 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1906 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1908 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1909 const LLT Ty = Query.Types[TypeIdx];
1910 if (Ty.isVector()) {
1911 const LLT &EltTy = Ty.getElementType();
1912 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1913 return true;
1914 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1915 return true;
1917 return false;
1920 auto &Builder = getActionDefinitionsBuilder(Op)
1921 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1922 .lowerFor({{S16, V2S16}})
1923 .lowerIf([=](const LegalityQuery &Query) {
1924 const LLT BigTy = Query.Types[BigTyIdx];
1925 return BigTy.getSizeInBits() == 32;
1927 // Try to widen to s16 first for small types.
1928 // TODO: Only do this on targets with legal s16 shifts
1929 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1930 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1931 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1932 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1933 elementTypeIs(1, S16)),
1934 changeTo(1, V2S16))
1935 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1937 // valid.
1938 .clampScalar(LitTyIdx, S32, S512)
1939 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1940 // Break up vectors with weird elements into scalars
1941 .fewerElementsIf(
1942 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1943 scalarize(0))
1944 .fewerElementsIf(
1945 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1946 scalarize(1))
1947 .clampScalar(BigTyIdx, S32, MaxScalar);
1949 if (Op == G_MERGE_VALUES) {
1950 Builder.widenScalarIf(
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1952 [=](const LegalityQuery &Query) {
1953 const LLT Ty = Query.Types[LitTyIdx];
1954 return Ty.getSizeInBits() < 32;
1956 changeTo(LitTyIdx, S32));
1959 Builder.widenScalarIf(
1960 [=](const LegalityQuery &Query) {
1961 const LLT Ty = Query.Types[BigTyIdx];
1962 return Ty.getSizeInBits() % 16 != 0;
1964 [=](const LegalityQuery &Query) {
1965 // Pick the next power of 2, or a multiple of 64 over 128,
1966 // whichever is smaller.
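// For example, an s24 big type is widened to s32, while s136 is widened to
// s192 (the next multiple of 64) rather than all the way to s256.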
1967 const LLT &Ty = Query.Types[BigTyIdx];
1968 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1969 if (NewSizeInBits >= 256) {
1970 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1971 if (RoundedTo < NewSizeInBits)
1972 NewSizeInBits = RoundedTo;
1974 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1976 // Any vectors left are the wrong size. Scalarize them.
1977 .scalarize(0)
1978 .scalarize(1);
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1982 // RegBankSelect.
1983 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1984 .legalFor({{S32}, {S64}});
1986 if (ST.hasVOP3PInsts()) {
1987 SextInReg.lowerFor({{V2S16}})
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989 // get more vector shift opportunities, since we'll get those when
1990 // expanded.
1991 .clampMaxNumElementsStrict(0, S16, 2);
1992 } else if (ST.has16BitInsts()) {
1993 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1994 } else {
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
1996 // shifts. This avoids a lot of intermediate truncate and extend operations.
1997 SextInReg.lowerFor({{S32}, {S64}});
2000 SextInReg
2001 .scalarize(0)
2002 .clampScalar(0, S32, S64)
2003 .lower();
2005 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2006 .scalarize(0)
2007 .lower();
2009 // TODO: Only try to form v2s16 with legal packed instructions.
2010 getActionDefinitionsBuilder(G_FSHR)
2011 .legalFor({{S32, S32}})
2012 .lowerFor({{V2S16, V2S16}})
2013 .clampMaxNumElementsStrict(0, S16, 2)
2014 .scalarize(0)
2015 .lower();
2017 if (ST.hasVOP3PInsts()) {
2018 getActionDefinitionsBuilder(G_FSHL)
2019 .lowerFor({{V2S16, V2S16}})
2020 .clampMaxNumElementsStrict(0, S16, 2)
2021 .scalarize(0)
2022 .lower();
2023 } else {
2024 getActionDefinitionsBuilder(G_FSHL)
2025 .scalarize(0)
2026 .lower();
2029 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2030 .legalFor({S64});
2032 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2034 getActionDefinitionsBuilder(G_FENCE)
2035 .alwaysLegal();
2037 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2038 .scalarize(0)
2039 .minScalar(0, S32)
2040 .lower();
2042 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2043 .legalFor({{S32, S32}, {S64, S32}})
2044 .clampScalar(1, S32, S32)
2045 .clampScalar(0, S32, S64)
2046 .widenScalarToNextPow2(0)
2047 .scalarize(0);
2049 getActionDefinitionsBuilder(
2050 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2051 G_FCOPYSIGN,
2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2054 G_READ_REGISTER, G_WRITE_REGISTER,
2056 G_SADDO, G_SSUBO})
2057 .lower();
2059 if (ST.hasIEEEMinMax()) {
2060 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2061 .legalFor(FPTypesPK16)
2062 .clampMaxNumElements(0, S16, 2)
2063 .scalarize(0);
2064 } else {
2065 // TODO: Implement
2066 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2069 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070 .lower();
2072 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2074 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2075 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077 .unsupported();
2079 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2081 getLegacyLegalizerInfo().computeTables();
2082 verify(*ST.getInstrInfo());
2085 bool AMDGPULegalizerInfo::legalizeCustom(
2086 LegalizerHelper &Helper, MachineInstr &MI,
2087 LostDebugLocObserver &LocObserver) const {
2088 MachineIRBuilder &B = Helper.MIRBuilder;
2089 MachineRegisterInfo &MRI = *B.getMRI();
2091 switch (MI.getOpcode()) {
2092 case TargetOpcode::G_ADDRSPACE_CAST:
2093 return legalizeAddrSpaceCast(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2095 return legalizeFroundeven(MI, MRI, B);
2096 case TargetOpcode::G_FCEIL:
2097 return legalizeFceil(MI, MRI, B);
2098 case TargetOpcode::G_FREM:
2099 return legalizeFrem(MI, MRI, B);
2100 case TargetOpcode::G_INTRINSIC_TRUNC:
2101 return legalizeIntrinsicTrunc(MI, MRI, B);
2102 case TargetOpcode::G_SITOFP:
2103 return legalizeITOFP(MI, MRI, B, true);
2104 case TargetOpcode::G_UITOFP:
2105 return legalizeITOFP(MI, MRI, B, false);
2106 case TargetOpcode::G_FPTOSI:
2107 return legalizeFPTOI(MI, MRI, B, true);
2108 case TargetOpcode::G_FPTOUI:
2109 return legalizeFPTOI(MI, MRI, B, false);
2110 case TargetOpcode::G_FMINNUM:
2111 case TargetOpcode::G_FMAXNUM:
2112 case TargetOpcode::G_FMINNUM_IEEE:
2113 case TargetOpcode::G_FMAXNUM_IEEE:
2114 return legalizeMinNumMaxNum(Helper, MI);
2115 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2116 return legalizeExtractVectorElt(MI, MRI, B);
2117 case TargetOpcode::G_INSERT_VECTOR_ELT:
2118 return legalizeInsertVectorElt(MI, MRI, B);
2119 case TargetOpcode::G_FSIN:
2120 case TargetOpcode::G_FCOS:
2121 return legalizeSinCos(MI, MRI, B);
2122 case TargetOpcode::G_GLOBAL_VALUE:
2123 return legalizeGlobalValue(MI, MRI, B);
2124 case TargetOpcode::G_LOAD:
2125 case TargetOpcode::G_SEXTLOAD:
2126 case TargetOpcode::G_ZEXTLOAD:
2127 return legalizeLoad(Helper, MI);
2128 case TargetOpcode::G_STORE:
2129 return legalizeStore(Helper, MI);
2130 case TargetOpcode::G_FMAD:
2131 return legalizeFMad(MI, MRI, B);
2132 case TargetOpcode::G_FDIV:
2133 return legalizeFDIV(MI, MRI, B);
2134 case TargetOpcode::G_FFREXP:
2135 return legalizeFFREXP(MI, MRI, B);
2136 case TargetOpcode::G_FSQRT:
2137 return legalizeFSQRT(MI, MRI, B);
2138 case TargetOpcode::G_UDIV:
2139 case TargetOpcode::G_UREM:
2140 case TargetOpcode::G_UDIVREM:
2141 return legalizeUnsignedDIV_REM(MI, MRI, B);
2142 case TargetOpcode::G_SDIV:
2143 case TargetOpcode::G_SREM:
2144 case TargetOpcode::G_SDIVREM:
2145 return legalizeSignedDIV_REM(MI, MRI, B);
2146 case TargetOpcode::G_ATOMIC_CMPXCHG:
2147 return legalizeAtomicCmpXChg(MI, MRI, B);
2148 case TargetOpcode::G_FLOG2:
2149 return legalizeFlog2(MI, B);
2150 case TargetOpcode::G_FLOG:
2151 case TargetOpcode::G_FLOG10:
2152 return legalizeFlogCommon(MI, B);
2153 case TargetOpcode::G_FEXP2:
2154 return legalizeFExp2(MI, B);
2155 case TargetOpcode::G_FEXP:
2156 case TargetOpcode::G_FEXP10:
2157 return legalizeFExp(MI, B);
2158 case TargetOpcode::G_FPOW:
2159 return legalizeFPow(MI, B);
2160 case TargetOpcode::G_FFLOOR:
2161 return legalizeFFloor(MI, MRI, B);
2162 case TargetOpcode::G_BUILD_VECTOR:
2163 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2164 return legalizeBuildVector(MI, MRI, B);
2165 case TargetOpcode::G_MUL:
2166 return legalizeMul(Helper, MI);
2167 case TargetOpcode::G_CTLZ:
2168 case TargetOpcode::G_CTTZ:
2169 return legalizeCTLZ_CTTZ(MI, MRI, B);
2170 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2171 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2172 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2173 return legalizeFPTruncRound(MI, B);
2174 case TargetOpcode::G_STACKSAVE:
2175 return legalizeStackSave(MI, B);
2176 case TargetOpcode::G_GET_FPENV:
2177 return legalizeGetFPEnv(MI, MRI, B);
2178 case TargetOpcode::G_SET_FPENV:
2179 return legalizeSetFPEnv(MI, MRI, B);
2180 case TargetOpcode::G_TRAP:
2181 return legalizeTrap(MI, MRI, B);
2182 case TargetOpcode::G_DEBUGTRAP:
2183 return legalizeDebugTrap(MI, MRI, B);
2184 default:
2185 return false;
2188 llvm_unreachable("expected switch to return");
2191 Register AMDGPULegalizerInfo::getSegmentAperture(
2192 unsigned AS,
2193 MachineRegisterInfo &MRI,
2194 MachineIRBuilder &B) const {
2195 MachineFunction &MF = B.getMF();
2196 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2197 const LLT S32 = LLT::scalar(32);
2198 const LLT S64 = LLT::scalar(64);
2200 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2202 if (ST.hasApertureRegs()) {
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2204 // it only returns zeroes. The real value is in the upper 32 bits.
2205 // Thus, we must emit an extract of the high 32 bits.
2206 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2207 ? AMDGPU::SRC_SHARED_BASE
2208 : AMDGPU::SRC_PRIVATE_BASE;
2209 // FIXME: It would be more natural to emit a COPY here, but then copy
2210 // coalescing would kick in and it would think it's okay to use the "HI"
2211 // subregister (instead of extracting the HI 32 bits) which is an artificial
2212 // (unusable) register.
2213 // Register TableGen definitions would need an overhaul to get rid of the
2214 // artificial "HI" aperture registers and prevent this kind of issue from
2215 // happening.
2216 Register Dst = MRI.createGenericVirtualRegister(S64);
2217 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2218 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2219 return B.buildUnmerge(S32, Dst).getReg(1);
2222 // TODO: can we be smarter about machine pointer info?
2223 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2224 Register LoadAddr = MRI.createGenericVirtualRegister(
2225 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2226 // For code object version 5, private_base and shared_base are passed through
2227 // implicit kernargs.
2228 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2229 AMDGPU::AMDHSA_COV5) {
2230 AMDGPUTargetLowering::ImplicitParameter Param =
2231 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2232 : AMDGPUTargetLowering::PRIVATE_BASE;
2233 uint64_t Offset =
2234 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2236 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2237 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2239 if (!loadInputValue(KernargPtrReg, B,
2240 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2241 return Register();
2243 MachineMemOperand *MMO = MF.getMachineMemOperand(
2244 PtrInfo,
2245 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2246 MachineMemOperand::MOInvariant,
2247 LLT::scalar(32), commonAlignment(Align(64), Offset));
2249 // Pointer address
2250 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2251 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2252 // Load address
2253 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2256 Register QueuePtr = MRI.createGenericVirtualRegister(
2257 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2259 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2260 return Register();
2262 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2263 // private_segment_aperture_base_hi.
2264 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2266 MachineMemOperand *MMO = MF.getMachineMemOperand(
2267 PtrInfo,
2268 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2269 MachineMemOperand::MOInvariant,
2270 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2272 B.buildPtrAdd(LoadAddr, QueuePtr,
2273 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2274 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2277 /// Return true if the value is a known valid address, such that a null check is
2278 /// not necessary.
2279 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2280 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2281 MachineInstr *Def = MRI.getVRegDef(Val);
2282 switch (Def->getOpcode()) {
2283 case AMDGPU::G_FRAME_INDEX:
2284 case AMDGPU::G_GLOBAL_VALUE:
2285 case AMDGPU::G_BLOCK_ADDR:
2286 return true;
2287 case AMDGPU::G_CONSTANT: {
2288 const ConstantInt *CI = Def->getOperand(1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2291 default:
2292 return false;
2295 return false;
2298 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2299 MachineInstr &MI, MachineRegisterInfo &MRI,
2300 MachineIRBuilder &B) const {
2301 MachineFunction &MF = B.getMF();
2303 // MI can either be a G_ADDRSPACE_CAST or a
2304 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2305 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2306 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2307 Intrinsic::amdgcn_addrspacecast_nonnull));
2309 const LLT S32 = LLT::scalar(32);
2310 Register Dst = MI.getOperand(0).getReg();
2311 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2312 : MI.getOperand(1).getReg();
2313 LLT DstTy = MRI.getType(Dst);
2314 LLT SrcTy = MRI.getType(Src);
2315 unsigned DestAS = DstTy.getAddressSpace();
2316 unsigned SrcAS = SrcTy.getAddressSpace();
2318 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2319 // vector element.
2320 assert(!DstTy.isVector());
2322 const AMDGPUTargetMachine &TM
2323 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2325 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2326 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2327 return true;
2330 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2331 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2332 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2333 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2334 // G_ADDRSPACE_CAST we need to guess.
2335 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2336 // Extract low 32-bits of the pointer.
2337 B.buildExtract(Dst, Src, 0);
2338 MI.eraseFromParent();
2339 return true;
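// The source may still be null. A null flat pointer (0) must map to the
// segment's null value, which is not necessarily 0, so compare against flat
// null and select between the truncated pointer and the segment null.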
2342 unsigned NullVal = TM.getNullPointerValue(DestAS);
2344 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2345 auto FlatNull = B.buildConstant(SrcTy, 0);
2347 // Extract low 32-bits of the pointer.
2348 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2350 auto CmpRes =
2351 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2352 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2354 MI.eraseFromParent();
2355 return true;
2358 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2359 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2360 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2361 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2362 if (!ApertureReg.isValid())
2363 return false;
2365 // Coerce the type of the low half of the result so we can use merge_values.
2366 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2368 // TODO: Should we allow mismatched types but matching sizes in merges to
2369 // avoid the ptrtoint?
2370 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2372 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2373 // G_ADDRSPACE_CAST we need to guess.
2374 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2375 B.buildCopy(Dst, BuildPtr);
2376 MI.eraseFromParent();
2377 return true;
2380 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2381 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2383 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2384 SegmentNull.getReg(0));
2386 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2388 MI.eraseFromParent();
2389 return true;
2392 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2393 SrcTy.getSizeInBits() == 64) {
2394 // Truncate.
2395 B.buildExtract(Dst, Src, 0);
2396 MI.eraseFromParent();
2397 return true;
2400 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2401 DstTy.getSizeInBits() == 64) {
2402 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2404 auto PtrLo = B.buildPtrToInt(S32, Src);
2405 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2406 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2407 MI.eraseFromParent();
2408 return true;
2411 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2412 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2414 LLVMContext &Ctx = MF.getFunction().getContext();
2415 Ctx.diagnose(InvalidAddrSpaceCast);
2416 B.buildUndef(Dst);
2417 MI.eraseFromParent();
2418 return true;
2421 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2422 MachineRegisterInfo &MRI,
2423 MachineIRBuilder &B) const {
2424 Register Src = MI.getOperand(1).getReg();
2425 LLT Ty = MRI.getType(Src);
2426 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2428 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2429 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
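// Adding 2^52 (with the sign of Src) and subtracting it again rounds Src to
// the nearest integer under the default round-to-nearest-even mode; inputs
// with magnitude greater than 0x1.fffffffffffffp+51 are already integral and
// are passed through unchanged by the final select.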
2431 auto C1 = B.buildFConstant(Ty, C1Val);
2432 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2434 // TODO: Should this propagate fast-math-flags?
2435 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2436 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2438 auto C2 = B.buildFConstant(Ty, C2Val);
2439 auto Fabs = B.buildFAbs(Ty, Src);
2441 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2442 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2443 MI.eraseFromParent();
2444 return true;
2447 bool AMDGPULegalizerInfo::legalizeFceil(
2448 MachineInstr &MI, MachineRegisterInfo &MRI,
2449 MachineIRBuilder &B) const {
2451 const LLT S1 = LLT::scalar(1);
2452 const LLT S64 = LLT::scalar(64);
2454 Register Src = MI.getOperand(1).getReg();
2455 assert(MRI.getType(Src) == S64);
2457 // result = trunc(src)
2458 // if (src > 0.0 && src != result)
2459 // result += 1.0
2461 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2463 const auto Zero = B.buildFConstant(S64, 0.0);
2464 const auto One = B.buildFConstant(S64, 1.0);
2465 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2466 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2467 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2468 auto Add = B.buildSelect(S64, And, One, Zero);
2470 // TODO: Should this propagate fast-math-flags?
2471 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2472 MI.eraseFromParent();
2473 return true;
2476 bool AMDGPULegalizerInfo::legalizeFrem(
2477 MachineInstr &MI, MachineRegisterInfo &MRI,
2478 MachineIRBuilder &B) const {
2479 Register DstReg = MI.getOperand(0).getReg();
2480 Register Src0Reg = MI.getOperand(1).getReg();
2481 Register Src1Reg = MI.getOperand(2).getReg();
2482 auto Flags = MI.getFlags();
2483 LLT Ty = MRI.getType(DstReg);
2485 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2486 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2487 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2488 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2489 MI.eraseFromParent();
2490 return true;
2493 static MachineInstrBuilder extractF64Exponent(Register Hi,
2494 MachineIRBuilder &B) {
2495 const unsigned FractBits = 52;
2496 const unsigned ExpBits = 11;
2497 LLT S32 = LLT::scalar(32);
2499 auto Const0 = B.buildConstant(S32, FractBits - 32);
2500 auto Const1 = B.buildConstant(S32, ExpBits);
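// Extract the ExpBits-wide exponent field starting at bit (FractBits - 32) of
// the high word, then subtract the IEEE-754 double bias (1023) to get the
// unbiased exponent.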
2502 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2503 .addUse(Hi)
2504 .addUse(Const0.getReg(0))
2505 .addUse(Const1.getReg(0));
2507 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2510 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511 MachineInstr &MI, MachineRegisterInfo &MRI,
2512 MachineIRBuilder &B) const {
2513 const LLT S1 = LLT::scalar(1);
2514 const LLT S32 = LLT::scalar(32);
2515 const LLT S64 = LLT::scalar(64);
2517 Register Src = MI.getOperand(1).getReg();
2518 assert(MRI.getType(Src) == S64);
2520 // TODO: Should this use extract since the low half is unused?
2521 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2522 Register Hi = Unmerge.getReg(1);
2524 // Extract the upper half, since this is where we will find the sign and
2525 // exponent.
2526 auto Exp = extractF64Exponent(Hi, B);
2528 const unsigned FractBits = 52;
2530 // Extract the sign bit.
2531 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2532 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
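// Mask off the mantissa bits that lie below the binary point for this
// exponent to truncate toward zero. An exponent < 0 means |Src| < 1 and the
// result is just +/-0 (the sign bit); an exponent > 51 means Src is already
// integral and is returned unchanged.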
2534 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2536 const auto Zero32 = B.buildConstant(S32, 0);
2538 // Extend back to 64-bits.
2539 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2541 auto Shr = B.buildAShr(S64, FractMask, Exp);
2542 auto Not = B.buildNot(S64, Shr);
2543 auto Tmp0 = B.buildAnd(S64, Src, Not);
2544 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2546 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2547 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2549 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2550 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2551 MI.eraseFromParent();
2552 return true;
2555 bool AMDGPULegalizerInfo::legalizeITOFP(
2556 MachineInstr &MI, MachineRegisterInfo &MRI,
2557 MachineIRBuilder &B, bool Signed) const {
2559 Register Dst = MI.getOperand(0).getReg();
2560 Register Src = MI.getOperand(1).getReg();
2562 const LLT S64 = LLT::scalar(64);
2563 const LLT S32 = LLT::scalar(32);
2565 assert(MRI.getType(Src) == S64);
2567 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2568 auto ThirtyTwo = B.buildConstant(S32, 32);
2570 if (MRI.getType(Dst) == S64) {
2571 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2572 : B.buildUITOFP(S64, Unmerge.getReg(1));
2574 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2575 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2577 // TODO: Should this propagate fast-math-flags?
2578 B.buildFAdd(Dst, LdExp, CvtLo);
2579 MI.eraseFromParent();
2580 return true;
2583 assert(MRI.getType(Dst) == S32);
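// For a 32-bit result, shift the 64-bit source left so its significant bits
// land in the high word, OR a sticky bit derived from the discarded low half
// into the result so rounding stays correct, convert the high word, and undo
// the normalization with ldexp(32 - shift).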
2585 auto One = B.buildConstant(S32, 1);
2587 MachineInstrBuilder ShAmt;
2588 if (Signed) {
2589 auto ThirtyOne = B.buildConstant(S32, 31);
2590 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2591 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2592 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2593 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2594 .addUse(Unmerge.getReg(1));
2595 auto LS2 = B.buildSub(S32, LS, One);
2596 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2597 } else
2598 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2599 auto Norm = B.buildShl(S64, Src, ShAmt);
2600 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2601 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2602 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2603 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2604 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2605 B.buildFLdexp(Dst, FVal, Scale);
2606 MI.eraseFromParent();
2607 return true;
2610 // TODO: Copied from DAG implementation. Verify logic and document how this
2611 // actually works.
2612 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613 MachineRegisterInfo &MRI,
2614 MachineIRBuilder &B,
2615 bool Signed) const {
2617 Register Dst = MI.getOperand(0).getReg();
2618 Register Src = MI.getOperand(1).getReg();
2620 const LLT S64 = LLT::scalar(64);
2621 const LLT S32 = LLT::scalar(32);
2623 const LLT SrcLT = MRI.getType(Src);
2624 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2626 unsigned Flags = MI.getFlags();
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2629 // integers is illustrated as follows:
2631 // tf := trunc(val);
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2634 // hi := fptoi(hif);
2635 // lo := fptoi(lof);
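//
// For example, for an f64 val = 2^40 + 7: tf = 2^40 + 7,
// hif = floor(tf * 2^-32) = 256, lof = tf - 256 * 2^32 = 7,
// so hi = 256 and lo = 7.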
2637 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2638 MachineInstrBuilder Sign;
2639 if (Signed && SrcLT == S32) {
2640 // However, a 32-bit floating point number has only 23 bits mantissa and
2641 // it's not enough to hold all the significant bits of `lof` if val is
2642 // negative. To avoid the loss of precision, we need to take the absolute
2643 // value after truncating and flip the result back based on the original
2644 // signedness.
2645 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2646 Trunc = B.buildFAbs(S32, Trunc, Flags);
2648 MachineInstrBuilder K0, K1;
2649 if (SrcLT == S64) {
2650 K0 = B.buildFConstant(
2651 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652 K1 = B.buildFConstant(
2653 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654 } else {
2655 K0 = B.buildFConstant(
2656 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657 K1 = B.buildFConstant(
2658 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2661 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2662 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2663 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2665 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2666 : B.buildFPTOUI(S32, FloorMul);
2667 auto Lo = B.buildFPTOUI(S32, Fma);
2669 if (Signed && SrcLT == S32) {
2670 // Flip the result based on the signedness, which is either all 0s or 1s.
2671 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2672 // r := xor({lo, hi}, sign) - sign;
2673 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2674 Sign);
2675 } else
2676 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2677 MI.eraseFromParent();
2679 return true;
2682 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2683 MachineInstr &MI) const {
2684 MachineFunction &MF = Helper.MIRBuilder.getMF();
2685 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2687 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2688 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2690 // With ieee_mode disabled, the instructions have the correct behavior
2691 // already for G_FMINNUM/G_FMAXNUM
2692 if (!MFI->getMode().IEEE)
2693 return !IsIEEEOp;
2695 if (IsIEEEOp)
2696 return true;
2698 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2701 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702 MachineInstr &MI, MachineRegisterInfo &MRI,
2703 MachineIRBuilder &B) const {
2704 // TODO: Should move some of this into LegalizerHelper.
2706 // TODO: Promote dynamic indexing of s16 to s32
2708 Register Dst = MI.getOperand(0).getReg();
2709 Register Vec = MI.getOperand(1).getReg();
2711 LLT VecTy = MRI.getType(Vec);
2712 LLT EltTy = VecTy.getElementType();
2713 assert(EltTy == MRI.getType(Dst));
2715 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716 // but we can't go directly to that logic because you can't bitcast a vector
2717 // of pointers to a vector of integers. Therefore, introduce an intermediate
2718 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719 // drive the legalization forward.
2720 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2721 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2722 LLT IntVecTy = VecTy.changeElementType(IntTy);
2724 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2725 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2726 B.buildIntToPtr(Dst, IntElt);
2728 MI.eraseFromParent();
2729 return true;
2732 // FIXME: Artifact combiner probably should have replaced the truncated
2733 // constant before this, so we shouldn't need
2734 // getIConstantVRegValWithLookThrough.
2735 std::optional<ValueAndVReg> MaybeIdxVal =
2736 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2737 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2738 return true;
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2741 if (IdxVal < VecTy.getNumElements()) {
2742 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2743 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2744 } else {
2745 B.buildUndef(Dst);
2748 MI.eraseFromParent();
2749 return true;
2752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753 MachineInstr &MI, MachineRegisterInfo &MRI,
2754 MachineIRBuilder &B) const {
2755 // TODO: Should move some of this into LegalizerHelper.
2757 // TODO: Promote dynamic indexing of s16 to s32
2759 Register Dst = MI.getOperand(0).getReg();
2760 Register Vec = MI.getOperand(1).getReg();
2761 Register Ins = MI.getOperand(2).getReg();
2763 LLT VecTy = MRI.getType(Vec);
2764 LLT EltTy = VecTy.getElementType();
2765 assert(EltTy == MRI.getType(Ins));
2767 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768 // but we can't go directly to that logic because you can't bitcast a vector
2769 // of pointers to a vector of integers. Therefore, make the pointer vector
2770 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771 // new value, and then inttoptr the result vector back. This will then allow
2772 // the rest of legalization to take over.
2773 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2774 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2775 LLT IntVecTy = VecTy.changeElementType(IntTy);
2777 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2778 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2779 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2780 MI.getOperand(3));
2781 B.buildIntToPtr(Dst, IntVecDest);
2782 MI.eraseFromParent();
2783 return true;
2786 // FIXME: Artifact combiner probably should have replaced the truncated
2787 // constant before this, so we shouldn't need
2788 // getIConstantVRegValWithLookThrough.
2789 std::optional<ValueAndVReg> MaybeIdxVal =
2790 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2791 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792 return true;
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2796 unsigned NumElts = VecTy.getNumElements();
2797 if (IdxVal < NumElts) {
2798 SmallVector<Register, 8> SrcRegs;
2799 for (unsigned i = 0; i < NumElts; ++i)
2800 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2801 B.buildUnmerge(SrcRegs, Vec);
2803 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2804 B.buildMergeLikeInstr(Dst, SrcRegs);
2805 } else {
2806 B.buildUndef(Dst);
2809 MI.eraseFromParent();
2810 return true;
2813 bool AMDGPULegalizerInfo::legalizeSinCos(
2814 MachineInstr &MI, MachineRegisterInfo &MRI,
2815 MachineIRBuilder &B) const {
2817 Register DstReg = MI.getOperand(0).getReg();
2818 Register SrcReg = MI.getOperand(1).getReg();
2819 LLT Ty = MRI.getType(DstReg);
2820 unsigned Flags = MI.getFlags();
2822 Register TrigVal;
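// The hardware sin/cos intrinsics take their argument scaled by 1/(2*pi), so
// multiply by 0.5 * 1/pi first; subtargets with a reduced argument range also
// need the fractional part of the scaled value taken explicitly.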
2823 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2824 if (ST.hasTrigReducedRange()) {
2825 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2826 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2827 .addUse(MulVal.getReg(0))
2828 .setMIFlags(Flags)
2829 .getReg(0);
2830 } else
2831 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2833 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2834 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2835 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2836 .addUse(TrigVal)
2837 .setMIFlags(Flags);
2838 MI.eraseFromParent();
2839 return true;
2842 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2843 MachineIRBuilder &B,
2844 const GlobalValue *GV,
2845 int64_t Offset,
2846 unsigned GAFlags) const {
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849 // to the following code sequence:
2851 // For constant address space:
2852 // s_getpc_b64 s[0:1]
2853 // s_add_u32 s0, s0, $symbol
2854 // s_addc_u32 s1, s1, 0
2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857 // a fixup or relocation is emitted to replace $symbol with a literal
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2859 // operand to the global variable.
2861 // For global address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // fixups or relocations are emitted to replace $symbol@*@lo and
2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2870 // operand to the global variable.
2872 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
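// For a 32-bit destination, compute the full 64-bit PC-relative address into
// a temporary and extract the low half at the end.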
2874 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2875 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2877 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2878 .addDef(PCReg);
2880 MIB.addGlobalAddress(GV, Offset, GAFlags);
2881 if (GAFlags == SIInstrInfo::MO_NONE)
2882 MIB.addImm(0);
2883 else
2884 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2886 if (!B.getMRI()->getRegClassOrNull(PCReg))
2887 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2889 if (PtrTy.getSizeInBits() == 32)
2890 B.buildExtract(DstReg, PCReg, 0);
2891 return true;
2894 // Emit an ABS32_LO / ABS32_HI relocation stub.
2895 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2897 MachineRegisterInfo &MRI) const {
2898 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2900 LLT S32 = LLT::scalar(32);
2902 // Use the destination directly if and only if we only store the lower
2903 // address part and no register class has been set on it.
2904 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2905 ? DstReg
2906 : MRI.createGenericVirtualRegister(S32);
2908 if (!MRI.getRegClassOrNull(AddrLo))
2909 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2911 // Write the lower half.
2912 B.buildInstr(AMDGPU::S_MOV_B32)
2913 .addDef(AddrLo)
2914 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2916 // If required, write the upper half as well.
2917 if (RequiresHighHalf) {
2918 assert(PtrTy.getSizeInBits() == 64 &&
2919 "Must provide a 64-bit pointer type!");
2921 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2922 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2924 B.buildInstr(AMDGPU::S_MOV_B32)
2925 .addDef(AddrHi)
2926 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2928 // Use the destination directly, if and only if we don't have a register
2929 // class being set.
2930 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2931 ? DstReg
2932 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2934 if (!MRI.getRegClassOrNull(AddrDst))
2935 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2937 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2939 // If we created a new register for the destination, cast the result into
2940 // the final output.
2941 if (AddrDst != DstReg)
2942 B.buildCast(DstReg, AddrDst);
2943 } else if (AddrLo != DstReg) {
2944 // If we created a new register for the destination, cast the result into
2945 // the final output.
2946 B.buildCast(DstReg, AddrLo);
2950 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951 MachineInstr &MI, MachineRegisterInfo &MRI,
2952 MachineIRBuilder &B) const {
2953 Register DstReg = MI.getOperand(0).getReg();
2954 LLT Ty = MRI.getType(DstReg);
2955 unsigned AS = Ty.getAddressSpace();
2957 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2958 MachineFunction &MF = B.getMF();
2959 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2961 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2964 const Function &Fn = MF.getFunction();
2965 DiagnosticInfoUnsupported BadLDSDecl(
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2967 DS_Warning);
2968 Fn.getContext().diagnose(BadLDSDecl);
2970 // We currently don't have a way to correctly allocate LDS objects that
2971 // aren't directly associated with a kernel. We do force inlining of
2972 // functions that use local objects. However, if these dead functions are
2973 // not eliminated, we don't want a compile time error. Just emit a warning
2974 // and a trap, since there should be no callable path here.
2975 B.buildTrap();
2976 B.buildUndef(DstReg);
2977 MI.eraseFromParent();
2978 return true;
2981 // TODO: We could emit code to handle the initialization somewhere.
2982 // We ignore the initializer for now and legalize it to allow selection.
2983 // The initializer will be diagnosed during assembly emission anyway.
2984 const SITargetLowering *TLI = ST.getTargetLowering();
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2986 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2987 return true; // Leave in place;
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2992 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2993 // zero-sized type in other languages to declare the dynamic shared
2994 // memory whose size is not known at compile time. They will be
2995 // allocated by the runtime and placed directly after the static
2996 // allocated ones. They all share the same offset.
2997 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2998 // Adjust alignment for that dynamic shared memory array.
2999 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3000 LLT S32 = LLT::scalar(32);
3001 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3002 B.buildIntToPtr(DstReg, Sz);
3003 MI.eraseFromParent();
3004 return true;
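// For statically allocated LDS, the global's address is simply the offset
// assigned to it within the kernel's LDS block, so materialize that offset
// as a constant.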
3008 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3009 *cast<GlobalVariable>(GV)));
3010 MI.eraseFromParent();
3011 return true;
3014 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3015 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3016 MI.eraseFromParent();
3017 return true;
3020 const SITargetLowering *TLI = ST.getTargetLowering();
3022 if (TLI->shouldEmitFixup(GV)) {
3023 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3024 MI.eraseFromParent();
3025 return true;
3028 if (TLI->shouldEmitPCReloc(GV)) {
3029 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3030 MI.eraseFromParent();
3031 return true;
3034 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3035 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3037 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3039 MachinePointerInfo::getGOT(MF),
3040 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3041 MachineMemOperand::MOInvariant,
3042 LoadTy, Align(8));
3044 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3046 if (Ty.getSizeInBits() == 32) {
3047 // Truncate if this is a 32-bit constant address.
3048 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3049 B.buildExtract(DstReg, Load, 0);
3050 } else
3051 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3053 MI.eraseFromParent();
3054 return true;
3057 static LLT widenToNextPowerOf2(LLT Ty) {
3058 if (Ty.isVector())
3059 return Ty.changeElementCount(
3060 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3061 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3064 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3065 MachineInstr &MI) const {
3066 MachineIRBuilder &B = Helper.MIRBuilder;
3067 MachineRegisterInfo &MRI = *B.getMRI();
3068 GISelChangeObserver &Observer = Helper.Observer;
3070 Register PtrReg = MI.getOperand(1).getReg();
3071 LLT PtrTy = MRI.getType(PtrReg);
3072 unsigned AddrSpace = PtrTy.getAddressSpace();
3074 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3075 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3076 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3077 Observer.changingInstr(MI);
3078 MI.getOperand(1).setReg(Cast.getReg(0));
3079 Observer.changedInstr(MI);
3080 return true;
3083 if (MI.getOpcode() != AMDGPU::G_LOAD)
3084 return false;
3086 Register ValReg = MI.getOperand(0).getReg();
3087 LLT ValTy = MRI.getType(ValReg);
3089 if (hasBufferRsrcWorkaround(ValTy)) {
3090 Observer.changingInstr(MI);
3091 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3092 Observer.changedInstr(MI);
3093 return true;
3096 MachineMemOperand *MMO = *MI.memoperands_begin();
3097 const unsigned ValSize = ValTy.getSizeInBits();
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3100 const unsigned MemSize = MemTy.getSizeInBits();
3101 const uint64_t AlignInBits = 8 * MemAlign.value();
3103 // Widen non-power-of-2 loads to the alignment if needed
3104 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3105 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3107 // This was already the correct extending load result type, so just adjust
3108 // the memory type.
3109 if (WideMemSize == ValSize) {
3110 MachineFunction &MF = B.getMF();
3112 MachineMemOperand *WideMMO =
3113 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3114 Observer.changingInstr(MI);
3115 MI.setMemRefs(MF, {WideMMO});
3116 Observer.changedInstr(MI);
3117 return true;
3120 // Don't bother handling this edge case; it should probably never be produced.
3121 if (ValSize > WideMemSize)
3122 return false;
3124 LLT WideTy = widenToNextPowerOf2(ValTy);
3126 Register WideLoad;
3127 if (!WideTy.isVector()) {
3128 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3129 B.buildTrunc(ValReg, WideLoad).getReg(0);
3130 } else {
3131 // Extract the subvector.
3133 if (isRegisterType(ValTy)) {
3134 // If this a case where G_EXTRACT is legal, use it.
3135 // (e.g. <3 x s32> -> <4 x s32>)
3136 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3137 B.buildExtract(ValReg, WideLoad, 0);
3138 } else {
3139 // For cases where the widened type isn't a nice register value, unmerge
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3142 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3146 MI.eraseFromParent();
3147 return true;
3150 return false;
3153 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3154 MachineInstr &MI) const {
3155 MachineIRBuilder &B = Helper.MIRBuilder;
3156 MachineRegisterInfo &MRI = *B.getMRI();
3157 GISelChangeObserver &Observer = Helper.Observer;
3159 Register DataReg = MI.getOperand(0).getReg();
3160 LLT DataTy = MRI.getType(DataReg);
3162 if (hasBufferRsrcWorkaround(DataTy)) {
3163 Observer.changingInstr(MI);
3164 castBufferRsrcArgToV4I32(MI, B, 0);
3165 Observer.changedInstr(MI);
3166 return true;
3168 return false;
3171 bool AMDGPULegalizerInfo::legalizeFMad(
3172 MachineInstr &MI, MachineRegisterInfo &MRI,
3173 MachineIRBuilder &B) const {
3174 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3175 assert(Ty.isScalar());
3177 MachineFunction &MF = B.getMF();
3178 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3180 // TODO: Always legal with future ftz flag.
3181 // FIXME: Do we need just output?
3182 if (Ty == LLT::float32() &&
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3184 return true;
3185 if (Ty == LLT::float16() &&
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3187 return true;
3189 MachineIRBuilder HelperBuilder(MI);
3190 GISelObserverWrapper DummyObserver;
3191 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3192 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3195 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197 Register DstReg = MI.getOperand(0).getReg();
3198 Register PtrReg = MI.getOperand(1).getReg();
3199 Register CmpVal = MI.getOperand(2).getReg();
3200 Register NewVal = MI.getOperand(3).getReg();
3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203 "this should not have been custom lowered");
3205 LLT ValTy = MRI.getType(CmpVal);
3206 LLT VecTy = LLT::fixed_vector(2, ValTy);
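// Pack the operands as <2 x ValTy> = {new value, compare value}, the
// marshalled form consumed by the G_AMDGPU_ATOMIC_CMPXCHG pseudo for the
// buffer/flat cmpswap instructions.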
3208 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3210 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211 .addDef(DstReg)
3212 .addUse(PtrReg)
3213 .addUse(PackedVal)
3214 .setMemRefs(MI.memoperands());
3216 MI.eraseFromParent();
3217 return true;
3220 /// Return true if it's known that \p Src can never be an f32 denormal value.
3221 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3222 Register Src) {
3223 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3224 switch (DefMI->getOpcode()) {
3225 case TargetOpcode::G_INTRINSIC: {
3226 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3227 case Intrinsic::amdgcn_frexp_mant:
3228 return true;
3229 default:
3230 break;
3233 break;
3235 case TargetOpcode::G_FFREXP: {
3236 if (DefMI->getOperand(0).getReg() == Src)
3237 return true;
3238 break;
3240 case TargetOpcode::G_FPEXT: {
3241 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3243 default:
3244 return false;
3247 return false;
3250 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3251 if (Flags & MachineInstr::FmAfn)
3252 return true;
3253 const auto &Options = MF.getTarget().Options;
3254 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3257 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3258 unsigned Flags) {
3259 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3260 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3261 DenormalMode::PreserveSign;
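// If Src may be an f32 denormal, scale it by 2^32 so the hardware log
// intrinsic sees a normal value, and return the scaled input along with the
// predicate that controlled the scaling; callers compensate by subtracting
// 32.0 (in log2 terms) from the result. Returns empty registers when no
// scaling is needed.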
3264 std::pair<Register, Register>
3265 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3266 unsigned Flags) const {
3267 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3268 return {};
3270 const LLT F32 = LLT::scalar(32);
3271 auto SmallestNormal = B.buildFConstant(
3272 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3273 auto IsLtSmallestNormal =
3274 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3276 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3277 auto One = B.buildFConstant(F32, 1.0);
3278 auto ScaleFactor =
3279 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3280 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3282 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
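// Worked example of the scaling: if |Src| is below the smallest normal f32
// (0x1.0p-126), it is multiplied by 0x1.0p+32 so the log intrinsic sees a
// normal value, and the callers undo the scale with a log identity:
//   log2(x)  = log2(x * 2^32)  - 32
//   ln(x)    = ln(x * 2^32)    - 32 * ln(2)      (~0x1.62e430p+4)
//   log10(x) = log10(x * 2^32) - 32 * log10(2)   (~0x1.344136p+3)
// When no denormal handling is required, an empty pair is returned and the
// callers skip the correction entirely.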
3285 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3286 MachineIRBuilder &B) const {
3287 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288 // If we have to handle denormals, scale up the input and adjust the result.
3290 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3293 Register Dst = MI.getOperand(0).getReg();
3294 Register Src = MI.getOperand(1).getReg();
3295 LLT Ty = B.getMRI()->getType(Dst);
3296 unsigned Flags = MI.getFlags();
3298 if (Ty == LLT::scalar(16)) {
3299 const LLT F32 = LLT::scalar(32);
3300 // Nothing in half is a denormal when promoted to f32.
3301 auto Ext = B.buildFPExt(F32, Src, Flags);
3302 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3303 .addUse(Ext.getReg(0))
3304 .setMIFlags(Flags);
3305 B.buildFPTrunc(Dst, Log2, Flags);
3306 MI.eraseFromParent();
3307 return true;
3310 assert(Ty == LLT::scalar(32));
3312 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3313 if (!ScaledInput) {
3314 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3315 .addUse(Src)
3316 .setMIFlags(Flags);
3317 MI.eraseFromParent();
3318 return true;
3321 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3322 .addUse(ScaledInput)
3323 .setMIFlags(Flags);
3325 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3326 auto Zero = B.buildFConstant(Ty, 0.0);
3327 auto ResultOffset =
3328 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3329 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3331 MI.eraseFromParent();
3332 return true;
3335 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3336 Register Z, unsigned Flags) {
3337 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3338 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3341 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3342 MachineIRBuilder &B) const {
3343 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3344 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3346 MachineRegisterInfo &MRI = *B.getMRI();
3347 Register Dst = MI.getOperand(0).getReg();
3348 Register X = MI.getOperand(1).getReg();
3349 unsigned Flags = MI.getFlags();
3350 const LLT Ty = MRI.getType(X);
3351 MachineFunction &MF = B.getMF();
3353 const LLT F32 = LLT::scalar(32);
3354 const LLT F16 = LLT::scalar(16);
3356 const AMDGPUTargetMachine &TM =
3357 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3359 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3360 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3361 if (Ty == F16 && !ST.has16BitInsts()) {
3362 Register LogVal = MRI.createGenericVirtualRegister(F32);
3363 auto PromoteSrc = B.buildFPExt(F32, X);
3364 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3365 B.buildFPTrunc(Dst, LogVal);
3366 } else {
3367 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3370 MI.eraseFromParent();
3371 return true;
3374 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3375 if (ScaledInput)
3376 X = ScaledInput;
3378 auto Y =
3379 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3381 Register R;
3382 if (ST.hasFastFMAF32()) {
3383 // c+cc are ln(2)/ln(10) to more than 49 bits
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3387 // c + cc is ln(2) to more than 49 bits
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3391 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3392 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3394 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3395 auto NegR = B.buildFNeg(Ty, R, Flags);
3396 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3397 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3398 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3399 } else {
3400 // ch+ct is ln(2)/ln(10) to more than 36 bits
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3404 // ch + ct is ln(2) to more than 36 bits
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
3408 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3409 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3411 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3412 auto YH = B.buildAnd(Ty, Y, MaskConst);
3413 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3414 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3416 Register Mad0 =
3417 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3418 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3419 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3422 const bool IsFiniteOnly =
3423 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3424 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3426 if (!IsFiniteOnly) {
3427 // Expand isfinite(x) => fabs(x) < inf
3428 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3429 auto Fabs = B.buildFAbs(Ty, Y);
3430 auto IsFinite =
3431 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3432 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3435 if (ScaledInput) {
3436 auto Zero = B.buildFConstant(Ty, 0.0);
3437 auto ShiftK =
3438 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3439 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3440 B.buildFSub(Dst, R, Shift, Flags);
3441 } else {
3442 B.buildCopy(Dst, R);
3445 MI.eraseFromParent();
3446 return true;
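// Note on the constant splits used above: ln(2) / log10(2) do not fit in a
// single f32, so they are kept as a high part (c / ch) plus a small
// correction (cc / ct). On the FMA path the product rounding error is
// recovered explicitly, roughly
//   r = y * c;  r += fma(y, c, -r) + y * cc;
// On the non-FMA path y itself is split instead (the 0xfffff000 mask clears
// the low 12 significand bits), and the four partial products against ch/ct
// are accumulated smallest-first through getMad.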
3449 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3450 Register Src, bool IsLog10,
3451 unsigned Flags) const {
3452 const double Log2BaseInverted =
3453 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3455 LLT Ty = B.getMRI()->getType(Dst);
3457 if (Ty == LLT::scalar(32)) {
3458 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3459 if (ScaledInput) {
3460 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3461 .addUse(Src)
3462 .setMIFlags(Flags);
3463 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3464 auto Zero = B.buildFConstant(Ty, 0.0);
3465 auto ResultOffset =
3466 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3467 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3469 if (ST.hasFastFMAF32())
3470 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3471 else {
3472 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3473 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3476 return true;
3480 auto Log2Operand = Ty == LLT::scalar(16)
3481 ? B.buildFLog2(Ty, Src, Flags)
3482 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3483 .addUse(Src)
3484 .setMIFlags(Flags);
3485 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3486 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3487 return true;
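// The fast path relies on the base-change identity
//   log_b(x) = log2(x) * (ln(2) / ln(b))
// so ln(x) uses log2(x) * ln(2) and log10(x) uses log2(x) * (ln(2)/ln(10)).
// When the f32 input had to be scaled by 2^32, the constant offset
// -32 * Log2BaseInverted is folded into the final FMA or add.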
3490 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3491 MachineIRBuilder &B) const {
3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493 // If we have to handle denormals, scale up the input and adjust the result.
3495 Register Dst = MI.getOperand(0).getReg();
3496 Register Src = MI.getOperand(1).getReg();
3497 unsigned Flags = MI.getFlags();
3498 LLT Ty = B.getMRI()->getType(Dst);
3499 const LLT F16 = LLT::scalar(16);
3500 const LLT F32 = LLT::scalar(32);
3502 if (Ty == F16) {
3503 // Nothing in half is a denormal when promoted to f32.
3504 auto Ext = B.buildFPExt(F32, Src, Flags);
3505 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3506 .addUse(Ext.getReg(0))
3507 .setMIFlags(Flags);
3508 B.buildFPTrunc(Dst, Exp2, Flags);
3509 MI.eraseFromParent();
3510 return true;
3513 assert(Ty == F32);
3515 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3516 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3517 .addUse(Src)
3518 .setMIFlags(Flags);
3519 MI.eraseFromParent();
3520 return true;
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3526 // -126.0f; inputs below this give a subnormal exp2 result.
3527 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3528 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3529 RangeCheckConst, Flags);
3531 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3532 auto Zero = B.buildFConstant(Ty, 0.0);
3533 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3534 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3536 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3537 .addUse(AddInput.getReg(0))
3538 .setMIFlags(Flags);
3540 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3541 auto One = B.buildFConstant(Ty, 1.0);
3542 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3543 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3544 MI.eraseFromParent();
3545 return true;
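// Summary of the denormal path: for x < -126.0 the exact 2^x is subnormal,
// which v_exp_f32 flushes when denormals are off, so the lowering uses
//   2^x = 2^(x + 64) * 2^-64
// adding 64 only when needed (via the selects); the intrinsic then evaluates
// a normal-range value and the final multiply by 2^-64 produces the
// (possibly subnormal) result.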
3548 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3549 Register X, unsigned Flags) const {
3550 LLT Ty = B.getMRI()->getType(Dst);
3551 LLT F32 = LLT::scalar(32);
3553 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3554 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3555 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3557 if (Ty == F32) {
3558 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3559 .addUse(Mul.getReg(0))
3560 .setMIFlags(Flags);
3561 } else {
3562 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3565 return true;
3568 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3569 auto NeedsScaling =
3570 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3571 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3572 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3573 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3575 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3576 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3578 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3579 .addUse(ExpInput.getReg(0))
3580 .setMIFlags(Flags);
3582 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3583 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3584 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3585 return true;
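// The fast expansion is e^x = 2^(x * log2(e)). When f32 denormal results
// must be honored, very negative inputs are shifted first:
//   e^x = 2^((x + 64) * log2(e)) * e^-64
// The threshold -0x1.5d58a0p+6 is roughly ln(2^-126) ~= -87.34, the point
// where e^x becomes subnormal, and 0x1.969d48p-93 is approximately e^-64.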
3588 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3589 MachineIRBuilder &B) const {
3590 Register Dst = MI.getOperand(0).getReg();
3591 Register X = MI.getOperand(1).getReg();
3592 const unsigned Flags = MI.getFlags();
3593 MachineFunction &MF = B.getMF();
3594 MachineRegisterInfo &MRI = *B.getMRI();
3595 LLT Ty = MRI.getType(Dst);
3596 const LLT F16 = LLT::scalar(16);
3597 const LLT F32 = LLT::scalar(32);
3598 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3600 if (Ty == F16) {
3601 // v_exp_f16 (fmul x, log2e)
3602 if (allowApproxFunc(MF, Flags)) {
3603 // TODO: Does this really require fast?
3604 legalizeFExpUnsafe(B, Dst, X, Flags);
3605 MI.eraseFromParent();
3606 return true;
3609 // exp(f16 x) ->
3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3612 // Nothing in half is a denormal when promoted to f32.
3613 auto Ext = B.buildFPExt(F32, X, Flags);
3614 Register Lowered = MRI.createGenericVirtualRegister(F32);
3615 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3616 B.buildFPTrunc(Dst, Lowered, Flags);
3617 MI.eraseFromParent();
3618 return true;
3621 assert(Ty == F32);
3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624 // library behavior. Also, is known-not-daz source sufficient?
3625 if (allowApproxFunc(MF, Flags)) {
3626 legalizeFExpUnsafe(B, Dst, X, Flags);
3627 MI.eraseFromParent();
3628 return true;
3631 // Algorithm:
3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636 // n = 64*m + j, 0 <= j < 64
3638 // e^x = 2^((64*m + j + f)/64)
3639 // = (2^m) * (2^(j/64)) * 2^(f/64)
3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3645 // e^x = (2^m) * (2^(j/64)) * e^r
3647 // (2^(j/64)) is precomputed
3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3650 // e^r = 1 + q
3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3655 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3656 Register PH, PL;
3658 if (ST.hasFastFMAF32()) {
3659 const float c_exp = numbers::log2ef;
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3661 const float c_exp10 = 0x1.a934f0p+1f;
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3664 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3665 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3666 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3667 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3669 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3670 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3671 } else {
3672 const float ch_exp = 0x1.714000p+0f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3675 const float ch_exp10 = 0x1.a92000p+1f;
3676 const float cl_exp10 = 0x1.4f0978p-11f;
3678 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3679 auto XH = B.buildAnd(Ty, X, MaskConst);
3680 auto XL = B.buildFSub(Ty, X, XH, Flags);
3682 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3683 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3685 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3686 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3688 Register Mad0 =
3689 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3690 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3693 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3695 // It is unsafe to contract this fsub into the PH multiply.
3696 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3697 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3698 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3700 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3701 .addUse(A.getReg(0))
3702 .setMIFlags(Flags);
3703 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3705 auto UnderflowCheckConst =
3706 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3707 auto Zero = B.buildFConstant(Ty, 0.0);
3708 auto Underflow =
3709 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3711 R = B.buildSelect(Ty, Underflow, Zero, R);
3713 const auto &Options = MF.getTarget().Options;
3715 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3716 auto OverflowCheckConst =
3717 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3719 auto Overflow =
3720 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3721 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3722 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3725 B.buildCopy(Dst, R);
3726 MI.eraseFromParent();
3727 return true;
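// In short, the emitted sequence computes p = x * log2(e) (or x * log2(10)
// for G_FEXP10) as an extended-precision pair PH + PL, then
//   n = roundeven(PH),  f = (PH - n) + PL    (|f| stays around 0.5 or less)
//   e^x ~= ldexp(v_exp_f32(f), (int)n)
// The trailing clamps use approximate bounds of the f32 range: inputs below
// ln(2^-149) (~ -103.28) or log10(2^-149) (~ -44.85) produce 0.0, and,
// unless infs are assumed absent, inputs above ~ln(FLT_MAX) (~ 88.72) or
// ~log10(FLT_MAX) (~ 38.53) produce +inf.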
3730 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3731 MachineIRBuilder &B) const {
3732 Register Dst = MI.getOperand(0).getReg();
3733 Register Src0 = MI.getOperand(1).getReg();
3734 Register Src1 = MI.getOperand(2).getReg();
3735 unsigned Flags = MI.getFlags();
3736 LLT Ty = B.getMRI()->getType(Dst);
3737 const LLT F16 = LLT::float16();
3738 const LLT F32 = LLT::float32();
3740 if (Ty == F32) {
3741 auto Log = B.buildFLog2(F32, Src0, Flags);
3742 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3743 .addUse(Log.getReg(0))
3744 .addUse(Src1)
3745 .setMIFlags(Flags);
3746 B.buildFExp2(Dst, Mul, Flags);
3747 } else if (Ty == F16) {
3748 // There's no f16 fmul_legacy, so we need to convert for it.
3749 auto Log = B.buildFLog2(F16, Src0, Flags);
3750 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3751 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3752 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3753 .addUse(Ext0.getReg(0))
3754 .addUse(Ext1.getReg(0))
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3757 } else
3758 return false;
3760 MI.eraseFromParent();
3761 return true;
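// This lowering uses pow(x, y) = exp2(y * log2(x)). The multiply goes
// through amdgcn_fmul_legacy, which treats 0 * anything as 0 (DX9-style
// legacy semantics), so e.g. pow(x, 0) still yields exp2(0) == 1.0 even when
// log2(x) is +/-inf or NaN. There is no f16 legacy multiply, hence the round
// trip through f32 for the f16 case.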
3764 // Find a source register, ignoring any possible source modifiers.
3765 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3766 Register ModSrc = OrigSrc;
3767 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3768 ModSrc = SrcFNeg->getOperand(1).getReg();
3769 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3770 ModSrc = SrcFAbs->getOperand(1).getReg();
3771 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3772 ModSrc = SrcFAbs->getOperand(1).getReg();
3773 return ModSrc;
3776 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3777 MachineRegisterInfo &MRI,
3778 MachineIRBuilder &B) const {
3780 const LLT S1 = LLT::scalar(1);
3781 const LLT F64 = LLT::float64();
3782 Register Dst = MI.getOperand(0).getReg();
3783 Register OrigSrc = MI.getOperand(1).getReg();
3784 unsigned Flags = MI.getFlags();
3785 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3786 "this should not have been custom lowered");
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3791 // V_FRACT bug is:
3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3794 // Convert floor(x) to (x - fract(x))
3796 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3797 .addUse(OrigSrc)
3798 .setMIFlags(Flags);
3800 // Give source modifier matching some assistance before obscuring a foldable
3801 // pattern.
3803 // TODO: We can probably avoid the neg on the fract; the input sign to fract
3804 // shouldn't matter.
3805 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3807 auto Const =
3808 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3810 Register Min = MRI.createGenericVirtualRegister(F64);
3812 // We don't need to concern ourselves with the snan handling difference, so
3813 // use the one which will directly select.
3814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3815 if (MFI->getMode().IEEE)
3816 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3817 else
3818 B.buildFMinNum(Min, Fract, Const, Flags);
3820 Register CorrectedFract = Min;
3821 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3822 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
3823 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3826 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3827 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3829 MI.eraseFromParent();
3830 return true;
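// Net effect of the expansion: floor(x) = x - fract(x), where fract(x) is
// clamped to at most 0x1.fffffffffffffp-1 (0x3fefffffffffffff is the bit
// pattern of the largest double below 1.0) to work around the V_FRACT bug,
// and NaN inputs take the un-clamped source through the select so they
// propagate unchanged.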
3833 // Turn an illegal packed v2s16 build vector into bit operations.
3834 // TODO: This should probably be a bitcast action in LegalizerHelper.
3835 bool AMDGPULegalizerInfo::legalizeBuildVector(
3836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3837 Register Dst = MI.getOperand(0).getReg();
3838 const LLT S32 = LLT::scalar(32);
3839 const LLT S16 = LLT::scalar(16);
3840 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3842 Register Src0 = MI.getOperand(1).getReg();
3843 Register Src1 = MI.getOperand(2).getReg();
3845 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846 assert(MRI.getType(Src0) == S32);
3847 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3848 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3851 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3852 B.buildBitcast(Dst, Merge);
3854 MI.eraseFromParent();
3855 return true;
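// For reference, the packed result is built as
//   bitcast((zext(Src1) << 16) | zext(Src0))
// i.e. element 0 ends up in the low 16 bits of the 32-bit register.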
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3860 // Source and accumulation registers must all be 32 bits wide.
3862 // TODO: When the multiply is uniform, we should produce a code sequence
3863 // that is better suited to instruction selection on the SALU. Instead of
3864 // the outer loop going over parts of the result, the outer loop should go
3865 // over parts of one of the factors. This should result in instruction
3866 // selection that makes full use of S_ADDC_U32 instructions.
3867 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3868 MutableArrayRef<Register> Accum,
3869 ArrayRef<Register> Src0,
3870 ArrayRef<Register> Src1,
3871 bool UsePartialMad64_32,
3872 bool SeparateOddAlignedProducts) const {
3873 // Use (possibly empty) vectors of S1 registers to represent the set of
3874 // carries from one pair of positions to the next.
3875 using Carry = SmallVector<Register, 2>;
3877 MachineIRBuilder &B = Helper.MIRBuilder;
3878 GISelKnownBits &KB = *Helper.getKnownBits();
3880 const LLT S1 = LLT::scalar(1);
3881 const LLT S32 = LLT::scalar(32);
3882 const LLT S64 = LLT::scalar(64);
3884 Register Zero32;
3885 Register Zero64;
3887 auto getZero32 = [&]() -> Register {
3888 if (!Zero32)
3889 Zero32 = B.buildConstant(S32, 0).getReg(0);
3890 return Zero32;
3892 auto getZero64 = [&]() -> Register {
3893 if (!Zero64)
3894 Zero64 = B.buildConstant(S64, 0).getReg(0);
3895 return Zero64;
3898 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3899 for (unsigned i = 0; i < Src0.size(); ++i) {
3900 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3901 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3907 // Returns the carry-out, which is a single S1 register or null.
3908 auto mergeCarry =
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3910 if (CarryIn.empty())
3911 return Register();
3913 bool HaveCarryOut = true;
3914 Register CarryAccum;
3915 if (CarryIn.size() == 1) {
3916 if (!LocalAccum) {
3917 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3918 return Register();
3921 CarryAccum = getZero32();
3922 } else {
3923 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3924 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3925 CarryAccum =
3926 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3927 .getReg(0);
3930 if (!LocalAccum) {
3931 LocalAccum = getZero32();
3932 HaveCarryOut = false;
3936 auto Add =
3937 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3938 LocalAccum = Add.getReg(0);
3939 return HaveCarryOut ? Add.getReg(1) : Register();
3942 // Build a multiply-add chain to compute
3944 // LocalAccum + (partial products at DstIndex)
3945 // + (opportunistic subset of CarryIn)
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3952 auto buildMadChain =
3953 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3954 -> Carry {
3955 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3956 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3958 Carry CarryOut;
3959 unsigned j0 = 0;
3961 // Use plain 32-bit multiplication for the most significant part of the
3962 // result by default.
3963 if (LocalAccum.size() == 1 &&
3964 (!UsePartialMad64_32 || !CarryIn.empty())) {
3965 do {
3966 // Skip multiplication if one of the operands is 0
3967 unsigned j1 = DstIndex - j0;
3968 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3969 ++j0;
3970 continue;
3972 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3973 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3974 LocalAccum[0] = Mul.getReg(0);
3975 } else {
3976 if (CarryIn.empty()) {
3977 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3978 } else {
3979 LocalAccum[0] =
3980 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3981 .getReg(0);
3982 CarryIn.pop_back();
3985 ++j0;
3986 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3989 // Build full 64-bit multiplies.
3990 if (j0 <= DstIndex) {
3991 bool HaveSmallAccum = false;
3992 Register Tmp;
3994 if (LocalAccum[0]) {
3995 if (LocalAccum.size() == 1) {
3996 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3997 HaveSmallAccum = true;
3998 } else if (LocalAccum[1]) {
3999 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4000 HaveSmallAccum = false;
4001 } else {
4002 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4003 HaveSmallAccum = true;
4005 } else {
4006 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4007 Tmp = getZero64();
4008 HaveSmallAccum = true;
4011 do {
4012 unsigned j1 = DstIndex - j0;
4013 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4014 ++j0;
4015 continue;
4017 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4018 {Src0[j0], Src1[j1], Tmp});
4019 Tmp = Mad.getReg(0);
4020 if (!HaveSmallAccum)
4021 CarryOut.push_back(Mad.getReg(1));
4022 HaveSmallAccum = false;
4024 ++j0;
4025 } while (j0 <= DstIndex);
4027 auto Unmerge = B.buildUnmerge(S32, Tmp);
4028 LocalAccum[0] = Unmerge.getReg(0);
4029 if (LocalAccum.size() > 1)
4030 LocalAccum[1] = Unmerge.getReg(1);
4033 return CarryOut;
4036 // Outer multiply loop, iterating over destination parts from least
4037 // significant to most significant parts.
4039 // The columns of the following diagram correspond to the destination parts
4040 // affected by one iteration of the outer loop (ignoring boundary
4041 // conditions).
4043 // Dest index relative to 2 * i: 1 0 -1
4044 // ------
4045 // Carries from previous iteration: e o
4046 // Even-aligned partial product sum: E E .
4047 // Odd-aligned partial product sum: O O
4049 // 'o' is OddCarry, 'e' is EvenCarry.
4050 // EE and OO are computed from partial products via buildMadChain and use
4051 // accumulation where possible and appropriate.
4053 Register SeparateOddCarry;
4054 Carry EvenCarry;
4055 Carry OddCarry;
4057 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4058 Carry OddCarryIn = std::move(OddCarry);
4059 Carry EvenCarryIn = std::move(EvenCarry);
4060 OddCarry.clear();
4061 EvenCarry.clear();
4063 // Partial products at offset 2 * i.
4064 if (2 * i < Accum.size()) {
4065 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4066 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4069 // Partial products at offset 2 * i - 1.
4070 if (i > 0) {
4071 if (!SeparateOddAlignedProducts) {
4072 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4074 } else {
4075 bool IsHighest = 2 * i >= Accum.size();
4076 Register SeparateOddOut[2];
4077 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4078 .take_front(IsHighest ? 1 : 2);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4081 MachineInstr *Lo;
4083 if (i == 1) {
4084 if (!IsHighest)
4085 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4086 else
4087 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4088 } else {
4089 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4090 SeparateOddCarry);
4092 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4094 if (!IsHighest) {
4095 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4096 Lo->getOperand(1).getReg());
4097 Accum[2 * i] = Hi.getReg(0);
4098 SeparateOddCarry = Hi.getReg(1);
4103 // Add in the carries from the previous iteration
4104 if (i > 0) {
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4106 EvenCarryIn.push_back(CarryOut);
4108 if (2 * i < Accum.size()) {
4109 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4110 OddCarry.push_back(CarryOut);
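// Example of the decomposition for a 2-part (64-bit) multiply with 32-bit
// parts a = (a1, a0) and b = (b1, b0), least significant part first:
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)
// Partial products that only affect bits above the result width, and any
// operand parts known to be zero, are skipped entirely.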
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4118 // TODO: If the multiply is followed by an addition, we should attempt to
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4121 MachineInstr &MI) const {
4122 assert(ST.hasMad64_32());
4123 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4125 MachineIRBuilder &B = Helper.MIRBuilder;
4126 MachineRegisterInfo &MRI = *B.getMRI();
4128 Register DstReg = MI.getOperand(0).getReg();
4129 Register Src0 = MI.getOperand(1).getReg();
4130 Register Src1 = MI.getOperand(2).getReg();
4132 LLT Ty = MRI.getType(DstReg);
4133 assert(Ty.isScalar());
4135 unsigned Size = Ty.getSizeInBits();
4136 unsigned NumParts = Size / 32;
4137 assert((Size % 32) == 0);
4138 assert(NumParts >= 2);
4140 // Whether to use MAD_64_32 for partial products whose high half is
4141 // discarded. This avoids some ADD instructions but risks false dependency
4142 // stalls on some subtargets in some cases.
4143 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4145 // Whether to compute odd-aligned partial products separately. This is
4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147 // in an even-aligned VGPR.
4148 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4150 LLT S32 = LLT::scalar(32);
4151 SmallVector<Register, 2> Src0Parts, Src1Parts;
4152 for (unsigned i = 0; i < NumParts; ++i) {
4153 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4154 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4156 B.buildUnmerge(Src0Parts, Src0);
4157 B.buildUnmerge(Src1Parts, Src1);
4159 SmallVector<Register, 2> AccumRegs(NumParts);
4160 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4161 SeparateOddAlignedProducts);
4163 B.buildMergeLikeInstr(DstReg, AccumRegs);
4164 MI.eraseFromParent();
4165 return true;
4168 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170 // case with a single min instruction instead of a compare+select.
4171 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172 MachineRegisterInfo &MRI,
4173 MachineIRBuilder &B) const {
4174 Register Dst = MI.getOperand(0).getReg();
4175 Register Src = MI.getOperand(1).getReg();
4176 LLT DstTy = MRI.getType(Dst);
4177 LLT SrcTy = MRI.getType(Src);
4179 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180 ? AMDGPU::G_AMDGPU_FFBH_U32
4181 : AMDGPU::G_AMDGPU_FFBL_B32;
4182 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4183 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4185 MI.eraseFromParent();
4186 return true;
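// The FFBH/FFBL pseudos map to the hardware find-first-bit instructions,
// which return -1 (all bits set) when the input is zero, so clamping with
// umin(result, bitwidth) yields the defined ctlz(0)/cttz(0) == bitwidth
// result without an extra compare and select.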
4189 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190 MachineRegisterInfo &MRI,
4191 MachineIRBuilder &B) const {
4192 Register Dst = MI.getOperand(0).getReg();
4193 Register Src = MI.getOperand(1).getReg();
4194 LLT SrcTy = MRI.getType(Src);
4195 TypeSize NumBits = SrcTy.getSizeInBits();
4197 assert(NumBits < 32u);
4199 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4200 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4201 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4202 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4203 B.buildTrunc(Dst, Ctlz);
4204 MI.eraseFromParent();
4205 return true;
4208 // Check that this is a G_XOR x, -1
4209 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210 if (MI.getOpcode() != TargetOpcode::G_XOR)
4211 return false;
4212 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4213 return ConstVal && *ConstVal == -1;
4216 // Return the use branch instruction, or null if the usage is invalid.
4217 static MachineInstr *
4218 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4220 Register CondDef = MI.getOperand(0).getReg();
4221 if (!MRI.hasOneNonDBGUse(CondDef))
4222 return nullptr;
4224 MachineBasicBlock *Parent = MI.getParent();
4225 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4227 if (isNot(MRI, *UseMI)) {
4228 Register NegatedCond = UseMI->getOperand(0).getReg();
4229 if (!MRI.hasOneNonDBGUse(NegatedCond))
4230 return nullptr;
4232 // We're deleting the def of this value, so we need to remove it.
4233 eraseInstr(*UseMI, MRI);
4235 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4236 Negated = true;
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240 return nullptr;
4242 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247 return nullptr;
4248 UncondBrTarget = &*NextMBB;
4249 } else {
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4251 return nullptr;
4252 Br = &*Next;
4253 UncondBrTarget = Br->getOperand(0).getMBB();
4256 return UseMI;
4259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260 const ArgDescriptor *Arg,
4261 const TargetRegisterClass *ArgRC,
4262 LLT ArgTy) const {
4263 MCRegister SrcReg = Arg->getRegister();
4264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4265 assert(DstReg.isVirtual() && "Virtual register expected");
4267 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4268 *ArgRC, B.getDebugLoc(), ArgTy);
4269 if (Arg->isMasked()) {
4270 // TODO: Should we try to emit this once in the entry block?
4271 const LLT S32 = LLT::scalar(32);
4272 const unsigned Mask = Arg->getMask();
4273 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4275 Register AndMaskSrc = LiveIn;
4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4278 // 0.
4279 if (Shift != 0) {
4280 auto ShiftAmt = B.buildConstant(S32, Shift);
4281 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4284 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4285 } else {
4286 B.buildCopy(DstReg, LiveIn);
4289 return true;
4292 bool AMDGPULegalizerInfo::loadInputValue(
4293 Register DstReg, MachineIRBuilder &B,
4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296 const ArgDescriptor *Arg = nullptr;
4297 const TargetRegisterClass *ArgRC;
4298 LLT ArgTy;
4300 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301 const ArgDescriptor WorkGroupIDX =
4302 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4303 // If GridZ is not programmed in an entry function then the hardware will set
4304 // it to all zeros, so there is no need to mask the GridY value in the low
4305 // order bits.
4306 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307 AMDGPU::TTMP7,
4308 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309 const ArgDescriptor WorkGroupIDZ =
4310 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4311 if (ST.hasArchitectedSGPRs() &&
4312 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313 switch (ArgType) {
4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315 Arg = &WorkGroupIDX;
4316 ArgRC = &AMDGPU::SReg_32RegClass;
4317 ArgTy = LLT::scalar(32);
4318 break;
4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320 Arg = &WorkGroupIDY;
4321 ArgRC = &AMDGPU::SReg_32RegClass;
4322 ArgTy = LLT::scalar(32);
4323 break;
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325 Arg = &WorkGroupIDZ;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(32);
4328 break;
4329 default:
4330 break;
4334 if (!Arg)
4335 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4337 if (!Arg) {
4338 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340 // case the pointer argument may be missing and we use null.
4341 B.buildConstant(DstReg, 0);
4342 return true;
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4346 // attributes uses the corresponding intrinsic.
4347 B.buildUndef(DstReg);
4348 return true;
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352 return false; // TODO: Handle these
4353 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4356 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4360 return false;
4362 MI.eraseFromParent();
4363 return true;
4366 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4367 int64_t C) {
4368 B.buildConstant(MI.getOperand(0).getReg(), C);
4369 MI.eraseFromParent();
4370 return true;
4373 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4375 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4376 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4377 if (MaxID == 0)
4378 return replaceWithConstant(B, MI, 0);
4380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4381 const ArgDescriptor *Arg;
4382 const TargetRegisterClass *ArgRC;
4383 LLT ArgTy;
4384 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4386 Register DstReg = MI.getOperand(0).getReg();
4387 if (!Arg) {
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4389 // attributes uses the corresponding intrinsic.
4390 B.buildUndef(DstReg);
4391 MI.eraseFromParent();
4392 return true;
4395 if (Arg->isMasked()) {
4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397 // masking operations anyway.
4399 // TODO: We could assert the top bit is 0 for the source copy.
4400 if (!loadInputValue(DstReg, B, ArgType))
4401 return false;
4402 } else {
4403 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4404 if (!loadInputValue(TmpReg, B, ArgType))
4405 return false;
4406 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4409 MI.eraseFromParent();
4410 return true;
4413 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4414 int64_t Offset) const {
4415 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4418 // TODO: If we passed in the base kernel offset we could have a better
4419 // alignment than 4, but we don't really need it.
4420 if (!loadInputValue(KernArgReg, B,
4421 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4422 llvm_unreachable("failed to find kernarg segment ptr");
4424 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4425 // TODO: Should get nuw
4426 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4429 /// Legalize a value that's loaded from kernel arguments. This is only used by
4430 /// legacy intrinsics.
4431 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4432 MachineIRBuilder &B,
4433 uint64_t Offset,
4434 Align Alignment) const {
4435 Register DstReg = MI.getOperand(0).getReg();
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4438 "unexpected kernarg parameter type");
4440 Register Ptr = getKernargParameterPtr(B, Offset);
4441 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4442 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4443 MachineMemOperand::MODereferenceable |
4444 MachineMemOperand::MOInvariant);
4445 MI.eraseFromParent();
4446 return true;
4449 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4450 MachineRegisterInfo &MRI,
4451 MachineIRBuilder &B) const {
4452 Register Dst = MI.getOperand(0).getReg();
4453 LLT DstTy = MRI.getType(Dst);
4454 LLT S16 = LLT::scalar(16);
4455 LLT S32 = LLT::scalar(32);
4456 LLT S64 = LLT::scalar(64);
4458 if (DstTy == S16)
4459 return legalizeFDIV16(MI, MRI, B);
4460 if (DstTy == S32)
4461 return legalizeFDIV32(MI, MRI, B);
4462 if (DstTy == S64)
4463 return legalizeFDIV64(MI, MRI, B);
4465 return false;
4468 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4469 Register DstDivReg,
4470 Register DstRemReg,
4471 Register X,
4472 Register Y) const {
4473 const LLT S1 = LLT::scalar(1);
4474 const LLT S32 = LLT::scalar(32);
4476 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477 // algorithm used here.
4479 // Initial estimate of inv(y).
4480 auto FloatY = B.buildUITOFP(S32, Y);
4481 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4482 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4483 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4484 auto Z = B.buildFPTOUI(S32, ScaledY);
4486 // One round of UNR.
4487 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4488 auto NegYZ = B.buildMul(S32, NegY, Z);
4489 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4491 // Quotient/remainder estimate.
4492 auto Q = B.buildUMulH(S32, X, Z);
4493 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4495 // First quotient/remainder refinement.
4496 auto One = B.buildConstant(S32, 1);
4497 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4498 if (DstDivReg)
4499 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4500 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4502 // Second quotient/remainder refinement.
4503 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4504 if (DstDivReg)
4505 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4507 if (DstRemReg)
4508 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
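// Sketch of the expansion above: start from z ~= floor(2^32 / y), obtained
// from the G_AMDGPU_RCP_IFLAG estimate scaled by 0x4f7ffffe (just under
// 2^32), refine it with one Newton-Raphson-style step
//   z = z + mulhi(z, z * (0 - y))
// and form the estimates q = mulhi(x, z), r = x - q * y. The quotient
// estimate can only be slightly short (by at most 2), so the two rounds of
// "if (r >= y) { q += 1; r -= y; }" above make the result exact.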
4511 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4513 // Return lo, hi of result
4515 // %cvt.lo = G_UITOFP Val.lo
4516 // %cvt.hi = G_UITOFP Val.hi
4517 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4519 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4521 // %trunc = G_INTRINSIC_TRUNC %mul2
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4525 Register Val) {
4526 const LLT S32 = LLT::scalar(32);
4527 auto Unmerge = B.buildUnmerge(S32, Val);
4529 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4530 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4532 auto Mad = B.buildFMAD(
4533 S32, CvtHi, // 2**32
4534 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4536 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4537 auto Mul1 = B.buildFMul(
4538 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4540 // 2**(-32)
4541 auto Mul2 = B.buildFMul(
4542 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4543 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4545 // -(2**32)
4546 auto Mad2 = B.buildFMAD(
4547 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4548 Mul1);
4550 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4551 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4553 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4556 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4557 Register DstDivReg,
4558 Register DstRemReg,
4559 Register Numer,
4560 Register Denom) const {
4561 const LLT S32 = LLT::scalar(32);
4562 const LLT S64 = LLT::scalar(64);
4563 const LLT S1 = LLT::scalar(1);
4564 Register RcpLo, RcpHi;
4566 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4568 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4570 auto Zero64 = B.buildConstant(S64, 0);
4571 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4573 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4574 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4576 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4577 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4578 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4580 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4581 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4582 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4584 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4585 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4586 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4587 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4588 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4590 auto Zero32 = B.buildConstant(S32, 0);
4591 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4592 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4593 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4595 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4596 Register NumerLo = UnmergeNumer.getReg(0);
4597 Register NumerHi = UnmergeNumer.getReg(1);
4599 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4600 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4601 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4602 Register Mul3_Lo = UnmergeMul3.getReg(0);
4603 Register Mul3_Hi = UnmergeMul3.getReg(1);
4604 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4605 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4606 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4607 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4609 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4610 Register DenomLo = UnmergeDenom.getReg(0);
4611 Register DenomHi = UnmergeDenom.getReg(1);
4613 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4614 auto C1 = B.buildSExt(S32, CmpHi);
4616 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4617 auto C2 = B.buildSExt(S32, CmpLo);
4619 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4620 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4622 // TODO: Here and below, portions of the code could be enclosed in if/endif.
4623 // Currently the control flow is unconditional and we have 4 selects after the
4624 // potential endif to substitute for PHIs.
4626 // if C3 != 0 ...
4627 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4628 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4629 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4630 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4632 auto One64 = B.buildConstant(S64, 1);
4633 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4635 auto C4 =
4636 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4637 auto C5 =
4638 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4639 auto C6 = B.buildSelect(
4640 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4642 // if (C6 != 0)
4643 auto Add4 = B.buildAdd(S64, Add3, One64);
4644 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4646 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4647 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4648 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4650 // endif C6
4651 // endif C3
4653 if (DstDivReg) {
4654 auto Sel1 = B.buildSelect(
4655 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4656 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4657 Sel1, MulHi3);
4660 if (DstRemReg) {
4661 auto Sel2 = B.buildSelect(
4662 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4663 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4664 Sel2, Sub1);
4668 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4669 MachineRegisterInfo &MRI,
4670 MachineIRBuilder &B) const {
4671 Register DstDivReg, DstRemReg;
4672 switch (MI.getOpcode()) {
4673 default:
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_UDIV: {
4676 DstDivReg = MI.getOperand(0).getReg();
4677 break;
4679 case AMDGPU::G_UREM: {
4680 DstRemReg = MI.getOperand(0).getReg();
4681 break;
4683 case AMDGPU::G_UDIVREM: {
4684 DstDivReg = MI.getOperand(0).getReg();
4685 DstRemReg = MI.getOperand(1).getReg();
4686 break;
4690 const LLT S64 = LLT::scalar(64);
4691 const LLT S32 = LLT::scalar(32);
4692 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4694 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4695 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4697 if (Ty == S32)
4698 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4699 else if (Ty == S64)
4700 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4701 else
4702 return false;
4704 MI.eraseFromParent();
4705 return true;
4708 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4709 MachineRegisterInfo &MRI,
4710 MachineIRBuilder &B) const {
4711 const LLT S64 = LLT::scalar(64);
4712 const LLT S32 = LLT::scalar(32);
4714 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4715 if (Ty != S32 && Ty != S64)
4716 return false;
4718 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4719 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4720 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4722 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4723 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4724 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4726 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4727 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4729 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4730 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4732 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4733 switch (MI.getOpcode()) {
4734 default:
4735 llvm_unreachable("Unexpected opcode!");
4736 case AMDGPU::G_SDIV: {
4737 DstDivReg = MI.getOperand(0).getReg();
4738 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4739 break;
4741 case AMDGPU::G_SREM: {
4742 DstRemReg = MI.getOperand(0).getReg();
4743 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4744 break;
4746 case AMDGPU::G_SDIVREM: {
4747 DstDivReg = MI.getOperand(0).getReg();
4748 DstRemReg = MI.getOperand(1).getReg();
4749 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4750 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4751 break;
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4757 else
4758 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4760 if (DstDivReg) {
4761 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4762 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4763 B.buildSub(DstDivReg, SignXor, Sign);
4766 if (DstRemReg) {
4767 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4768 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4769 B.buildSub(DstRemReg, SignXor, Sign);
4772 MI.eraseFromParent();
4773 return true;
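// The signed variants reuse the unsigned expansion via the usual sign
// tricks: with s = x >> (bits - 1) (all ones when x is negative),
//   |x| = (x + s) ^ s
// and after the unsigned divide,
//   sdiv = (q ^ (sL ^ sR)) - (sL ^ sR)   // quotient sign is sign(L)^sign(R)
//   srem = (r ^ sL) - sL                 // remainder takes the sign of L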
4776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4777 MachineRegisterInfo &MRI,
4778 MachineIRBuilder &B) const {
4779 Register Res = MI.getOperand(0).getReg();
4780 Register LHS = MI.getOperand(1).getReg();
4781 Register RHS = MI.getOperand(2).getReg();
4782 uint16_t Flags = MI.getFlags();
4783 LLT ResTy = MRI.getType(Res);
4785 const MachineFunction &MF = B.getMF();
4786 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4787 MF.getTarget().Options.UnsafeFPMath;
4789 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4790 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4791 return false;
4793 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4794 // the CI documentation, have a worst case error of 1 ulp.
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796 // use it as long as we aren't trying to use denormals.
4798 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst case error of 0.51 ulp.
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(1.0)) {
4802 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4803 .addUse(RHS)
4804 .setMIFlags(Flags);
4806 MI.eraseFromParent();
4807 return true;
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(-1.0)) {
4812 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4813 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4814 .addUse(FNeg.getReg(0))
4815 .setMIFlags(Flags);
4817 MI.eraseFromParent();
4818 return true;
4822 // For f16 require afn or arcp.
4823 // For f32 require afn.
4824 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4825 !MI.getFlag(MachineInstr::FmArcp)))
4826 return false;
4828 // x / y -> x * (1.0 / y)
4829 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4830 .addUse(RHS)
4831 .setMIFlags(Flags);
4832 B.buildFMul(Res, LHS, RCP, Flags);
4834 MI.eraseFromParent();
4835 return true;
4838 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839 MachineRegisterInfo &MRI,
4840 MachineIRBuilder &B) const {
4841 Register Res = MI.getOperand(0).getReg();
4842 Register X = MI.getOperand(1).getReg();
4843 Register Y = MI.getOperand(2).getReg();
4844 uint16_t Flags = MI.getFlags();
4845 LLT ResTy = MRI.getType(Res);
4847 const MachineFunction &MF = B.getMF();
4848 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849 MI.getFlag(MachineInstr::FmAfn);
4851 if (!AllowInaccurateRcp)
4852 return false;
4854 auto NegY = B.buildFNeg(ResTy, Y);
4855 auto One = B.buildFConstant(ResTy, 1.0);
4857 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4858 .addUse(Y)
4859 .setMIFlags(Flags);
4861 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4862 R = B.buildFMA(ResTy, Tmp0, R, R);
4864 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4865 R = B.buildFMA(ResTy, Tmp1, R, R);
4867 auto Ret = B.buildFMul(ResTy, X, R);
4868 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4870 B.buildFMA(Res, Tmp2, R, Ret);
4871 MI.eraseFromParent();
4872 return true;
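// The afn/unsafe f64 path is plain Newton-Raphson on the reciprocal:
//   r = rcp(y)
//   r = fma(fma(-y, r, 1.0), r, r)      // applied twice
//   q = x * r
//   result = fma(fma(-y, q, x), r, q)   // one residual correction on q
// No div_scale/div_fixup is emitted, so edge cases and denormals are only as
// accurate as the raw reciprocal allows.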
4875 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4876 MachineRegisterInfo &MRI,
4877 MachineIRBuilder &B) const {
4878 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4879 return true;
4881 Register Res = MI.getOperand(0).getReg();
4882 Register LHS = MI.getOperand(1).getReg();
4883 Register RHS = MI.getOperand(2).getReg();
4885 uint16_t Flags = MI.getFlags();
4887 LLT S16 = LLT::scalar(16);
4888 LLT S32 = LLT::scalar(32);
4890 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4891 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4893 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4894 .addUse(RHSExt.getReg(0))
4895 .setMIFlags(Flags);
4897 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4898 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4900 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4901 .addUse(RDst.getReg(0))
4902 .addUse(RHS)
4903 .addUse(LHS)
4904 .setMIFlags(Flags);
4906 MI.eraseFromParent();
4907 return true;
4910 static constexpr unsigned SPDenormModeBitField =
4911 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4913 // Enable or disable FP32 denorm mode: when 'Enable' is true, emit instructions
4914 // that enable denorm mode; otherwise, emit instructions that disable it.
4915 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4916 const GCNSubtarget &ST,
4917 SIModeRegisterDefaults Mode) {
4918 // Set SP denorm mode to this value.
4919 unsigned SPDenormMode =
4920 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4922 if (ST.hasDenormModeInst()) {
4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4926 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4927 B.buildInstr(AMDGPU::S_DENORM_MODE)
4928 .addImm(NewDenormModeValue);
4930 } else {
4931 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4932 .addImm(SPDenormMode)
4933 .addImm(SPDenormModeBitField);
4937 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4938 MachineRegisterInfo &MRI,
4939 MachineIRBuilder &B) const {
4940 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4941 return true;
4943 Register Res = MI.getOperand(0).getReg();
4944 Register LHS = MI.getOperand(1).getReg();
4945 Register RHS = MI.getOperand(2).getReg();
4946 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4947 SIModeRegisterDefaults Mode = MFI->getMode();
4949 uint16_t Flags = MI.getFlags();
4951 LLT S32 = LLT::scalar(32);
4952 LLT S1 = LLT::scalar(1);
4954 auto One = B.buildFConstant(S32, 1.0f);
4956 auto DenominatorScaled =
4957 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4958 .addUse(LHS)
4959 .addUse(RHS)
4960 .addImm(0)
4961 .setMIFlags(Flags);
4962 auto NumeratorScaled =
4963 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4964 .addUse(LHS)
4965 .addUse(RHS)
4966 .addImm(1)
4967 .setMIFlags(Flags);
4969 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4970 .addUse(DenominatorScaled.getReg(0))
4971 .setMIFlags(Flags);
4972 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4974 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4975 const bool HasDynamicDenormals =
4976 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4977 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4979 Register SavedSPDenormMode;
4980 if (!PreservesDenormals) {
4981 if (HasDynamicDenormals) {
4982 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4983 B.buildInstr(AMDGPU::S_GETREG_B32)
4984 .addDef(SavedSPDenormMode)
4985 .addImm(SPDenormModeBitField);
4987 toggleSPDenormMode(true, B, ST, Mode);
4990 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4991 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4992 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4993 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4994 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4995 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4997 if (!PreservesDenormals) {
4998 if (HasDynamicDenormals) {
4999 assert(SavedSPDenormMode);
5000 B.buildInstr(AMDGPU::S_SETREG_B32)
5001 .addReg(SavedSPDenormMode)
5002 .addImm(SPDenormModeBitField);
5003 } else
5004 toggleSPDenormMode(false, B, ST, Mode);
5007 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5008 .addUse(Fma4.getReg(0))
5009 .addUse(Fma1.getReg(0))
5010 .addUse(Fma3.getReg(0))
5011 .addUse(NumeratorScaled.getReg(1))
5012 .setMIFlags(Flags);
5014 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5015 .addUse(Fmas.getReg(0))
5016 .addUse(RHS)
5017 .addUse(LHS)
5018 .setMIFlags(Flags);
5020 MI.eraseFromParent();
5021 return true;
5024 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5025 MachineRegisterInfo &MRI,
5026 MachineIRBuilder &B) const {
5027 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5028 return true;
5030 Register Res = MI.getOperand(0).getReg();
5031 Register LHS = MI.getOperand(1).getReg();
5032 Register RHS = MI.getOperand(2).getReg();
5034 uint16_t Flags = MI.getFlags();
5036 LLT S64 = LLT::scalar(64);
5037 LLT S1 = LLT::scalar(1);
5039 auto One = B.buildFConstant(S64, 1.0);
5041 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5042 .addUse(LHS)
5043 .addUse(RHS)
5044 .addImm(0)
5045 .setMIFlags(Flags);
5047 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5049 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5050 .addUse(DivScale0.getReg(0))
5051 .setMIFlags(Flags);
5053 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5054 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5055 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5057 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5058 .addUse(LHS)
5059 .addUse(RHS)
5060 .addImm(1)
5061 .setMIFlags(Flags);
5063 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5064 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5065 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5067 Register Scale;
5068 if (!ST.hasUsableDivScaleConditionOutput()) {
5069 // Workaround a hardware bug on SI where the condition output from div_scale
5070 // is not usable.
5072 LLT S32 = LLT::scalar(32);
5074 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5075 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5076 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5077 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5079 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5080 Scale1Unmerge.getReg(1));
5081 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5082 Scale0Unmerge.getReg(1));
5083 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5084 } else {
5085 Scale = DivScale1.getReg(1);
5088 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5089 .addUse(Fma4.getReg(0))
5090 .addUse(Fma3.getReg(0))
5091 .addUse(Mul.getReg(0))
5092 .addUse(Scale)
5093 .setMIFlags(Flags);
5095 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5096 .addUse(Fmas.getReg(0))
5097 .addUse(RHS)
5098 .addUse(LHS)
5099 .setMIFlags(Flags);
5101 MI.eraseFromParent();
5102 return true;
5105 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5106 MachineRegisterInfo &MRI,
5107 MachineIRBuilder &B) const {
5108 Register Res0 = MI.getOperand(0).getReg();
5109 Register Res1 = MI.getOperand(1).getReg();
5110 Register Val = MI.getOperand(2).getReg();
5111 uint16_t Flags = MI.getFlags();
5113 LLT Ty = MRI.getType(Res0);
5114 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5116 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5117 .addUse(Val)
5118 .setMIFlags(Flags);
5119 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5120 .addUse(Val)
5121 .setMIFlags(Flags);
5123 if (ST.hasFractBug()) {
5124 auto Fabs = B.buildFAbs(Ty, Val);
5125 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5126 auto IsFinite =
5127 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5128 auto Zero = B.buildConstant(InstrExpTy, 0);
5129 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5130 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5133 B.buildCopy(Res0, Mant);
5134 B.buildSExtOrTrunc(Res1, Exp);
5136 MI.eraseFromParent();
5137 return true;
5140 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141 MachineRegisterInfo &MRI,
5142 MachineIRBuilder &B) const {
5143 Register Res = MI.getOperand(0).getReg();
5144 Register LHS = MI.getOperand(2).getReg();
5145 Register RHS = MI.getOperand(3).getReg();
5146 uint16_t Flags = MI.getFlags();
5148 LLT S32 = LLT::scalar(32);
5149 LLT S1 = LLT::scalar(1);
5151 auto Abs = B.buildFAbs(S32, RHS, Flags);
5152 const APFloat C0Val(1.0f);
5154 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5155 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5156 auto C2 = B.buildFConstant(S32, 1.0f);
5158 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5159 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5161 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5164 .addUse(Mul0.getReg(0))
5165 .setMIFlags(Flags);
5167 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5169 B.buildFMul(Res, Sel, Mul1, Flags);
5171 MI.eraseFromParent();
5172 return true;
5175 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 // Bypass the correct expansion that a standard promotion through G_FSQRT
5179 // would get. The f32 op is accurate enough for the f16 case.
5180 unsigned Flags = MI.getFlags();
5181 assert(!ST.has16BitInsts());
5182 const LLT F32 = LLT::scalar(32);
5183 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5184 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5185 .addUse(Ext.getReg(0))
5186 .setMIFlags(Flags);
5187 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5188 MI.eraseFromParent();
5189 return true;
5192 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5193 MachineRegisterInfo &MRI,
5194 MachineIRBuilder &B) const {
5195 MachineFunction &MF = B.getMF();
5196 Register Dst = MI.getOperand(0).getReg();
5197 Register X = MI.getOperand(1).getReg();
5198 const unsigned Flags = MI.getFlags();
5199 const LLT S1 = LLT::scalar(1);
5200 const LLT F32 = LLT::scalar(32);
5201 const LLT I32 = LLT::scalar(32);
5203 if (allowApproxFunc(MF, Flags)) {
5204 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5205 .addUse(X)
5206 .setMIFlags(Flags);
5207 MI.eraseFromParent();
5208 return true;
5211 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5212 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5213 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5214 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5215 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5217 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5218 if (needsDenormHandlingF32(MF, X, Flags)) {
5219 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5220 .addUse(SqrtX.getReg(0))
5221 .setMIFlags(Flags);
5223 auto NegOne = B.buildConstant(I32, -1);
5224 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5226 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5227 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5229 auto PosOne = B.buildConstant(I32, 1);
5230 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5232 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5233 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5235 auto Zero = B.buildFConstant(F32, 0.0f);
5236 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5238 SqrtS =
5239 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5241 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5242 SqrtS =
5243 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5244 } else {
5245 auto SqrtR =
5246 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5247 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5249 auto Half = B.buildFConstant(F32, 0.5f);
5250 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5251 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5252 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5253 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5254 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5255 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5256 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5257 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5260 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5262 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5264 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5266 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5267 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5269 MI.eraseFromParent();
5270 return true;
5273 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5274 MachineRegisterInfo &MRI,
5275 MachineIRBuilder &B) const {
5276 // For double type, the SQRT and RSQ instructions don't have required
5277 // precision, we apply Goldschmidt's algorithm to improve the result:
5279 // y0 = rsq(x)
5280 // g0 = x * y0
5281 // h0 = 0.5 * y0
5283 // r0 = 0.5 - h0 * g0
5284 // g1 = g0 * r0 + g0
5285 // h1 = h0 * r0 + h0
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5288 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5289 // h2 = h1 * r1 + h1
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5292 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5294 // sqrt(x) = g3
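//
// An equivalent scalar sketch (illustrative only; rsq() stands in for the
// hardware reciprocal square root, and the lines map onto the FMA sequence
// emitted below):
//   g = x * rsq(x);  h = 0.5 * rsq(x);
//   r = 0.5 - h * g;  g = g * r + g;  h = h * r + h;   // g1, h1
//   d = x - g * g;    g = d * h + g;                   // g2
//   d = x - g * g;    sqrt(x) ~= d * h + g;            // g3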
5296 const LLT S1 = LLT::scalar(1);
5297 const LLT S32 = LLT::scalar(32);
5298 const LLT F64 = LLT::scalar(64);
5300 Register Dst = MI.getOperand(0).getReg();
5301 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5303 Register X = MI.getOperand(1).getReg();
5304 unsigned Flags = MI.getFlags();
5306 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5308 auto ZeroInt = B.buildConstant(S32, 0);
5309 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5311 // Scale up input if it is too small.
5312 auto ScaleUpFactor = B.buildConstant(S32, 256);
5313 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5314 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5316 auto SqrtY =
5317 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5319 auto Half = B.buildFConstant(F64, 0.5);
5320 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5321 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5323 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5324 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5326 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5327 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5329 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5330 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5332 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5334 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5335 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5337 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5339 // Scale down the result.
5340 auto ScaleDownFactor = B.buildConstant(S32, -128);
5341 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5342 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5344 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5347 // TODO: Check for DAZ and expand to subnormals
5348 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5350 // If x is +INF, +0, or -0, use its original value
5351 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5353 MI.eraseFromParent();
5354 return true;
5357 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5358 MachineRegisterInfo &MRI,
5359 MachineIRBuilder &B) const {
5360 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5361 if (Ty == LLT::scalar(32))
5362 return legalizeFSQRTF32(MI, MRI, B);
5363 if (Ty == LLT::scalar(64))
5364 return legalizeFSQRTF64(MI, MRI, B);
5365 if (Ty == LLT::scalar(16))
5366 return legalizeFSQRTF16(MI, MRI, B);
5367 return false;
5370 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371 // FIXME: Why do we handle this one but not other removed instructions?
5373 // Reciprocal square root. The clamp prevents infinite results, clamping
5374 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5375 // +-max_float.
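// For example (illustrative, IEEE mode): the sequence below amounts to
//   fmaxnum_ieee(fminnum_ieee(rsq(x), +max_float), -max_float)
// so rsq(+0) == +inf clamps to +max_float and rsq(-0) == -inf clamps to
// -max_float.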
5376 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377 MachineRegisterInfo &MRI,
5378 MachineIRBuilder &B) const {
5379 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380 return true;
5382 Register Dst = MI.getOperand(0).getReg();
5383 Register Src = MI.getOperand(2).getReg();
5384 auto Flags = MI.getFlags();
5386 LLT Ty = MRI.getType(Dst);
5388 const fltSemantics *FltSemantics;
5389 if (Ty == LLT::scalar(32))
5390 FltSemantics = &APFloat::IEEEsingle();
5391 else if (Ty == LLT::scalar(64))
5392 FltSemantics = &APFloat::IEEEdouble();
5393 else
5394 return false;
5396 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5397 .addUse(Src)
5398 .setMIFlags(Flags);
5400 // We don't need to concern ourselves with the snan handling difference; the
5401 // rsq already quieted any snan (or not), so use the variant that will directly select.
5402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403 const bool UseIEEE = MFI->getMode().IEEE;
5405 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5406 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5407 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5409 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5411 if (UseIEEE)
5412 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5413 else
5414 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5415 MI.eraseFromParent();
5416 return true;
5419 // TODO: Fix pointer type handling
5420 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421 MachineInstr &MI,
5422 Intrinsic::ID IID) const {
5424 MachineIRBuilder &B = Helper.MIRBuilder;
5425 MachineRegisterInfo &MRI = *B.getMRI();
5427 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428 IID == Intrinsic::amdgcn_permlanex16;
5430 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431 Register Src2, LLT VT) -> Register {
5432 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5433 switch (IID) {
5434 case Intrinsic::amdgcn_readfirstlane:
5435 case Intrinsic::amdgcn_permlane64:
5436 return LaneOp.getReg(0);
5437 case Intrinsic::amdgcn_readlane:
5438 return LaneOp.addUse(Src1).getReg(0);
5439 case Intrinsic::amdgcn_writelane:
5440 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5441 case Intrinsic::amdgcn_permlane16:
5442 case Intrinsic::amdgcn_permlanex16: {
5443 Register Src3 = MI.getOperand(5).getReg();
5444 int64_t Src4 = MI.getOperand(6).getImm();
5445 int64_t Src5 = MI.getOperand(7).getImm();
5446 return LaneOp.addUse(Src1)
5447 .addUse(Src2)
5448 .addUse(Src3)
5449 .addImm(Src4)
5450 .addImm(Src5)
5451 .getReg(0);
5453 default:
5454 llvm_unreachable("unhandled lane op");
5458 Register DstReg = MI.getOperand(0).getReg();
5459 Register Src0 = MI.getOperand(2).getReg();
5460 Register Src1, Src2;
5461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462 IsPermLane16) {
5463 Src1 = MI.getOperand(3).getReg();
5464 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465 Src2 = MI.getOperand(4).getReg();
5469 LLT Ty = MRI.getType(DstReg);
5470 unsigned Size = Ty.getSizeInBits();
5472 if (Size == 32) {
5473 // Already legal
5474 return true;
5477 if (Size < 32) {
5478 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5480 if (IsPermLane16)
5481 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5483 if (IID == Intrinsic::amdgcn_writelane)
5484 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5486 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487 B.buildTrunc(DstReg, LaneOpDst);
5488 MI.eraseFromParent();
5489 return true;
5492 if (Size % 32 != 0)
5493 return false;
5495 LLT PartialResTy = S32;
5496 if (Ty.isVector()) {
5497 LLT EltTy = Ty.getElementType();
5498 switch (EltTy.getSizeInBits()) {
5499 case 16:
5500 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5501 break;
5502 case 32:
5503 PartialResTy = EltTy;
5504 break;
5505 default:
5506 // Handle all other cases via S32 pieces.
5507 break;
5511 SmallVector<Register, 2> PartialRes;
5512 unsigned NumParts = Size / 32;
5513 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5514 MachineInstrBuilder Src1Parts, Src2Parts;
5516 if (IsPermLane16)
5517 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5519 if (IID == Intrinsic::amdgcn_writelane)
5520 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5522 for (unsigned i = 0; i < NumParts; ++i) {
5523 Src0 = Src0Parts.getReg(i);
5525 if (IsPermLane16)
5526 Src1 = Src1Parts.getReg(i);
5528 if (IID == Intrinsic::amdgcn_writelane)
5529 Src2 = Src2Parts.getReg(i);
5531 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5534 B.buildMergeLikeInstr(DstReg, PartialRes);
5535 MI.eraseFromParent();
5536 return true;
5539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540 MachineRegisterInfo &MRI,
5541 MachineIRBuilder &B) const {
5542 uint64_t Offset =
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5544 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5545 LLT DstTy = MRI.getType(DstReg);
5546 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5548 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5549 if (!loadInputValue(KernargPtrReg, B,
5550 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551 return false;
5553 // FIXME: This should be nuw
5554 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5555 return true;
5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5559 /// bits of the pointer and replace them with the stride argument, then
5560 /// merge_values everything together. In the common case of a raw buffer (the
5561 /// stride component is 0), we can just AND off the upper half.
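/// Illustrative word layout for the common raw-buffer case (stride == 0), with
/// the 64-bit pointer split as {lo, hi}:
///   word0 = lo, word1 = hi & 0xffff, word2 = NumRecords, word3 = Flags
/// A non-zero stride is instead shifted left by 16 and OR'd into word1.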
5562 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5564 Register Result = MI.getOperand(0).getReg();
5565 Register Pointer = MI.getOperand(2).getReg();
5566 Register Stride = MI.getOperand(3).getReg();
5567 Register NumRecords = MI.getOperand(4).getReg();
5568 Register Flags = MI.getOperand(5).getReg();
5570 LLT S32 = LLT::scalar(32);
5572 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5573 auto Unmerge = B.buildUnmerge(S32, Pointer);
5574 Register LowHalf = Unmerge.getReg(0);
5575 Register HighHalf = Unmerge.getReg(1);
5577 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5578 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5580 MachineInstrBuilder NewHighHalf = Masked;
5581 std::optional<ValueAndVReg> StrideConst =
5582 getIConstantVRegValWithLookThrough(Stride, MRI);
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5584 MachineInstrBuilder ShiftedStride;
5585 if (StrideConst) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5587 uint32_t ShiftedStrideVal = StrideVal << 16;
5588 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5589 } else {
5590 auto ExtStride = B.buildAnyExt(S32, Stride);
5591 auto ShiftConst = B.buildConstant(S32, 16);
5592 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5594 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5596 Register NewHighHalfReg = NewHighHalf.getReg(0);
5597 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5598 MI.eraseFromParent();
5599 return true;
5602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5603 MachineRegisterInfo &MRI,
5604 MachineIRBuilder &B) const {
5605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5606 if (!MFI->isEntryFunction()) {
5607 return legalizePreloadedArgIntrin(MI, MRI, B,
5608 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5611 Register DstReg = MI.getOperand(0).getReg();
5612 if (!getImplicitArgPtr(DstReg, MRI, B))
5613 return false;
5615 MI.eraseFromParent();
5616 return true;
5619 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5620 MachineRegisterInfo &MRI,
5621 MachineIRBuilder &B) const {
5622 Function &F = B.getMF().getFunction();
5623 std::optional<uint32_t> KnownSize =
5624 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625 if (KnownSize.has_value())
5626 B.buildConstant(DstReg, *KnownSize);
5627 return false;
5630 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631 MachineRegisterInfo &MRI,
5632 MachineIRBuilder &B) const {
5634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635 if (!MFI->isEntryFunction()) {
5636 return legalizePreloadedArgIntrin(MI, MRI, B,
5637 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5640 Register DstReg = MI.getOperand(0).getReg();
5641 if (!getLDSKernelId(DstReg, MRI, B))
5642 return false;
5644 MI.eraseFromParent();
5645 return true;
5648 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649 MachineRegisterInfo &MRI,
5650 MachineIRBuilder &B,
5651 unsigned AddrSpace) const {
5652 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5653 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5654 Register Hi32 = Unmerge.getReg(1);
5656 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5657 MI.eraseFromParent();
5658 return true;
5661 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662 // offset (the offset that is included in bounds checking and swizzling, to be
5663 // split between the instruction's voffset and immoffset fields) and soffset
5664 // (the offset that is excluded from bounds checking and swizzling, to go in
5665 // the instruction's soffset field). This function takes the first kind of
5666 // offset and figures out how to split it between voffset and immoffset.
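// A worked split (illustrative; assumes a 12-bit field, i.e. MaxImm == 4095):
// an incoming offset of 5000 yields Overflow = 5000 & ~4095 == 4096 carried in
// the voffset register and ImmOffset = 5000 - 4096 == 904 in the immediate, so
// the power-of-2 add of 4096 has a better chance of being CSEd across nearby
// buffer accesses.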
5667 std::pair<Register, unsigned>
5668 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669 Register OrigOffset) const {
5670 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671 Register BaseReg;
5672 unsigned ImmOffset;
5673 const LLT S32 = LLT::scalar(32);
5674 MachineRegisterInfo &MRI = *B.getMRI();
5676 std::tie(BaseReg, ImmOffset) =
5677 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5679 // If BaseReg is a pointer, convert it to int.
5680 if (MRI.getType(BaseReg).isPointer())
5681 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5683 // If the immediate value is too big for the immoffset field, put only bits
5684 // that would normally fit in the immoffset field. The remaining value that
5685 // is copied/added for the voffset field is a large power of 2, and it
5686 // stands more chance of being CSEd with the copy/add for another similar
5687 // load/store.
5688 // However, do not do that rounding down if that is a negative
5689 // number, as it appears to be illegal to have a negative offset in the
5690 // vgpr, even if adding the immediate offset makes it positive.
5691 unsigned Overflow = ImmOffset & ~MaxImm;
5692 ImmOffset -= Overflow;
5693 if ((int32_t)Overflow < 0) {
5694 Overflow += ImmOffset;
5695 ImmOffset = 0;
5698 if (Overflow != 0) {
5699 if (!BaseReg) {
5700 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5701 } else {
5702 auto OverflowVal = B.buildConstant(S32, Overflow);
5703 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5707 if (!BaseReg)
5708 BaseReg = B.buildConstant(S32, 0).getReg(0);
5710 return std::pair(BaseReg, ImmOffset);
5713 /// Handle register layout difference for f16 images for some subtargets.
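/// For example (illustrative): on an unpacked-D16 subtarget a <4 x s16> source
/// becomes <4 x s32> with each element any-extended, while packed subtargets
/// only pad a <3 x s16> source out to <4 x s16>.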
5714 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5715 MachineRegisterInfo &MRI,
5716 Register Reg,
5717 bool ImageStore) const {
5718 const LLT S16 = LLT::scalar(16);
5719 const LLT S32 = LLT::scalar(32);
5720 LLT StoreVT = MRI.getType(Reg);
5721 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5723 if (ST.hasUnpackedD16VMem()) {
5724 auto Unmerge = B.buildUnmerge(S16, Reg);
5726 SmallVector<Register, 4> WideRegs;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5728 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5730 int NumElts = StoreVT.getNumElements();
5732 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5733 .getReg(0);
5736 if (ImageStore && ST.hasImageStoreD16Bug()) {
5737 if (StoreVT.getNumElements() == 2) {
5738 SmallVector<Register, 4> PackedRegs;
5739 Reg = B.buildBitcast(S32, Reg).getReg(0);
5740 PackedRegs.push_back(Reg);
5741 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5742 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5743 .getReg(0);
5746 if (StoreVT.getNumElements() == 3) {
5747 SmallVector<Register, 4> PackedRegs;
5748 auto Unmerge = B.buildUnmerge(S16, Reg);
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5750 PackedRegs.push_back(Unmerge.getReg(I));
5751 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5752 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5753 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5756 if (StoreVT.getNumElements() == 4) {
5757 SmallVector<Register, 4> PackedRegs;
5758 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5759 auto Unmerge = B.buildUnmerge(S32, Reg);
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5761 PackedRegs.push_back(Unmerge.getReg(I));
5762 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5763 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5764 .getReg(0);
5767 llvm_unreachable("invalid data type");
5770 if (StoreVT == LLT::fixed_vector(3, S16)) {
5771 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5772 .getReg(0);
5774 return Reg;
5777 Register AMDGPULegalizerInfo::fixStoreSourceType(
5778 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5779 MachineRegisterInfo *MRI = B.getMRI();
5780 LLT Ty = MRI->getType(VData);
5782 const LLT S16 = LLT::scalar(16);
5784 // Fixup buffer resources themselves needing to be v4i128.
5785 if (hasBufferRsrcWorkaround(Ty))
5786 return castBufferRsrcToV4I32(VData, B);
5788 // Fixup illegal register types for i8 stores.
5789 if (Ty == LLT::scalar(8) || Ty == S16) {
5790 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5791 return AnyExt;
5794 if (Ty.isVector()) {
5795 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5796 if (IsFormat)
5797 return handleD16VData(B, *MRI, VData);
5801 return VData;
5804 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5805 MachineRegisterInfo &MRI,
5806 MachineIRBuilder &B,
5807 bool IsTyped,
5808 bool IsFormat) const {
5809 Register VData = MI.getOperand(1).getReg();
5810 LLT Ty = MRI.getType(VData);
5811 LLT EltTy = Ty.getScalarType();
5812 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5813 const LLT S32 = LLT::scalar(32);
5815 VData = fixStoreSourceType(B, VData, IsFormat);
5816 castBufferRsrcArgToV4I32(MI, B, 2);
5817 Register RSrc = MI.getOperand(2).getReg();
5819 MachineMemOperand *MMO = *MI.memoperands_begin();
5820 const int MemSize = MMO->getSize().getValue();
5822 unsigned ImmOffset;
5824 // The typed intrinsics add an immediate after the registers.
5825 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5827 // The struct intrinsic variants add one additional operand over raw.
5828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5829 Register VIndex;
5830 int OpOffset = 0;
5831 if (HasVIndex) {
5832 VIndex = MI.getOperand(3).getReg();
5833 OpOffset = 1;
5834 } else {
5835 VIndex = B.buildConstant(S32, 0).getReg(0);
5838 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5839 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5841 unsigned Format = 0;
5842 if (IsTyped) {
5843 Format = MI.getOperand(5 + OpOffset).getImm();
5844 ++OpOffset;
5847 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5849 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5851 unsigned Opc;
5852 if (IsTyped) {
5853 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5854 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5855 } else if (IsFormat) {
5856 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5857 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5858 } else {
5859 switch (MemSize) {
5860 case 1:
5861 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5862 break;
5863 case 2:
5864 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5865 break;
5866 default:
5867 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5868 break;
5872 auto MIB = B.buildInstr(Opc)
5873 .addUse(VData) // vdata
5874 .addUse(RSrc) // rsrc
5875 .addUse(VIndex) // vindex
5876 .addUse(VOffset) // voffset
5877 .addUse(SOffset) // soffset
5878 .addImm(ImmOffset); // offset(imm)
5880 if (IsTyped)
5881 MIB.addImm(Format);
5883 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5884 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5885 .addMemOperand(MMO);
5887 MI.eraseFromParent();
5888 return true;
5891 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892 Register VIndex, Register VOffset, Register SOffset,
5893 unsigned ImmOffset, unsigned Format,
5894 unsigned AuxiliaryData, MachineMemOperand *MMO,
5895 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896 auto MIB = B.buildInstr(Opc)
5897 .addDef(LoadDstReg) // vdata
5898 .addUse(RSrc) // rsrc
5899 .addUse(VIndex) // vindex
5900 .addUse(VOffset) // voffset
5901 .addUse(SOffset) // soffset
5902 .addImm(ImmOffset); // offset(imm)
5904 if (IsTyped)
5905 MIB.addImm(Format);
5907 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5908 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5909 .addMemOperand(MMO);
5912 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5913 MachineRegisterInfo &MRI,
5914 MachineIRBuilder &B,
5915 bool IsFormat,
5916 bool IsTyped) const {
5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918 MachineMemOperand *MMO = *MI.memoperands_begin();
5919 const LLT MemTy = MMO->getMemoryType();
5920 const LLT S32 = LLT::scalar(32);
5922 Register Dst = MI.getOperand(0).getReg();
5924 Register StatusDst;
5925 int OpOffset = 0;
5926 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5927 bool IsTFE = MI.getNumExplicitDefs() == 2;
5928 if (IsTFE) {
5929 StatusDst = MI.getOperand(1).getReg();
5930 ++OpOffset;
5933 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5934 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5936 // The typed intrinsics add an immediate after the registers.
5937 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5939 // The struct intrinsic variants add one additional operand over raw.
5940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5941 Register VIndex;
5942 if (HasVIndex) {
5943 VIndex = MI.getOperand(3 + OpOffset).getReg();
5944 ++OpOffset;
5945 } else {
5946 VIndex = B.buildConstant(S32, 0).getReg(0);
5949 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5950 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5952 unsigned Format = 0;
5953 if (IsTyped) {
5954 Format = MI.getOperand(5 + OpOffset).getImm();
5955 ++OpOffset;
5958 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5959 unsigned ImmOffset;
5961 LLT Ty = MRI.getType(Dst);
5962 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5963 // logic doesn't have to handle that case.
5964 if (hasBufferRsrcWorkaround(Ty)) {
5965 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5966 Dst = MI.getOperand(0).getReg();
5968 LLT EltTy = Ty.getScalarType();
5969 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5970 const bool Unpacked = ST.hasUnpackedD16VMem();
5972 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5974 unsigned Opc;
5976 // TODO: Support TFE for typed and narrow loads.
5977 if (IsTyped) {
5978 if (IsTFE)
5979 return false;
5980 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5982 } else if (IsFormat) {
5983 if (IsD16) {
5984 if (IsTFE)
5985 return false;
5986 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5987 } else {
5988 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5991 } else {
5992 switch (MemTy.getSizeInBits()) {
5993 case 8:
5994 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5996 break;
5997 case 16:
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6000 break;
6001 default:
6002 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6004 break;
6008 if (IsTFE) {
6009 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6010 unsigned NumLoadDWords = NumValueDWords + 1;
6011 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6013 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6014 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6015 if (MemTy.getSizeInBits() < 32) {
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6017 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6018 B.buildTrunc(Dst, ExtDst);
6019 } else if (NumValueDWords == 1) {
6020 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6021 } else {
6022 SmallVector<Register, 5> LoadElts;
6023 for (unsigned I = 0; I != NumValueDWords; ++I)
6024 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6025 LoadElts.push_back(StatusDst);
6026 B.buildUnmerge(LoadElts, LoadDstReg);
6027 LoadElts.truncate(NumValueDWords);
6028 B.buildMergeLikeInstr(Dst, LoadElts);
6030 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6031 (IsD16 && !Ty.isVector())) {
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6033 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6034 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6035 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6036 B.buildTrunc(Dst, LoadDstReg);
6037 } else if (Unpacked && IsD16 && Ty.isVector()) {
6038 LLT UnpackedTy = Ty.changeElementSize(32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6040 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6041 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6042 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6043 // FIXME: G_TRUNC should work, but legalization currently fails
6044 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6045 SmallVector<Register, 4> Repack;
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6047 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6048 B.buildMergeLikeInstr(Dst, Repack);
6049 } else {
6050 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6051 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6054 MI.eraseFromParent();
6055 return true;
6058 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6059 switch (IntrID) {
6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6065 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6067 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6070 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6148 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149 MachineIRBuilder &B,
6150 Intrinsic::ID IID) const {
6151 const bool IsCmpSwap =
6152 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6157 Register Dst = MI.getOperand(0).getReg();
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159 // p8 arguments to the atomic itself.
6160 Register VData = MI.getOperand(2).getReg();
6162 Register CmpVal;
6163 int OpOffset = 0;
6165 if (IsCmpSwap) {
6166 CmpVal = MI.getOperand(3).getReg();
6167 ++OpOffset;
6170 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6171 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6172 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6174 // The struct intrinsic variants add one additional operand over raw.
6175 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176 Register VIndex;
6177 if (HasVIndex) {
6178 VIndex = MI.getOperand(4 + OpOffset).getReg();
6179 ++OpOffset;
6180 } else {
6181 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6184 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6185 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6186 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6188 MachineMemOperand *MMO = *MI.memoperands_begin();
6190 unsigned ImmOffset;
6191 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6193 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6194 .addDef(Dst)
6195 .addUse(VData); // vdata
6197 if (IsCmpSwap)
6198 MIB.addReg(CmpVal);
6200 MIB.addUse(RSrc) // rsrc
6201 .addUse(VIndex) // vindex
6202 .addUse(VOffset) // voffset
6203 .addUse(SOffset) // soffset
6204 .addImm(ImmOffset) // offset(imm)
6205 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6206 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6207 .addMemOperand(MMO);
6209 MI.eraseFromParent();
6210 return true;
6213 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214 /// vector with s16 typed elements.
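/// For example (illustrative): 16-bit coordinates (u, v) become one
/// <2 x s16> build_vector, and a trailing odd coordinate or gradient is padded
/// with undef in its high half.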
6215 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216 SmallVectorImpl<Register> &PackedAddrs,
6217 unsigned ArgOffset,
6218 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219 bool IsA16, bool IsG16) {
6220 const LLT S16 = LLT::scalar(16);
6221 const LLT V2S16 = LLT::fixed_vector(2, 16);
6222 auto EndIdx = Intr->VAddrEnd;
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6226 if (!SrcOp.isReg())
6227 continue; // _L to _LZ may have eliminated this.
6229 Register AddrReg = SrcOp.getReg();
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237 // Special handling of bias when A16 is on. Bias is of type half but
6238 // occupies a full 32-bit slot.
6239 PackedAddrs.push_back(
6240 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6241 .getReg(0));
6242 } else {
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244 "Bias needs to be converted to 16 bit in A16 mode");
6245 // Handle any gradient or coordinate operands that should not be packed
6246 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6247 PackedAddrs.push_back(AddrReg);
6249 } else {
6250 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251 // derivatives dx/dh and dx/dv are packed with undef.
6252 if (((I + 1) >= EndIdx) ||
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6258 // Check for _L to _LZ optimization
6259 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6260 PackedAddrs.push_back(
6261 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6262 .getReg(0));
6263 } else {
6264 PackedAddrs.push_back(
6265 B.buildBuildVector(
6266 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6267 .getReg(0));
6268 ++I;
6274 /// Convert from separate vaddr components to a single vector address register,
6275 /// and replace the remaining operands with $noreg.
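/// For example (illustrative): three s32 vaddr operands become a single
/// <3 x s32> build_vector in the first vaddr slot, and the remaining two
/// operands are set to $noreg.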
6276 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277 int DimIdx, int NumVAddrs) {
6278 const LLT S32 = LLT::scalar(32);
6279 (void)S32;
6280 SmallVector<Register, 8> AddrRegs;
6281 for (int I = 0; I != NumVAddrs; ++I) {
6282 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6283 if (SrcOp.isReg()) {
6284 AddrRegs.push_back(SrcOp.getReg());
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6289 int NumAddrRegs = AddrRegs.size();
6290 if (NumAddrRegs != 1) {
6291 auto VAddr =
6292 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6293 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6296 for (int I = 1; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6298 if (SrcOp.isReg())
6299 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6303 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308 /// registers.
6310 /// We don't want to directly select image instructions just yet, but also want
6311 /// to expose all register repacking to the legalizer/combiners. We also don't
6312 /// want a selected instruction entering RegBankSelect. In order to avoid
6313 /// defining a multitude of intermediate image instructions, directly hack on
6314 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6315 /// padding now unnecessary arguments with $noreg.
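/// One representative rewrite (illustrative): with A16 enabled, s16 address
/// components are packed pairwise into <2 x s16> registers, the freed vaddr
/// slots are filled with $noreg, and a d16 result that does not fill a whole
/// number of dwords is loaded into a rounded-up register and repacked
/// afterwards.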
6316 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6320 const MachineFunction &MF = *MI.getMF();
6321 const unsigned NumDefs = MI.getNumExplicitDefs();
6322 const unsigned ArgOffset = NumDefs + 1;
6323 bool IsTFE = NumDefs == 2;
6324 // We are only processing the operands of d16 image operations on subtargets
6325 // that use the unpacked register layout, or need to repack the TFE result.
6327 // TODO: Do we need to guard against already legalized intrinsics?
6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6331 MachineRegisterInfo *MRI = B.getMRI();
6332 const LLT S32 = LLT::scalar(32);
6333 const LLT S16 = LLT::scalar(16);
6334 const LLT V2S16 = LLT::fixed_vector(2, 16);
6336 unsigned DMask = 0;
6337 Register VData;
6338 LLT Ty;
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6342 Ty = MRI->getType(VData);
6345 const bool IsAtomicPacked16Bit =
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6349 // Check for 16 bit addresses and pack if true.
6350 LLT GradTy =
6351 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6352 LLT AddrTy =
6353 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6354 const bool IsG16 =
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6356 const bool IsA16 = AddrTy == S16;
6357 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6359 int DMaskLanes = 0;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6363 DMaskLanes = 4;
6364 } else if (DMask != 0) {
6365 DMaskLanes = llvm::popcount(DMask);
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6368 B.buildUndef(MI.getOperand(0));
6369 MI.eraseFromParent();
6370 return true;
6374 Observer.changingInstr(MI);
6375 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6377 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6378 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6379 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6380 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381 unsigned NewOpcode = LoadOpcode;
6382 if (BaseOpcode->Store)
6383 NewOpcode = StoreOpcode;
6384 else if (BaseOpcode->NoReturn)
6385 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6387 // Track that we legalized this
6388 MI.setDesc(B.getTII().get(NewOpcode));
6390 // Expecting to get an error flag since TFC is on and dmask is 0. Force
6391 // dmask to be at least 1, otherwise the instruction will fail.
6392 if (IsTFE && DMask == 0) {
6393 DMask = 0x1;
6394 DMaskLanes = 1;
6395 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6398 if (BaseOpcode->Atomic) {
6399 Register VData0 = MI.getOperand(2).getReg();
6400 LLT Ty = MRI->getType(VData0);
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6403 if (Ty.isVector() && !IsAtomicPacked16Bit)
6404 return false;
6406 if (BaseOpcode->AtomicX2) {
6407 Register VData1 = MI.getOperand(3).getReg();
6408 // The two values are packed in one register.
6409 LLT PackedTy = LLT::fixed_vector(2, Ty);
6410 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6411 MI.getOperand(2).setReg(Concat.getReg(0));
6412 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6418 // Rewrite the addressing register layout before doing anything else.
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420 // 16 bit gradients are supported, but are tied to the A16 control
6421 // so both gradients and addresses must be 16 bit
6422 return false;
6425 if (IsA16 && !ST.hasA16()) {
6426 // A16 not supported
6427 return false;
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6431 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6433 if (IsA16 || IsG16) {
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6435 // instructions expect VGPR_32
6436 SmallVector<Register, 4> PackedRegs;
6438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6440 // See also below in the non-a16 branch
6441 const bool UseNSA = ST.hasNSAEncoding() &&
6442 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6443 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6444 const bool UsePartialNSA =
6445 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6447 if (UsePartialNSA) {
6448 // Pack registers that would go over NSAMaxSize into last VAddr register
6449 LLT PackedAddrTy =
6450 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6451 auto Concat = B.buildConcatVectors(
6452 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6454 PackedRegs.resize(NSAMaxSize);
6455 } else if (!UseNSA && PackedRegs.size() > 1) {
6456 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6457 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6458 PackedRegs[0] = Concat.getReg(0);
6459 PackedRegs.resize(1);
6462 const unsigned NumPacked = PackedRegs.size();
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6465 if (!SrcOp.isReg()) {
6466 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6467 continue;
6470 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6474 else
6475 SrcOp.setReg(AMDGPU::NoRegister);
6477 } else {
6478 // If the register allocator cannot place the address registers contiguously
6479 // without introducing moves, then using the non-sequential address encoding
6480 // is always preferable, since it saves VALU instructions and is usually a
6481 // wash in terms of code size or even better.
6483 // However, we currently have no way of hinting to the register allocator
6484 // that MIMG addresses should be placed contiguously when it is possible to
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6488 // allocation when possible.
6490 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6491 // set of the remaining addresses.
6492 const bool UseNSA = ST.hasNSAEncoding() &&
6493 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6494 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6495 const bool UsePartialNSA =
6496 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6498 if (UsePartialNSA) {
6499 convertImageAddrToPacked(B, MI,
6500 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504 Intr->NumVAddrs);
6508 int Flags = 0;
6509 if (IsA16)
6510 Flags |= 1;
6511 if (IsG16)
6512 Flags |= 2;
6513 MI.addOperand(MachineOperand::CreateImm(Flags));
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6516 // TODO: Handle dmask trim
6517 if (!Ty.isVector() || !IsD16)
6518 return true;
6520 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6521 if (RepackedReg != VData) {
6522 MI.getOperand(1).setReg(RepackedReg);
6525 return true;
6528 Register DstReg = MI.getOperand(0).getReg();
6529 const LLT EltTy = Ty.getScalarType();
6530 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6532 // Confirm that the return type is large enough for the dmask specified
6533 if (NumElts < DMaskLanes)
6534 return false;
6536 if (NumElts > 4 || DMaskLanes > 4)
6537 return false;
6539 // Image atomic instructions use DMask to specify how many bits of
6540 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes for image atomic has default value '0'.
6542 // We must be sure that atomic variants (especially packed) will not be
6543 // truncated from v2s16 or v4s16 to s16 type.
6545 // ChangeElementCount will be needed for image load where Ty is always scalar.
6546 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547 const LLT AdjustedTy =
6548 DMaskLanes == 0
6549 ? Ty
6550 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6552 // The raw dword-aligned data component of the load. The only legal cases
6553 // where this matters should be when using the packed D16 format, for
6554 // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6555 LLT RoundedTy;
6557 // S32 vector to cover all data, plus TFE result element.
6558 LLT TFETy;
6560 // Register type to use for each loaded component. Will be S32 or V2S16.
6561 LLT RegTy;
6563 if (IsD16 && ST.hasUnpackedD16VMem()) {
6564 RoundedTy =
6565 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6566 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6567 RegTy = S32;
6568 } else {
6569 unsigned EltSize = EltTy.getSizeInBits();
6570 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6571 unsigned RoundedSize = 32 * RoundedElts;
6572 RoundedTy = LLT::scalarOrVector(
6573 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6574 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6575 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
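// For example, a <3 x s16> D16 result on a packed-D16 subtarget rounds up to
// RoundedTy = <4 x s16> (RoundedSize = 64), with TFETy = <3 x s32> and
// RegTy = v2s16 (or s32 when TFE is enabled).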
6578 // The return type does not need adjustment.
6579 // TODO: Should we change s16 case to s32 or <2 x s16>?
6580 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6581 return true;
6583 Register Dst1Reg;
6585 // Insert after the instruction.
6586 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6588 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6589 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6590 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6591 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6593 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6595 MI.getOperand(0).setReg(NewResultReg);
6597 // In the IR, TFE is supposed to be used with a 2 element struct return
6598 // type. The instruction really returns these two values in one contiguous
6599 // register, with one additional dword beyond the loaded data. Rewrite the
6600 // return type to use a single register result.
6602 if (IsTFE) {
6603 Dst1Reg = MI.getOperand(1).getReg();
6604 if (MRI->getType(Dst1Reg) != S32)
6605 return false;
6607 // TODO: Make sure the TFE operand bit is set.
6608 MI.removeOperand(1);
6610 // Handle the easy case that requires no repack instructions.
6611 if (Ty == S32) {
6612 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6613 return true;
6617 // Now figure out how to copy the new result register back into the old
6618 // result.
6619 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
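// Every slot is seeded with Dst1Reg so that, in the TFE case, the trailing
// status dword produced by the unmerge below lands directly in the TFE result
// register.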
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6623 if (ResultNumRegs == 1) {
6624 assert(!IsTFE);
6625 ResultRegs[0] = NewResultReg;
6626 } else {
6627 // We have to repack into a new vector of some kind.
6628 for (int I = 0; I != NumDataRegs; ++I)
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6630 B.buildUnmerge(ResultRegs, NewResultReg);
6632 // Drop the final TFE element to get the data part. The TFE result is
6633 // directly written to the right place already.
6634 if (IsTFE)
6635 ResultRegs.resize(NumDataRegs);
6638 // For an s16 scalar result, we form an s32 result with a truncate regardless
6639 // of packed vs. unpacked.
6640 if (IsD16 && !Ty.isVector()) {
6641 B.buildTrunc(DstReg, ResultRegs[0]);
6642 return true;
6645 // Avoid a build/concat_vector of 1 entry.
6646 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6647 B.buildBitcast(DstReg, ResultRegs[0]);
6648 return true;
6651 assert(Ty.isVector());
6653 if (IsD16) {
6654 // For packed D16 results with TFE enabled, all the data components are
6655 // S32. Cast back to the expected type.
6657 // TODO: We don't really need to load s32 elements. We would only need one
6658 // cast for the TFE result if a multiple of v2s16 was used.
6659 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6660 for (Register &Reg : ResultRegs)
6661 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6662 } else if (ST.hasUnpackedD16VMem()) {
6663 for (Register &Reg : ResultRegs)
6664 Reg = B.buildTrunc(S16, Reg).getReg(0);
6668 auto padWithUndef = [&](LLT Ty, int NumElts) {
6669 if (NumElts == 0)
6670 return;
6671 Register Undef = B.buildUndef(Ty).getReg(0);
6672 for (int I = 0; I != NumElts; ++I)
6673 ResultRegs.push_back(Undef);
6676 // Pad out any elements eliminated due to the dmask.
6677 LLT ResTy = MRI->getType(ResultRegs[0]);
6678 if (!ResTy.isVector()) {
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6680 B.buildBuildVector(DstReg, ResultRegs);
6681 return true;
6684 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6685 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
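// e.g. a <3 x s16> result still needs two packed v2s16 registers to cover its
// 48 bits of data.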
6687 // Deal with the one annoying legal case.
6688 const LLT V3S16 = LLT::fixed_vector(3, 16);
6689 if (Ty == V3S16) {
6690 if (IsTFE) {
6691 if (ResultRegs.size() == 1) {
6692 NewResultReg = ResultRegs[0];
6693 } else if (ResultRegs.size() == 2) {
6694 LLT V4S16 = LLT::fixed_vector(4, 16);
6695 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6696 } else {
6697 return false;
6701 if (MRI->getType(DstReg).getNumElements() <
6702 MRI->getType(NewResultReg).getNumElements()) {
6703 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6704 } else {
6705 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6707 return true;
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6711 B.buildConcatVectors(DstReg, ResultRegs);
6712 return true;
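// Lower amdgcn.s.buffer.load to a G_AMDGPU_S_BUFFER_LOAD* pseudo: pick the
// sub-dword variant where available, attach the memory operand the intrinsic
// lacks, and widen awkward result sizes to something the scalar unit can load.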
6715 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6716 MachineInstr &MI) const {
6717 MachineIRBuilder &B = Helper.MIRBuilder;
6718 GISelChangeObserver &Observer = Helper.Observer;
6720 Register OrigDst = MI.getOperand(0).getReg();
6721 Register Dst;
6722 LLT Ty = B.getMRI()->getType(OrigDst);
6723 unsigned Size = Ty.getSizeInBits();
6724 MachineFunction &MF = B.getMF();
6725 unsigned Opc = 0;
6726 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6727 assert(Size == 8 || Size == 16);
6728 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6730 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6731 // destination register.
6732 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6733 } else {
6734 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6735 Dst = OrigDst;
6738 Observer.changingInstr(MI);
6740 // Handle needing to s.buffer.load() a p8 value.
6741 if (hasBufferRsrcWorkaround(Ty)) {
6742 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6743 B.setInsertPt(B.getMBB(), MI);
6745 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6746 Ty = getBitcastRegisterType(Ty);
6747 Helper.bitcastDst(MI, Ty, 0);
6748 B.setInsertPt(B.getMBB(), MI);
6751 // FIXME: We don't really need this intermediate instruction. The intrinsic
6752 // should be fixed to have a memory operand. Since it's readnone, we're not
6753 // allowed to add one.
6754 MI.setDesc(B.getTII().get(Opc));
6755 MI.removeOperand(1); // Remove intrinsic ID
6757 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758 const unsigned MemSize = (Size + 7) / 8;
6759 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6760 getTypeForLLT(Ty, MF.getFunction().getContext()));
6761 MachineMemOperand *MMO = MF.getMachineMemOperand(
6762 MachinePointerInfo(),
6763 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6764 MachineMemOperand::MOInvariant,
6765 MemSize, MemAlign);
6766 MI.addMemOperand(MF, MMO);
6767 if (Dst != OrigDst) {
6768 MI.getOperand(0).setReg(Dst);
6769 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6770 B.buildTrunc(OrigDst, Dst);
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6775 // out this needs to be converted to a vector load during RegBankSelect.
6776 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6777 if (Ty.isVector())
6778 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6779 else
6780 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6783 Observer.changedInstr(MI);
6784 return true;
6787 // TODO: Move to selection
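// Trap lowering strategy: without an enabled AMDHSA trap handler the wave is
// simply terminated (s_endpgm); otherwise an s_trap is emitted, passing the
// queue pointer in SGPR0_SGPR1 on targets that cannot retrieve the doorbell ID
// themselves.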
6788 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6789 MachineRegisterInfo &MRI,
6790 MachineIRBuilder &B) const {
6791 if (!ST.isTrapHandlerEnabled() ||
6792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793 return legalizeTrapEndpgm(MI, MRI, B);
6795 return ST.supportsGetDoorbellID() ?
6796 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6799 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6801 const DebugLoc &DL = MI.getDebugLoc();
6802 MachineBasicBlock &BB = B.getMBB();
6803 MachineFunction *MF = BB.getParent();
6805 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6806 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6807 .addImm(0);
6808 MI.eraseFromParent();
6809 return true;
6812 // We need a block split to make the real endpgm a terminator. We also don't
6813 // want to break phis in successor blocks, so we can't just delete to the
6814 // end of the block.
6815 BB.splitAt(MI, false /*UpdateLiveIns*/);
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(TrapBB);
6818 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6819 .addImm(0);
6820 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6821 .addMBB(TrapBB);
6823 BB.addSuccessor(TrapBB);
6824 MI.eraseFromParent();
6825 return true;
6828 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6829 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6830 MachineFunction &MF = B.getMF();
6831 const LLT S64 = LLT::scalar(64);
6833 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6834 // For code object version 5, queue_ptr is passed through implicit kernarg.
6835 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6836 AMDGPU::AMDHSA_COV5) {
6837 AMDGPUTargetLowering::ImplicitParameter Param =
6838 AMDGPUTargetLowering::QUEUE_PTR;
6839 uint64_t Offset =
6840 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6842 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6843 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6845 if (!loadInputValue(KernargPtrReg, B,
6846 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6847 return false;
6849 // TODO: can we be smarter about machine pointer info?
6850 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6851 MachineMemOperand *MMO = MF.getMachineMemOperand(
6852 PtrInfo,
6853 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6854 MachineMemOperand::MOInvariant,
6855 LLT::scalar(64), commonAlignment(Align(64), Offset));
6857 // Pointer address
6858 Register LoadAddr = MRI.createGenericVirtualRegister(
6859 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6860 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6861 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6862 // Load address
6863 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6864 B.buildCopy(SGPR01, Temp);
6865 B.buildInstr(AMDGPU::S_TRAP)
6866 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6867 .addReg(SGPR01, RegState::Implicit);
6868 MI.eraseFromParent();
6869 return true;
6872 // Pass the queue pointer to the trap handler as input, and insert the trap instruction.
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6874 Register LiveIn =
6875 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6876 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6877 return false;
6879 B.buildCopy(SGPR01, LiveIn);
6880 B.buildInstr(AMDGPU::S_TRAP)
6881 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882 .addReg(SGPR01, RegState::Implicit);
6884 MI.eraseFromParent();
6885 return true;
6888 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889 MachineRegisterInfo &MRI,
6890 MachineIRBuilder &B) const {
6891 // We need to simulate the 's_trap 2' instruction on targets that run in
6892 // PRIV=1 (where it is treated as a nop).
6893 if (ST.hasPrivEnabledTrap2NopBug()) {
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6895 MI.getDebugLoc());
6896 MI.eraseFromParent();
6897 return true;
6900 B.buildInstr(AMDGPU::S_TRAP)
6901 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6902 MI.eraseFromParent();
6903 return true;
6906 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907 MachineRegisterInfo &MRI,
6908 MachineIRBuilder &B) const {
6909 // If this is a non-HSA path or the trap handler is disabled, report a
6910 // warning accordingly.
6911 if (!ST.isTrapHandlerEnabled() ||
6912 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6913 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6914 "debugtrap handler not supported",
6915 MI.getDebugLoc(), DS_Warning);
6916 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6917 Ctx.diagnose(NoTrap);
6918 } else {
6919 // Insert debug-trap instruction
6920 B.buildInstr(AMDGPU::S_TRAP)
6921 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6924 MI.eraseFromParent();
6925 return true;
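// Lower amdgcn.image.bvh.intersect.ray: pick the MIMG opcode for the subtarget
// encoding and repack the ray operands into the register layout that encoding
// expects (NSA vs. a single contiguous vector, 16- vs. 32-bit ray components).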
6928 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6929 MachineIRBuilder &B) const {
6930 MachineRegisterInfo &MRI = *B.getMRI();
6931 const LLT S16 = LLT::scalar(16);
6932 const LLT S32 = LLT::scalar(32);
6933 const LLT V2S16 = LLT::fixed_vector(2, 16);
6934 const LLT V3S32 = LLT::fixed_vector(3, 32);
6936 Register DstReg = MI.getOperand(0).getReg();
6937 Register NodePtr = MI.getOperand(2).getReg();
6938 Register RayExtent = MI.getOperand(3).getReg();
6939 Register RayOrigin = MI.getOperand(4).getReg();
6940 Register RayDir = MI.getOperand(5).getReg();
6941 Register RayInvDir = MI.getOperand(6).getReg();
6942 Register TDescr = MI.getOperand(7).getReg();
6944 if (!ST.hasGFX10_AEncoding()) {
6945 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6946 "intrinsic not supported on subtarget",
6947 MI.getDebugLoc());
6948 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6949 return false;
6952 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6953 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6954 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6955 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6956 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
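// Address dwords: node pointer (1 or 2), ray extent (1), ray origin (3), plus
// ray direction and inverse direction (3 each, or 3 packed dwords total when
// the components are 16-bit). On GFX11+ with NSA these are pre-packed below
// into at most 4 (a16) or 5 register operands.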
6957 const unsigned NumVDataDwords = 4;
6958 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6959 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6960 const bool UseNSA =
6961 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6963 const unsigned BaseOpcodes[2][2] = {
6964 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6965 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6966 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6967 int Opcode;
6968 if (UseNSA) {
6969 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6970 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6971 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6972 : AMDGPU::MIMGEncGfx10NSA,
6973 NumVDataDwords, NumVAddrDwords);
6974 } else {
6975 assert(!IsGFX12Plus);
6976 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6977 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6978 : AMDGPU::MIMGEncGfx10Default,
6979 NumVDataDwords, NumVAddrDwords);
6981 assert(Opcode != -1);
6983 SmallVector<Register, 12> Ops;
6984 if (UseNSA && IsGFX11Plus) {
6985 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6986 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6987 auto Merged = B.buildMergeLikeInstr(
6988 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6989 Ops.push_back(Merged.getReg(0));
6992 Ops.push_back(NodePtr);
6993 Ops.push_back(RayExtent);
6994 packLanes(RayOrigin);
6996 if (IsA16) {
6997 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6998 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
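// Each packed dword holds one component pair, with the inverse-direction
// element in the low 16 bits and the direction element in the high 16 bits.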
6999 auto MergedDir = B.buildMergeLikeInstr(
7000 V3S32,
7001 {B.buildBitcast(
7002 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7003 UnmergeRayDir.getReg(0)}))
7004 .getReg(0),
7005 B.buildBitcast(
7006 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7007 UnmergeRayDir.getReg(1)}))
7008 .getReg(0),
7009 B.buildBitcast(
7010 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7011 UnmergeRayDir.getReg(2)}))
7012 .getReg(0)});
7013 Ops.push_back(MergedDir.getReg(0));
7014 } else {
7015 packLanes(RayDir);
7016 packLanes(RayInvDir);
7018 } else {
7019 if (Is64) {
7020 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7021 Ops.push_back(Unmerge.getReg(0));
7022 Ops.push_back(Unmerge.getReg(1));
7023 } else {
7024 Ops.push_back(NodePtr);
7026 Ops.push_back(RayExtent);
7028 auto packLanes = [&Ops, &S32, &B](Register Src) {
7029 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7030 Ops.push_back(Unmerge.getReg(0));
7031 Ops.push_back(Unmerge.getReg(1));
7032 Ops.push_back(Unmerge.getReg(2));
7035 packLanes(RayOrigin);
7036 if (IsA16) {
7037 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7038 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7039 Register R1 = MRI.createGenericVirtualRegister(S32);
7040 Register R2 = MRI.createGenericVirtualRegister(S32);
7041 Register R3 = MRI.createGenericVirtualRegister(S32);
7042 B.buildMergeLikeInstr(R1,
7043 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7044 B.buildMergeLikeInstr(
7045 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7046 B.buildMergeLikeInstr(
7047 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7048 Ops.push_back(R1);
7049 Ops.push_back(R2);
7050 Ops.push_back(R3);
7051 } else {
7052 packLanes(RayDir);
7053 packLanes(RayInvDir);
7057 if (!UseNSA) {
7058 // Build a single vector containing all the operands prepared so far.
7059 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7060 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7061 Ops.clear();
7062 Ops.push_back(MergedOps);
7065 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7066 .addDef(DstReg)
7067 .addImm(Opcode);
7069 for (Register R : Ops) {
7070 MIB.addUse(R);
7073 MIB.addUse(TDescr)
7074 .addImm(IsA16 ? 1 : 0)
7075 .cloneMemRefs(MI);
7077 MI.eraseFromParent();
7078 return true;
7081 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7082 MachineIRBuilder &B) const {
7083 unsigned Opc;
7084 int RoundMode = MI.getOperand(2).getImm();
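// Only the round-upward and round-downward modes map onto dedicated pseudos;
// any other rounding mode cannot be legalized here.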
7086 if (RoundMode == (int)RoundingMode::TowardPositive)
7087 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7088 else if (RoundMode == (int)RoundingMode::TowardNegative)
7089 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7090 else
7091 return false;
7093 B.buildInstr(Opc)
7094 .addDef(MI.getOperand(0).getReg())
7095 .addUse(MI.getOperand(1).getReg());
7097 MI.eraseFromParent();
7099 return true;
7102 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7103 MachineIRBuilder &B) const {
7104 const SITargetLowering *TLI = ST.getTargetLowering();
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7106 Register DstReg = MI.getOperand(0).getReg();
7107 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7108 MI.eraseFromParent();
7109 return true;
7112 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113 MachineIRBuilder &B) const {
7114 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7115 if (!ST.hasArchitectedSGPRs())
7116 return false;
7117 LLT S32 = LLT::scalar(32);
7118 Register DstReg = MI.getOperand(0).getReg();
7119 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7120 auto LSB = B.buildConstant(S32, 25);
7121 auto Width = B.buildConstant(S32, 5);
7122 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7123 MI.eraseFromParent();
7124 return true;
7127 static constexpr unsigned FPEnvModeBitField =
7128 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7130 static constexpr unsigned FPEnvTrapBitField =
7131 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
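// The G_GET_FPENV/G_SET_FPENV lowerings below model the FP environment as an
// s64: the low 32 bits hold the MODE register fields and the high 32 bits the
// low trap-status bits, accessed via s_getreg/s_setreg with the encodings
// above.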
7133 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7134 MachineRegisterInfo &MRI,
7135 MachineIRBuilder &B) const {
7136 Register Src = MI.getOperand(0).getReg();
7137 if (MRI.getType(Src) != S64)
7138 return false;
7140 auto ModeReg =
7141 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7142 /*HasSideEffects=*/true, /*isConvergent=*/false)
7143 .addImm(FPEnvModeBitField);
7144 auto TrapReg =
7145 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7146 /*HasSideEffects=*/true, /*isConvergent=*/false)
7147 .addImm(FPEnvTrapBitField);
7148 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7149 MI.eraseFromParent();
7150 return true;
7153 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7154 MachineRegisterInfo &MRI,
7155 MachineIRBuilder &B) const {
7156 Register Src = MI.getOperand(0).getReg();
7157 if (MRI.getType(Src) != S64)
7158 return false;
7160 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7161 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7162 /*HasSideEffects=*/true, /*isConvergent=*/false)
7163 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7164 .addReg(Unmerge.getReg(0));
7165 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7166 /*HasSideEffects=*/true, /*isConvergent=*/false)
7167 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7168 .addReg(Unmerge.getReg(1));
7169 MI.eraseFromParent();
7170 return true;
7173 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7174 MachineInstr &MI) const {
7175 MachineIRBuilder &B = Helper.MIRBuilder;
7176 MachineRegisterInfo &MRI = *B.getMRI();
7178 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7179 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7180 switch (IntrID) {
7181 case Intrinsic::amdgcn_if:
7182 case Intrinsic::amdgcn_else: {
7183 MachineInstr *Br = nullptr;
7184 MachineBasicBlock *UncondBrTarget = nullptr;
7185 bool Negated = false;
7186 if (MachineInstr *BrCond =
7187 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7188 const SIRegisterInfo *TRI
7189 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7191 Register Def = MI.getOperand(1).getReg();
7192 Register Use = MI.getOperand(3).getReg();
7194 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
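// verifyCFIntrinsic reports Negated when the branch condition was inverted
// relative to the intrinsic result, so compensate by swapping the branch
// targets.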
7196 if (Negated)
7197 std::swap(CondBrTarget, UncondBrTarget);
7199 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7200 if (IntrID == Intrinsic::amdgcn_if) {
7201 B.buildInstr(AMDGPU::SI_IF)
7202 .addDef(Def)
7203 .addUse(Use)
7204 .addMBB(UncondBrTarget);
7205 } else {
7206 B.buildInstr(AMDGPU::SI_ELSE)
7207 .addDef(Def)
7208 .addUse(Use)
7209 .addMBB(UncondBrTarget);
7212 if (Br) {
7213 Br->getOperand(0).setMBB(CondBrTarget);
7214 } else {
7215 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7216 // since we're swapping branch targets it needs to be reinserted.
7217 // FIXME: IRTranslator should probably not do this
7218 B.buildBr(*CondBrTarget);
7221 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7222 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7223 MI.eraseFromParent();
7224 BrCond->eraseFromParent();
7225 return true;
7228 return false;
7230 case Intrinsic::amdgcn_loop: {
7231 MachineInstr *Br = nullptr;
7232 MachineBasicBlock *UncondBrTarget = nullptr;
7233 bool Negated = false;
7234 if (MachineInstr *BrCond =
7235 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7236 const SIRegisterInfo *TRI
7237 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7239 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7240 Register Reg = MI.getOperand(2).getReg();
7242 if (Negated)
7243 std::swap(CondBrTarget, UncondBrTarget);
7245 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7246 B.buildInstr(AMDGPU::SI_LOOP)
7247 .addUse(Reg)
7248 .addMBB(UncondBrTarget);
7250 if (Br)
7251 Br->getOperand(0).setMBB(CondBrTarget);
7252 else
7253 B.buildBr(*CondBrTarget);
7255 MI.eraseFromParent();
7256 BrCond->eraseFromParent();
7257 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7258 return true;
7261 return false;
7263 case Intrinsic::amdgcn_addrspacecast_nonnull:
7264 return legalizeAddrSpaceCast(MI, MRI, B);
7265 case Intrinsic::amdgcn_make_buffer_rsrc:
7266 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7267 case Intrinsic::amdgcn_kernarg_segment_ptr:
7268 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7269 // This only makes sense to call in a kernel, so just lower to null.
7270 B.buildConstant(MI.getOperand(0).getReg(), 0);
7271 MI.eraseFromParent();
7272 return true;
7275 return legalizePreloadedArgIntrin(
7276 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7277 case Intrinsic::amdgcn_implicitarg_ptr:
7278 return legalizeImplicitArgPtr(MI, MRI, B);
7279 case Intrinsic::amdgcn_workitem_id_x:
7280 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7281 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7282 case Intrinsic::amdgcn_workitem_id_y:
7283 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7284 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7285 case Intrinsic::amdgcn_workitem_id_z:
7286 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7287 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7288 case Intrinsic::amdgcn_workgroup_id_x:
7289 return legalizePreloadedArgIntrin(MI, MRI, B,
7290 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7291 case Intrinsic::amdgcn_workgroup_id_y:
7292 return legalizePreloadedArgIntrin(MI, MRI, B,
7293 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7294 case Intrinsic::amdgcn_workgroup_id_z:
7295 return legalizePreloadedArgIntrin(MI, MRI, B,
7296 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7297 case Intrinsic::amdgcn_wave_id:
7298 return legalizeWaveID(MI, B);
7299 case Intrinsic::amdgcn_lds_kernel_id:
7300 return legalizePreloadedArgIntrin(MI, MRI, B,
7301 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7302 case Intrinsic::amdgcn_dispatch_ptr:
7303 return legalizePreloadedArgIntrin(MI, MRI, B,
7304 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7305 case Intrinsic::amdgcn_queue_ptr:
7306 return legalizePreloadedArgIntrin(MI, MRI, B,
7307 AMDGPUFunctionArgInfo::QUEUE_PTR);
7308 case Intrinsic::amdgcn_implicit_buffer_ptr:
7309 return legalizePreloadedArgIntrin(
7310 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7311 case Intrinsic::amdgcn_dispatch_id:
7312 return legalizePreloadedArgIntrin(MI, MRI, B,
7313 AMDGPUFunctionArgInfo::DISPATCH_ID);
7314 case Intrinsic::r600_read_ngroups_x:
7315 // TODO: Emit error for hsa
7316 return legalizeKernargMemParameter(MI, B,
7317 SI::KernelInputOffsets::NGROUPS_X);
7318 case Intrinsic::r600_read_ngroups_y:
7319 return legalizeKernargMemParameter(MI, B,
7320 SI::KernelInputOffsets::NGROUPS_Y);
7321 case Intrinsic::r600_read_ngroups_z:
7322 return legalizeKernargMemParameter(MI, B,
7323 SI::KernelInputOffsets::NGROUPS_Z);
7324 case Intrinsic::r600_read_local_size_x:
7325 // TODO: Could insert G_ASSERT_ZEXT from s16
7326 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7327 case Intrinsic::r600_read_local_size_y:
7328 // TODO: Could insert G_ASSERT_ZEXT from s16
7329 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7330 // TODO: Could insert G_ASSERT_ZEXT from s16
7331 case Intrinsic::r600_read_local_size_z:
7332 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7333 case Intrinsic::r600_read_global_size_x:
7334 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7335 case Intrinsic::r600_read_global_size_y:
7336 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7337 case Intrinsic::r600_read_global_size_z:
7338 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7339 case Intrinsic::amdgcn_fdiv_fast:
7340 return legalizeFDIVFastIntrin(MI, MRI, B);
7341 case Intrinsic::amdgcn_is_shared:
7342 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7343 case Intrinsic::amdgcn_is_private:
7344 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7345 case Intrinsic::amdgcn_wavefrontsize: {
7346 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7347 MI.eraseFromParent();
7348 return true;
7350 case Intrinsic::amdgcn_s_buffer_load:
7351 return legalizeSBufferLoad(Helper, MI);
7352 case Intrinsic::amdgcn_raw_buffer_store:
7353 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7354 case Intrinsic::amdgcn_struct_buffer_store:
7355 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7356 return legalizeBufferStore(MI, MRI, B, false, false);
7357 case Intrinsic::amdgcn_raw_buffer_store_format:
7358 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7359 case Intrinsic::amdgcn_struct_buffer_store_format:
7360 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7361 return legalizeBufferStore(MI, MRI, B, false, true);
7362 case Intrinsic::amdgcn_raw_tbuffer_store:
7363 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7364 case Intrinsic::amdgcn_struct_tbuffer_store:
7365 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7366 return legalizeBufferStore(MI, MRI, B, true, true);
7367 case Intrinsic::amdgcn_raw_buffer_load:
7368 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7369 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7370 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7371 case Intrinsic::amdgcn_struct_buffer_load:
7372 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7373 return legalizeBufferLoad(MI, MRI, B, false, false);
7374 case Intrinsic::amdgcn_raw_buffer_load_format:
7375 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7376 case Intrinsic::amdgcn_struct_buffer_load_format:
7377 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7378 return legalizeBufferLoad(MI, MRI, B, true, false);
7379 case Intrinsic::amdgcn_raw_tbuffer_load:
7380 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7381 case Intrinsic::amdgcn_struct_tbuffer_load:
7382 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7383 return legalizeBufferLoad(MI, MRI, B, true, true);
7384 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7386 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7388 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7390 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7392 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7394 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7396 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7398 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7400 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7402 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7404 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7406 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7408 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7410 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7412 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7414 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7415 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7416 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7417 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7418 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7420 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7421 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7422 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7424 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7426 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7427 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7428 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7430 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7431 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7432 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7433 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7434 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7436 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7440 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7442 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7443 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7444 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7446 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7447 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7448 return legalizeBufferAtomic(MI, B, IntrID);
7449 case Intrinsic::amdgcn_rsq_clamp:
7450 return legalizeRsqClampIntrinsic(MI, MRI, B);
7451 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7452 return legalizeBVHIntrinsic(MI, B);
7453 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7454 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7455 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7456 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7457 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7458 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7459 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7460 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7461 Register Index = MI.getOperand(5).getReg();
7462 LLT S32 = LLT::scalar(32);
7463 if (MRI.getType(Index) != S32)
7464 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7465 return true;
7467 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7468 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7469 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7470 Register Index = MI.getOperand(7).getReg();
7471 LLT S32 = LLT::scalar(32);
7472 if (MRI.getType(Index) != S32)
7473 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7474 return true;
7476 case Intrinsic::amdgcn_fmed3: {
7477 GISelChangeObserver &Observer = Helper.Observer;
7479 // FIXME: This is to work around the inability of tablegen match combiners
7480 // to match intrinsics in patterns.
7481 Observer.changingInstr(MI);
7482 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7483 MI.removeOperand(1);
7484 Observer.changedInstr(MI);
7485 return true;
7487 case Intrinsic::amdgcn_readlane:
7488 case Intrinsic::amdgcn_writelane:
7489 case Intrinsic::amdgcn_readfirstlane:
7490 case Intrinsic::amdgcn_permlane16:
7491 case Intrinsic::amdgcn_permlanex16:
7492 case Intrinsic::amdgcn_permlane64:
7493 return legalizeLaneOp(Helper, MI, IntrID);
7494 default: {
7495 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7496 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7497 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7498 return true;
7502 return true;