1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUMemoryUtils.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/ScopeExit.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
37 #define DEBUG_TYPE "amdgpu-legalinfo"
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
53 static constexpr unsigned MaxRegisterSize = 1024;
55 // Round the number of elements up to the next power of two.
56 static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
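// For example, <3 x s16> rounds up to <4 x s16> and <5 x s32> to <8 x s32>;
// power-of-two element counts are returned unchanged.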
62 // Round the scalar size in bits up to the next power of two.
63 static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
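// For example, s24 rounds up to s32 and s48 to s64; power-of-two widths are
// returned unchanged.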
69 /// \returns true if this is an odd-sized vector which should be widened by
70 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
71 /// This excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
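// For example, <3 x s16> qualifies (odd count, 48 bits), while <5 x s1> does
// not (s1 vectors are scalarized) and <3 x s32> does not (its elements are
// already 32 bits wide).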
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
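// For example, <4 x s32> (128 bits) and <6 x s32> (192 bits) are both reduced
// to <2 x s32>, i.e. roughly 64-bit pieces.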
122 // Increase the number of vector elements so the total size reaches the next
123 // multiple of 32 bits.
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
133 assert(EltSize < 32);
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
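// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits) and
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits).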
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
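// For example, a <13 x s32> (416 bits) has no matching SGPR class, so the loop
// grows it until one exists; with the current set of class widths that would
// be <16 x s32> (512 bits).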
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
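// For example, a plain p8 resource maps to s128 (scalar form) and <4 x s32>
// (register form), while <2 x p8> maps to <2 x s128> and <8 x s32>.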
176 static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
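// For example, <4 x s8> becomes s32, <6 x s16> (96 bits) becomes <3 x s32>,
// and <8 x s16> (128 bits) becomes <4 x s32>.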
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
226 static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
230 static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
235 static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
250 return true;
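// For example, s96, <2 x s16> and <5 x s32> are register types, while
// <3 x s16> (48 bits) is not, since its size is not a multiple of 32.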
253 // Any combination of 32- or 64-bit elements up to the maximum register size,
254 // and multiples of v2s16.
255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
282 static const LLT S1 = LLT::scalar(1);
283 static const LLT S8 = LLT::scalar(8);
284 static const LLT S16 = LLT::scalar(16);
285 static const LLT S32 = LLT::scalar(32);
286 static const LLT F32 = LLT::float32();
287 static const LLT S64 = LLT::scalar(64);
288 static const LLT F64 = LLT::float64();
289 static const LLT S96 = LLT::scalar(96);
290 static const LLT S128 = LLT::scalar(128);
291 static const LLT S160 = LLT::scalar(160);
292 static const LLT S192 = LLT::scalar(192);
293 static const LLT S224 = LLT::scalar(224);
294 static const LLT S256 = LLT::scalar(256);
295 static const LLT S512 = LLT::scalar(512);
296 static const LLT S1024 = LLT::scalar(1024);
297 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
299 static const LLT V2S8 = LLT::fixed_vector(2, 8);
300 static const LLT V2S16 = LLT::fixed_vector(2, 16);
301 static const LLT V4S16 = LLT::fixed_vector(4, 16);
302 static const LLT V6S16 = LLT::fixed_vector(6, 16);
303 static const LLT V8S16 = LLT::fixed_vector(8, 16);
304 static const LLT V10S16 = LLT::fixed_vector(10, 16);
305 static const LLT V12S16 = LLT::fixed_vector(12, 16);
306 static const LLT V16S16 = LLT::fixed_vector(16, 16);
308 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
309 static const LLT V2BF16 = V2F16; // FIXME
311 static const LLT V2S32 = LLT::fixed_vector(2, 32);
312 static const LLT V3S32 = LLT::fixed_vector(3, 32);
313 static const LLT V4S32 = LLT::fixed_vector(4, 32);
314 static const LLT V5S32 = LLT::fixed_vector(5, 32);
315 static const LLT V6S32 = LLT::fixed_vector(6, 32);
316 static const LLT V7S32 = LLT::fixed_vector(7, 32);
317 static const LLT V8S32 = LLT::fixed_vector(8, 32);
318 static const LLT V9S32 = LLT::fixed_vector(9, 32);
319 static const LLT V10S32 = LLT::fixed_vector(10, 32);
320 static const LLT V11S32 = LLT::fixed_vector(11, 32);
321 static const LLT V12S32 = LLT::fixed_vector(12, 32);
322 static const LLT V16S32 = LLT::fixed_vector(16, 32);
323 static const LLT V32S32 = LLT::fixed_vector(32, 32);
325 static const LLT V2S64 = LLT::fixed_vector(2, 64);
326 static const LLT V3S64 = LLT::fixed_vector(3, 64);
327 static const LLT V4S64 = LLT::fixed_vector(4, 64);
328 static const LLT V5S64 = LLT::fixed_vector(5, 64);
329 static const LLT V6S64 = LLT::fixed_vector(6, 64);
330 static const LLT V7S64 = LLT::fixed_vector(7, 64);
331 static const LLT V8S64 = LLT::fixed_vector(8, 64);
332 static const LLT V16S64 = LLT::fixed_vector(16, 64);
334 static const LLT V2S128 = LLT::fixed_vector(2, 128);
335 static const LLT V4S128 = LLT::fixed_vector(4, 128);
337 static std::initializer_list<LLT> AllScalarTypes = {
338 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
340 static std::initializer_list<LLT> AllS16Vectors{
341 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
343 static std::initializer_list<LLT> AllS32Vectors = {
344 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
345 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
347 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
348 V6S64, V7S64, V8S64, V16S64};
350 // Checks whether a type is in the list of legal register types.
351 static bool isRegisterClassType(LLT Ty) {
352 if (Ty.isPointerOrPointerVector())
353 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
355 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
356 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
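// Pointers are checked via their scalar-equivalent types: p0 (64 bits) is
// accepted as S64 and <2 x p0> as V2S64, while e.g. <3 x s16> is in none of
// the lists and is rejected.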
359 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
360 return [TypeIdx](const LegalityQuery &Query) {
361 return isRegisterClassType(Query.Types[TypeIdx]);
365 // If we have a truncating store or an extending load with a data size larger
366 // than 32-bits, we need to reduce to a 32-bit type.
367 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
368 return [=](const LegalityQuery &Query) {
369 const LLT Ty = Query.Types[TypeIdx];
370 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
371 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
375 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
376 // handle some operations by just promoting the register during
377 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
378 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
379 bool IsLoad, bool IsAtomic) {
380 switch (AS) {
381 case AMDGPUAS::PRIVATE_ADDRESS:
382 // FIXME: Private element size.
383 return ST.enableFlatScratch() ? 128 : 32;
384 case AMDGPUAS::LOCAL_ADDRESS:
385 return ST.useDS128() ? 128 : 64;
386 case AMDGPUAS::GLOBAL_ADDRESS:
387 case AMDGPUAS::CONSTANT_ADDRESS:
388 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
389 case AMDGPUAS::BUFFER_RESOURCE:
390 // Treat constant and global as identical. SMRD loads are sometimes usable for
391 // global loads (ideally constant address space should be eliminated)
392 // depending on the context. Legality cannot be context dependent, but
393 // RegBankSelect can split the load as necessary depending on the pointer
394 // register bank/uniformity and if the memory is invariant or not written in a
395 // kernel.
396 return IsLoad ? 512 : 128;
397 default:
398 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
399 // if they may alias scratch depending on the subtarget. This needs to be
400 // moved to custom handling to use addressMayBeAccessedAsPrivate
401 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
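// For example, a global or constant load may be up to 512 bits wide (a
// <16 x s32>), while a global store is capped at 128 bits; LDS accesses allow
// 128 bits only when useDS128() is set, and 64 bits otherwise.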
405 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
406 const LegalityQuery &Query) {
407 const LLT Ty = Query.Types[0];
409 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
410 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
412 unsigned RegSize = Ty.getSizeInBits();
413 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
414 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
415 unsigned AS = Query.Types[1].getAddressSpace();
417 // All of these need to be custom lowered to cast the pointer operand.
418 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
419 return false;
421 // Do not handle extending vector loads.
422 if (Ty.isVector() && MemSize != RegSize)
423 return false;
425 // TODO: We should be able to widen loads if the alignment is high enough, but
426 // we also need to modify the memory access size.
427 #if 0
428 // Accept widening loads based on alignment.
429 if (IsLoad && MemSize < RegSize)
430 MemSize = std::max(MemSize, AlignBits);
431 #endif
433 // Only 1-byte and 2-byte to 32-bit extloads are valid.
434 if (MemSize != RegSize && RegSize != 32)
435 return false;
437 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
438 Query.MMODescrs[0].Ordering !=
439 AtomicOrdering::NotAtomic))
440 return false;
442 switch (MemSize) {
443 case 8:
444 case 16:
445 case 32:
446 case 64:
447 case 128:
448 break;
449 case 96:
450 if (!ST.hasDwordx3LoadStores())
451 return false;
452 break;
453 case 256:
454 case 512:
455 // These may contextually need to be broken down.
456 break;
457 default:
458 return false;
461 assert(RegSize >= MemSize);
463 if (AlignBits < MemSize) {
464 const SITargetLowering *TLI = ST.getTargetLowering();
465 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
466 Align(AlignBits / 8)))
467 return false;
470 return true;
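// For example, an extending load of s16 memory into an s64 register is
// rejected (extloads are only allowed into 32-bit registers), and a 96-bit
// access is rejected unless the subtarget has dwordx3 load/stores.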
473 // The newer buffer intrinsic forms take their resource arguments as
474 // pointers in address space 8, aka s128 values. However, in order to not break
475 // SelectionDAG, the underlying operations have to continue to take v4i32
476 // arguments. Therefore, we convert resource pointers - or vectors of them -
477 // to integer values here.
478 static bool hasBufferRsrcWorkaround(const LLT Ty) {
479 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
480 return true;
481 if (Ty.isVector()) {
482 const LLT ElemTy = Ty.getElementType();
483 return hasBufferRsrcWorkaround(ElemTy);
485 return false;
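// For example, p8 and <2 x p8> need the workaround, while <4 x s32> does not.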
488 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
489 // work around this. Eventually it should ignore the type for loads and only
490 // care about the size. Return true in cases where we will work around this for
491 // now by bitcasting.
492 static bool loadStoreBitcastWorkaround(const LLT Ty) {
493 if (EnableNewLegality)
494 return false;
496 const unsigned Size = Ty.getSizeInBits();
497 if (Ty.isPointerVector())
498 return true;
499 if (Size <= 64)
500 return false;
501 // Address space 8 pointers get their own workaround.
502 if (hasBufferRsrcWorkaround(Ty))
503 return false;
504 if (!Ty.isVector())
505 return true;
507 unsigned EltSize = Ty.getScalarSizeInBits();
508 return EltSize != 32 && EltSize != 64;
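// With the default amdgpu-global-isel-new-legality=false: s64 (and anything
// <= 64 bits) returns false, s96 returns true, <4 x s32> returns false
// (32-bit elements), and <8 x s16> returns true (16-bit elements).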
511 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
512 const LLT Ty = Query.Types[0];
513 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
514 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
517 /// Return true if a load or store of the type should be lowered with a bitcast
518 /// to a different type.
519 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
520 const LLT MemTy) {
521 const unsigned MemSizeInBits = MemTy.getSizeInBits();
522 const unsigned Size = Ty.getSizeInBits();
523 if (Size != MemSizeInBits)
524 return Size <= 32 && Ty.isVector();
526 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
527 return true;
529 // Don't try to handle bitcasting vector ext loads for now.
530 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
531 (Size <= 32 || isRegisterSize(Size)) &&
532 !isRegisterVectorElementType(Ty.getElementType());
535 /// Return true if we should legalize a load by widening an odd-sized memory
536 /// access up to the alignment. Note this is the case where the memory access
537 /// itself changes, not the size of the result register.
538 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
539 uint64_t AlignInBits, unsigned AddrSpace,
540 unsigned Opcode) {
541 unsigned SizeInBits = MemoryTy.getSizeInBits();
542 // We don't want to widen cases that are naturally legal.
543 if (isPowerOf2_32(SizeInBits))
544 return false;
546 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
547 // end up widening these for a scalar load during RegBankSelect, if we don't
548 // have 96-bit scalar loads.
549 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
550 return false;
552 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
553 return false;
555 // A load is known dereferenceable up to the alignment, so it's legal to widen
556 // to it.
558 // TODO: Could check dereferenceable for less aligned cases.
559 unsigned RoundedSize = NextPowerOf2(SizeInBits);
560 if (AlignInBits < RoundedSize)
561 return false;
563 // Do not widen if it would introduce a slow unaligned load.
564 const SITargetLowering *TLI = ST.getTargetLowering();
565 unsigned Fast = 0;
566 return TLI->allowsMisalignedMemoryAccessesImpl(
567 RoundedSize, AddrSpace, Align(AlignInBits / 8),
568 MachineMemOperand::MOLoad, &Fast) &&
569 Fast;
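// For example, an s96 global load that is 128-bit aligned on a subtarget
// without dwordx3 load/stores is widened to a 128-bit load, provided the
// target reports the resulting access as fast.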
572 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
573 unsigned Opcode) {
574 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
575 return false;
577 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
578 Query.MMODescrs[0].AlignInBits,
579 Query.Types[1].getAddressSpace(), Opcode);
582 /// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
583 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
584 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
585 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
586 MachineRegisterInfo &MRI, unsigned Idx) {
587 MachineOperand &MO = MI.getOperand(Idx);
589 const LLT PointerTy = MRI.getType(MO.getReg());
591 // Paranoidly prevent us from doing this multiple times.
592 if (!hasBufferRsrcWorkaround(PointerTy))
593 return PointerTy;
595 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
596 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
597 if (!PointerTy.isVector()) {
598 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
599 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
600 const LLT S32 = LLT::scalar(32);
602 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
603 std::array<Register, 4> VectorElems;
604 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
605 for (unsigned I = 0; I < NumParts; ++I)
606 VectorElems[I] =
607 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
608 B.buildMergeValues(MO, VectorElems);
609 MO.setReg(VectorReg);
610 return VectorTy;
612 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
613 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
614 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
615 B.buildIntToPtr(MO, Scalar);
616 MO.setReg(BitcastReg);
618 return VectorTy;
621 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
622 /// the form in which the value must be in order to be passed to the low-level
623 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
624 /// needed in order to account for the fact that we can't define a register
625 /// class for s128 without breaking SelectionDAG.
626 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
627 MachineRegisterInfo &MRI = *B.getMRI();
628 const LLT PointerTy = MRI.getType(Pointer);
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
632 if (!PointerTy.isVector()) {
633 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
634 SmallVector<Register, 4> PointerParts;
635 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
636 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
637 for (unsigned I = 0; I < NumParts; ++I)
638 PointerParts.push_back(Unmerged.getReg(I));
639 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
641 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
642 return B.buildBitcast(VectorTy, Scalar).getReg(0);
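// For example, a plain p8 is unmerged into four s32 pieces and rebuilt as a
// <4 x s32>, while a <2 x p8> is converted via ptrtoint to <2 x s128> and then
// bitcast to <8 x s32>.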
645 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
646 unsigned Idx) {
647 MachineOperand &MO = MI.getOperand(Idx);
649 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
650 // Paranoidly prevent us from doing this multiple times.
651 if (!hasBufferRsrcWorkaround(PointerTy))
652 return;
653 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
656 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
657 const GCNTargetMachine &TM)
658 : ST(ST_) {
659 using namespace TargetOpcode;
661 auto GetAddrSpacePtr = [&TM](unsigned AS) {
662 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
665 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
666 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
667 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
668 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
669 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
670 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
671 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
672 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
673 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
674 const LLT BufferStridedPtr =
675 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
677 const LLT CodePtr = FlatPtr;
679 const std::initializer_list<LLT> AddrSpaces64 = {
680 GlobalPtr, ConstantPtr, FlatPtr
683 const std::initializer_list<LLT> AddrSpaces32 = {
684 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
687 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
689 const std::initializer_list<LLT> FPTypesBase = {
690 S32, S64
693 const std::initializer_list<LLT> FPTypes16 = {
694 S32, S64, S16
697 const std::initializer_list<LLT> FPTypesPK16 = {
698 S32, S64, S16, V2S16
701 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
703 // s1 for VCC branches, s32 for SCC branches.
704 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
706 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
707 // elements for v3s16
708 getActionDefinitionsBuilder(G_PHI)
709 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
710 .legalFor(AllS32Vectors)
711 .legalFor(AllS64Vectors)
712 .legalFor(AddrSpaces64)
713 .legalFor(AddrSpaces32)
714 .legalFor(AddrSpaces128)
715 .legalIf(isPointer(0))
716 .clampScalar(0, S16, S256)
717 .widenScalarToNextPow2(0, 32)
718 .clampMaxNumElements(0, S32, 16)
719 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
720 .scalarize(0);
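// For example, under these rules a G_PHI of s8 is widened to s16 by the
// clampScalar step, and a G_PHI of <3 x s16> picks up one more element to
// become the legal <4 x s16>.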
722 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
723 // Full set of gfx9 features.
724 if (ST.hasScalarAddSub64()) {
725 getActionDefinitionsBuilder({G_ADD, G_SUB})
726 .legalFor({S64, S32, S16, V2S16})
727 .clampMaxNumElementsStrict(0, S16, 2)
728 .scalarize(0)
729 .minScalar(0, S16)
730 .widenScalarToNextMultipleOf(0, 32)
731 .maxScalar(0, S32);
732 } else {
733 getActionDefinitionsBuilder({G_ADD, G_SUB})
734 .legalFor({S32, S16, V2S16})
735 .clampMaxNumElementsStrict(0, S16, 2)
736 .scalarize(0)
737 .minScalar(0, S16)
738 .widenScalarToNextMultipleOf(0, 32)
739 .maxScalar(0, S32);
742 if (ST.hasScalarSMulU64()) {
743 getActionDefinitionsBuilder(G_MUL)
744 .legalFor({S64, S32, S16, V2S16})
745 .clampMaxNumElementsStrict(0, S16, 2)
746 .scalarize(0)
747 .minScalar(0, S16)
748 .widenScalarToNextMultipleOf(0, 32)
749 .custom();
750 } else {
751 getActionDefinitionsBuilder(G_MUL)
752 .legalFor({S32, S16, V2S16})
753 .clampMaxNumElementsStrict(0, S16, 2)
754 .scalarize(0)
755 .minScalar(0, S16)
756 .widenScalarToNextMultipleOf(0, 32)
757 .custom();
759 assert(ST.hasMad64_32());
761 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
762 .legalFor({S32, S16, V2S16}) // Clamp modifier
763 .minScalarOrElt(0, S16)
764 .clampMaxNumElementsStrict(0, S16, 2)
765 .scalarize(0)
766 .widenScalarToNextPow2(0, 32)
767 .lower();
768 } else if (ST.has16BitInsts()) {
769 getActionDefinitionsBuilder({G_ADD, G_SUB})
770 .legalFor({S32, S16})
771 .minScalar(0, S16)
772 .widenScalarToNextMultipleOf(0, 32)
773 .maxScalar(0, S32)
774 .scalarize(0);
776 getActionDefinitionsBuilder(G_MUL)
777 .legalFor({S32, S16})
778 .scalarize(0)
779 .minScalar(0, S16)
780 .widenScalarToNextMultipleOf(0, 32)
781 .custom();
782 assert(ST.hasMad64_32());
784 // Technically the saturating operations require clamp bit support, but this
785 // was introduced at the same time as 16-bit operations.
786 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
787 .legalFor({S32, S16}) // Clamp modifier
788 .minScalar(0, S16)
789 .scalarize(0)
790 .widenScalarToNextPow2(0, 16)
791 .lower();
793 // We're just lowering this, but it helps get a better result to try to
794 // coerce to the desired type first.
795 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796 .minScalar(0, S16)
797 .scalarize(0)
798 .lower();
799 } else {
800 getActionDefinitionsBuilder({G_ADD, G_SUB})
801 .legalFor({S32})
802 .widenScalarToNextMultipleOf(0, 32)
803 .clampScalar(0, S32, S32)
804 .scalarize(0);
806 auto &Mul = getActionDefinitionsBuilder(G_MUL)
807 .legalFor({S32})
808 .scalarize(0)
809 .minScalar(0, S32)
810 .widenScalarToNextMultipleOf(0, 32);
812 if (ST.hasMad64_32())
813 Mul.custom();
814 else
815 Mul.maxScalar(0, S32);
817 if (ST.hasIntClamp()) {
818 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
819 .legalFor({S32}) // Clamp modifier.
820 .scalarize(0)
821 .minScalarOrElt(0, S32)
822 .lower();
823 } else {
824 // Clamp bit support was added in VI, along with 16-bit operations.
825 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
826 .minScalar(0, S32)
827 .scalarize(0)
828 .lower();
831 // FIXME: DAG expansion gets better results. The widening uses the smaller
832 // range values and goes for the min/max lowering directly.
833 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
834 .minScalar(0, S32)
835 .scalarize(0)
836 .lower();
839 getActionDefinitionsBuilder(
840 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
841 .customFor({S32, S64})
842 .clampScalar(0, S32, S64)
843 .widenScalarToNextPow2(0, 32)
844 .scalarize(0);
846 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
847 .legalFor({S32})
848 .maxScalar(0, S32);
850 if (ST.hasVOP3PInsts()) {
851 Mulh
852 .clampMaxNumElements(0, S8, 2)
853 .lowerFor({V2S8});
856 Mulh
857 .scalarize(0)
858 .lower();
860 // Report legal for any types we can handle anywhere. For the cases only legal
861 // on the SALU, RegBankSelect will be able to re-legalize.
862 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
863 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
864 .clampScalar(0, S32, S64)
865 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
866 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
867 .widenScalarToNextPow2(0)
868 .scalarize(0);
870 getActionDefinitionsBuilder(
871 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
872 .legalFor({{S32, S1}, {S32, S32}})
873 .clampScalar(0, S32, S32)
874 .scalarize(0);
876 getActionDefinitionsBuilder(G_BITCAST)
877 // Don't worry about the size constraint.
878 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
879 .lower();
881 getActionDefinitionsBuilder(G_CONSTANT)
882 .legalFor({S1, S32, S64, S16, GlobalPtr,
883 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
884 .legalIf(isPointer(0))
885 .clampScalar(0, S32, S64)
886 .widenScalarToNextPow2(0);
888 getActionDefinitionsBuilder(G_FCONSTANT)
889 .legalFor({S32, S64, S16})
890 .clampScalar(0, S16, S64);
892 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
893 .legalIf(isRegisterClassType(0))
894 // s1 and s16 are special cases because they have legal operations on
895 // them, but don't really occupy registers in the normal way.
896 .legalFor({S1, S16})
897 .clampNumElements(0, V16S32, V32S32)
898 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
899 .clampScalarOrElt(0, S32, MaxScalar)
900 .widenScalarToNextPow2(0, 32)
901 .clampMaxNumElements(0, S32, 16);
903 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
905 // If the amount is divergent, we have to do a wave reduction to get the
906 // maximum value, so this is expanded during RegBankSelect.
907 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
908 .legalFor({{PrivatePtr, S32}});
910 getActionDefinitionsBuilder(G_STACKSAVE)
911 .customFor({PrivatePtr});
912 getActionDefinitionsBuilder(G_STACKRESTORE)
913 .legalFor({PrivatePtr});
915 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
917 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
918 .customIf(typeIsNot(0, PrivatePtr));
920 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
922 auto &FPOpActions = getActionDefinitionsBuilder(
923 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
924 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
925 .legalFor({S32, S64});
926 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
927 .customFor({S32, S64});
928 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
929 .customFor({S32, S64});
931 if (ST.has16BitInsts()) {
932 if (ST.hasVOP3PInsts())
933 FPOpActions.legalFor({S16, V2S16});
934 else
935 FPOpActions.legalFor({S16});
937 TrigActions.customFor({S16});
938 FDIVActions.customFor({S16});
941 if (ST.hasPackedFP32Ops()) {
942 FPOpActions.legalFor({V2S32});
943 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
946 auto &MinNumMaxNum = getActionDefinitionsBuilder({
947 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
949 if (ST.hasVOP3PInsts()) {
950 MinNumMaxNum.customFor(FPTypesPK16)
951 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
952 .clampMaxNumElements(0, S16, 2)
953 .clampScalar(0, S16, S64)
954 .scalarize(0);
955 } else if (ST.has16BitInsts()) {
956 MinNumMaxNum.customFor(FPTypes16)
957 .clampScalar(0, S16, S64)
958 .scalarize(0);
959 } else {
960 MinNumMaxNum.customFor(FPTypesBase)
961 .clampScalar(0, S32, S64)
962 .scalarize(0);
965 if (ST.hasVOP3PInsts())
966 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
968 FPOpActions
969 .scalarize(0)
970 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
972 TrigActions
973 .scalarize(0)
974 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
976 FDIVActions
977 .scalarize(0)
978 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
980 getActionDefinitionsBuilder({G_FNEG, G_FABS})
981 .legalFor(FPTypesPK16)
982 .clampMaxNumElementsStrict(0, S16, 2)
983 .scalarize(0)
984 .clampScalar(0, S16, S64);
986 if (ST.has16BitInsts()) {
987 getActionDefinitionsBuilder(G_FSQRT)
988 .legalFor({S16})
989 .customFor({S32, S64})
990 .scalarize(0)
991 .unsupported();
992 getActionDefinitionsBuilder(G_FFLOOR)
993 .legalFor({S32, S64, S16})
994 .scalarize(0)
995 .clampScalar(0, S16, S64);
997 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
998 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
999 .scalarize(0)
1000 .maxScalarIf(typeIs(0, S16), 1, S16)
1001 .clampScalar(1, S32, S32)
1002 .lower();
1004 getActionDefinitionsBuilder(G_FFREXP)
1005 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1006 .scalarize(0)
1007 .lower();
1008 } else {
1009 getActionDefinitionsBuilder(G_FSQRT)
1010 .customFor({S32, S64, S16})
1011 .scalarize(0)
1012 .unsupported();
1015 if (ST.hasFractBug()) {
1016 getActionDefinitionsBuilder(G_FFLOOR)
1017 .customFor({S64})
1018 .legalFor({S32, S64})
1019 .scalarize(0)
1020 .clampScalar(0, S32, S64);
1021 } else {
1022 getActionDefinitionsBuilder(G_FFLOOR)
1023 .legalFor({S32, S64})
1024 .scalarize(0)
1025 .clampScalar(0, S32, S64);
1028 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1029 .legalFor({{S32, S32}, {S64, S32}})
1030 .scalarize(0)
1031 .clampScalar(0, S32, S64)
1032 .clampScalar(1, S32, S32)
1033 .lower();
1035 getActionDefinitionsBuilder(G_FFREXP)
1036 .customFor({{S32, S32}, {S64, S32}})
1037 .scalarize(0)
1038 .minScalar(0, S32)
1039 .clampScalar(1, S32, S32)
1040 .lower();
1043 getActionDefinitionsBuilder(G_FPTRUNC)
1044 .legalFor({{S32, S64}, {S16, S32}})
1045 .scalarize(0)
1046 .lower();
1048 getActionDefinitionsBuilder(G_FPEXT)
1049 .legalFor({{S64, S32}, {S32, S16}})
1050 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1051 .scalarize(0);
1053 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1054 if (ST.has16BitInsts()) {
1055 FSubActions
1056 // Use actual fsub instruction
1057 .legalFor({S32, S16})
1058 // Must use fadd + fneg
1059 .lowerFor({S64, V2S16});
1060 } else {
1061 FSubActions
1062 // Use actual fsub instruction
1063 .legalFor({S32})
1064 // Must use fadd + fneg
1065 .lowerFor({S64, S16, V2S16});
1068 FSubActions
1069 .scalarize(0)
1070 .clampScalar(0, S32, S64);
1072 // Whether this is legal depends on the floating point mode for the function.
1073 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1074 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1075 FMad.customFor({S32, S16});
1076 else if (ST.hasMadMacF32Insts())
1077 FMad.customFor({S32});
1078 else if (ST.hasMadF16())
1079 FMad.customFor({S16});
1080 FMad.scalarize(0)
1081 .lower();
1083 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1084 if (ST.has16BitInsts()) {
1085 FRem.customFor({S16, S32, S64});
1086 } else {
1087 FRem.minScalar(0, S32)
1088 .customFor({S32, S64});
1090 FRem.scalarize(0);
1092 // TODO: Do we need to clamp maximum bitwidth?
1093 getActionDefinitionsBuilder(G_TRUNC)
1094 .legalIf(isScalar(0))
1095 .legalFor({{V2S16, V2S32}})
1096 .clampMaxNumElements(0, S16, 2)
1097 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1098 // situations (like an invalid implicit use), we don't want to loop
1099 // infinitely in the legalizer.
1100 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1101 .alwaysLegal();
1103 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1104 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1105 {S32, S1}, {S64, S1}, {S16, S1}})
1106 .scalarize(0)
1107 .clampScalar(0, S32, S64)
1108 .widenScalarToNextPow2(1, 32);
1110 // TODO: Split s1->s64 during regbankselect for VALU.
1111 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1112 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1113 .lowerIf(typeIs(1, S1))
1114 .customFor({{S32, S64}, {S64, S64}});
1115 if (ST.has16BitInsts())
1116 IToFP.legalFor({{S16, S16}});
1117 IToFP.clampScalar(1, S32, S64)
1118 .minScalar(0, S32)
1119 .scalarize(0)
1120 .widenScalarToNextPow2(1);
1122 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1123 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1124 .customFor({{S64, S32}, {S64, S64}})
1125 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1126 if (ST.has16BitInsts())
1127 FPToI.legalFor({{S16, S16}});
1128 else
1129 FPToI.minScalar(1, S32);
1131 FPToI.minScalar(0, S32)
1132 .widenScalarToNextPow2(0, 32)
1133 .scalarize(0)
1134 .lower();
1136 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1137 .clampScalar(0, S16, S64)
1138 .scalarize(0)
1139 .lower();
1141 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1142 .legalFor({S16, S32})
1143 .scalarize(0)
1144 .lower();
1146 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1147 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1148 .scalarize(0)
1149 .lower();
1151 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1152 .clampScalar(0, S16, S64)
1153 .scalarize(0)
1154 .lower();
1156 if (ST.has16BitInsts()) {
1157 getActionDefinitionsBuilder(
1158 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor({S16, S32, S64})
1160 .clampScalar(0, S16, S64)
1161 .scalarize(0);
1162 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1163 getActionDefinitionsBuilder(
1164 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1165 .legalFor({S32, S64})
1166 .clampScalar(0, S32, S64)
1167 .scalarize(0);
1168 } else {
1169 getActionDefinitionsBuilder(
1170 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1171 .legalFor({S32})
1172 .customFor({S64})
1173 .clampScalar(0, S32, S64)
1174 .scalarize(0);
1177 getActionDefinitionsBuilder(G_PTR_ADD)
1178 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1179 .legalIf(all(isPointer(0), sameSize(0, 1)))
1180 .scalarize(0)
1181 .scalarSameSizeAs(1, 0);
1183 getActionDefinitionsBuilder(G_PTRMASK)
1184 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1185 .scalarSameSizeAs(1, 0)
1186 .scalarize(0);
1188 auto &CmpBuilder =
1189 getActionDefinitionsBuilder(G_ICMP)
1190 // The compare output type differs based on the register bank of the output,
1191 // so make both s1 and s32 legal.
1193 // Scalar compares producing output in scc will be promoted to s32, as that
1194 // is the allocatable register type that will be needed for the copy from
1195 // scc. This will be promoted during RegBankSelect, and we assume something
1196 // before that won't try to use s32 result types.
1198 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1199 // bank.
1200 .legalForCartesianProduct(
1201 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1202 .legalForCartesianProduct(
1203 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1204 if (ST.has16BitInsts()) {
1205 CmpBuilder.legalFor({{S1, S16}});
1208 CmpBuilder
1209 .widenScalarToNextPow2(1)
1210 .clampScalar(1, S32, S64)
1211 .scalarize(0)
1212 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1214 auto &FCmpBuilder =
1215 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1216 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1218 if (ST.hasSALUFloatInsts())
1219 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1221 FCmpBuilder
1222 .widenScalarToNextPow2(1)
1223 .clampScalar(1, S32, S64)
1224 .scalarize(0);
1226 // FIXME: fpow has a selection pattern that should move to custom lowering.
1227 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1228 if (ST.has16BitInsts())
1229 ExpOps.customFor({{S32}, {S16}});
1230 else
1231 ExpOps.customFor({S32});
1232 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1233 .scalarize(0);
1235 getActionDefinitionsBuilder(G_FPOWI)
1236 .clampScalar(0, MinScalarFPTy, S32)
1237 .lower();
1239 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1240 Log2Ops.customFor({S32});
1241 if (ST.has16BitInsts())
1242 Log2Ops.legalFor({S16});
1243 else
1244 Log2Ops.customFor({S16});
1245 Log2Ops.scalarize(0)
1246 .lower();
1248 auto &LogOps =
1249 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1250 LogOps.customFor({S32, S16});
1251 LogOps.clampScalar(0, MinScalarFPTy, S32)
1252 .scalarize(0);
1254 // The 64-bit versions produce 32-bit results, but only on the SALU.
1255 getActionDefinitionsBuilder(G_CTPOP)
1256 .legalFor({{S32, S32}, {S32, S64}})
1257 .clampScalar(0, S32, S32)
1258 .widenScalarToNextPow2(1, 32)
1259 .clampScalar(1, S32, S64)
1260 .scalarize(0)
1261 .widenScalarToNextPow2(0, 32);
1263 // If no 16 bit instr is available, lower into different instructions.
1264 if (ST.has16BitInsts())
1265 getActionDefinitionsBuilder(G_IS_FPCLASS)
1266 .legalForCartesianProduct({S1}, FPTypes16)
1267 .widenScalarToNextPow2(1)
1268 .scalarize(0)
1269 .lower();
1270 else
1271 getActionDefinitionsBuilder(G_IS_FPCLASS)
1272 .legalForCartesianProduct({S1}, FPTypesBase)
1273 .lowerFor({S1, S16})
1274 .widenScalarToNextPow2(1)
1275 .scalarize(0)
1276 .lower();
1278 // The hardware instructions return a different result on 0 than the generic
1279 // instructions expect. The hardware produces -1, but these produce the
1280 // bitwidth.
1281 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1282 .scalarize(0)
1283 .clampScalar(0, S32, S32)
1284 .clampScalar(1, S32, S64)
1285 .widenScalarToNextPow2(0, 32)
1286 .widenScalarToNextPow2(1, 32)
1287 .custom();
1289 // The 64-bit versions produce 32-bit results, but only on the SALU.
1290 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1291 .legalFor({{S32, S32}, {S32, S64}})
1292 .customIf(scalarNarrowerThan(1, 32))
1293 .clampScalar(0, S32, S32)
1294 .clampScalar(1, S32, S64)
1295 .scalarize(0)
1296 .widenScalarToNextPow2(0, 32)
1297 .widenScalarToNextPow2(1, 32);
1299 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1300 .legalFor({{S32, S32}, {S32, S64}})
1301 .clampScalar(0, S32, S32)
1302 .clampScalar(1, S32, S64)
1303 .scalarize(0)
1304 .widenScalarToNextPow2(0, 32)
1305 .widenScalarToNextPow2(1, 32);
1307 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1308 // RegBankSelect.
1309 getActionDefinitionsBuilder(G_BITREVERSE)
1310 .legalFor({S32, S64})
1311 .clampScalar(0, S32, S64)
1312 .scalarize(0)
1313 .widenScalarToNextPow2(0);
1315 if (ST.has16BitInsts()) {
1316 getActionDefinitionsBuilder(G_BSWAP)
1317 .legalFor({S16, S32, V2S16})
1318 .clampMaxNumElementsStrict(0, S16, 2)
1319 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1320 // narrowScalar limitation.
1321 .widenScalarToNextPow2(0)
1322 .clampScalar(0, S16, S32)
1323 .scalarize(0);
1325 if (ST.hasVOP3PInsts()) {
1326 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1327 .legalFor({S32, S16, V2S16})
1328 .clampMaxNumElements(0, S16, 2)
1329 .minScalar(0, S16)
1330 .widenScalarToNextPow2(0)
1331 .scalarize(0)
1332 .lower();
1333 } else {
1334 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1335 .legalFor({S32, S16})
1336 .widenScalarToNextPow2(0)
1337 .minScalar(0, S16)
1338 .scalarize(0)
1339 .lower();
1341 } else {
1342 // TODO: Should have same legality without v_perm_b32
1343 getActionDefinitionsBuilder(G_BSWAP)
1344 .legalFor({S32})
1345 .lowerIf(scalarNarrowerThan(0, 32))
1346 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1347 // narrowScalar limitation.
1348 .widenScalarToNextPow2(0)
1349 .maxScalar(0, S32)
1350 .scalarize(0)
1351 .lower();
1353 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1354 .legalFor({S32})
1355 .minScalar(0, S32)
1356 .widenScalarToNextPow2(0)
1357 .scalarize(0)
1358 .lower();
1361 getActionDefinitionsBuilder(G_INTTOPTR)
1362 // List the common cases
1363 .legalForCartesianProduct(AddrSpaces64, {S64})
1364 .legalForCartesianProduct(AddrSpaces32, {S32})
1365 .scalarize(0)
1366 // Accept any address space as long as the size matches
1367 .legalIf(sameSize(0, 1))
1368 .widenScalarIf(smallerThan(1, 0),
1369 [](const LegalityQuery &Query) {
1370 return std::pair(
1371 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1373 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1374 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1377 getActionDefinitionsBuilder(G_PTRTOINT)
1378 // List the common cases
1379 .legalForCartesianProduct(AddrSpaces64, {S64})
1380 .legalForCartesianProduct(AddrSpaces32, {S32})
1381 .scalarize(0)
1382 // Accept any address space as long as the size matches
1383 .legalIf(sameSize(0, 1))
1384 .widenScalarIf(smallerThan(0, 1),
1385 [](const LegalityQuery &Query) {
1386 return std::pair(
1387 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1389 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1390 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1393 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1394 .scalarize(0)
1395 .custom();
1397 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1398 bool IsLoad) -> bool {
1399 const LLT DstTy = Query.Types[0];
1401 // Split vector extloads.
1402 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1404 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1405 return true;
1407 const LLT PtrTy = Query.Types[1];
1408 unsigned AS = PtrTy.getAddressSpace();
1409 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1410 Query.MMODescrs[0].Ordering !=
1411 AtomicOrdering::NotAtomic))
1412 return true;
1414 // Catch weird sized loads that don't evenly divide into the access sizes
1415 // TODO: May be able to widen depending on alignment etc.
1416 unsigned NumRegs = (MemSize + 31) / 32;
1417 if (NumRegs == 3) {
1418 if (!ST.hasDwordx3LoadStores())
1419 return true;
1420 } else {
1421 // If the alignment allows, these should have been widened.
1422 if (!isPowerOf2_32(NumRegs))
1423 return true;
1426 return false;
1429 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1430 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1431 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1433 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1434 // LDS
1435 // TODO: Unsupported flat for SI.
1437 for (unsigned Op : {G_LOAD, G_STORE}) {
1438 const bool IsStore = Op == G_STORE;
1440 auto &Actions = getActionDefinitionsBuilder(Op);
1441 // Explicitly list some common cases.
1442 // TODO: Does this help compile time at all?
1443 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1444 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1445 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1446 {S64, GlobalPtr, S64, GlobalAlign32},
1447 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1448 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1449 {S32, GlobalPtr, S8, GlobalAlign8},
1450 {S32, GlobalPtr, S16, GlobalAlign16},
1452 {S32, LocalPtr, S32, 32},
1453 {S64, LocalPtr, S64, 32},
1454 {V2S32, LocalPtr, V2S32, 32},
1455 {S32, LocalPtr, S8, 8},
1456 {S32, LocalPtr, S16, 16},
1457 {V2S16, LocalPtr, S32, 32},
1459 {S32, PrivatePtr, S32, 32},
1460 {S32, PrivatePtr, S8, 8},
1461 {S32, PrivatePtr, S16, 16},
1462 {V2S16, PrivatePtr, S32, 32},
1464 {S32, ConstantPtr, S32, GlobalAlign32},
1465 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1466 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1467 {S64, ConstantPtr, S64, GlobalAlign32},
1468 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1469 Actions.legalIf(
1470 [=](const LegalityQuery &Query) -> bool {
1471 return isLoadStoreLegal(ST, Query);
1474 // The custom pointers (fat pointers, buffer resources) don't work with load
1475 // and store at this level. Fat pointers should have been lowered to
1476 // intrinsics before the translation to MIR.
1477 Actions.unsupportedIf(
1478 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1480 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1481 // ptrtoint. This is needed to account for the fact that we can't have i128
1482 // as a register class for SelectionDAG reasons.
1483 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1484 return hasBufferRsrcWorkaround(Query.Types[0]);
1487 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1488 // 64-bits.
1490 // TODO: Should generalize bitcast action into coerce, which will also cover
1491 // inserting addrspacecasts.
1492 Actions.customIf(typeIs(1, Constant32Ptr));
1494 // Turn any illegal element vectors into something easier to deal
1495 // with. These will ultimately produce 32-bit scalar shifts to extract the
1496 // parts anyway.
1498 // For odd 16-bit element vectors, prefer to split those into pieces with
1499 // 16-bit vector parts.
1500 Actions.bitcastIf(
1501 [=](const LegalityQuery &Query) -> bool {
1502 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1503 Query.MMODescrs[0].MemoryTy);
1504 }, bitcastToRegisterType(0));
1506 if (!IsStore) {
1507 // Widen suitably aligned loads by loading extra bytes. The standard
1508 // legalization actions can't properly express widening memory operands.
1509 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1510 return shouldWidenLoad(ST, Query, G_LOAD);
1514 // FIXME: load/store narrowing should be moved to lower action
1515 Actions
1516 .narrowScalarIf(
1517 [=](const LegalityQuery &Query) -> bool {
1518 return !Query.Types[0].isVector() &&
1519 needToSplitMemOp(Query, Op == G_LOAD);
1521 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1522 const LLT DstTy = Query.Types[0];
1523 const LLT PtrTy = Query.Types[1];
1525 const unsigned DstSize = DstTy.getSizeInBits();
1526 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1528 // Split extloads.
1529 if (DstSize > MemSize)
1530 return std::pair(0, LLT::scalar(MemSize));
1532 unsigned MaxSize = maxSizeForAddrSpace(
1533 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1534 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1535 if (MemSize > MaxSize)
1536 return std::pair(0, LLT::scalar(MaxSize));
1538 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1539 return std::pair(0, LLT::scalar(Align));
1541 .fewerElementsIf(
1542 [=](const LegalityQuery &Query) -> bool {
1543 return Query.Types[0].isVector() &&
1544 needToSplitMemOp(Query, Op == G_LOAD);
1546 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1547 const LLT DstTy = Query.Types[0];
1548 const LLT PtrTy = Query.Types[1];
1550 LLT EltTy = DstTy.getElementType();
1551 unsigned MaxSize = maxSizeForAddrSpace(
1552 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1553 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1555 // FIXME: Handle widened to power of 2 results better. This ends
1556 // up scalarizing.
1557 // FIXME: 3 element stores scalarized on SI
1559 // Split if it's too large for the address space.
1560 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1561 if (MemSize > MaxSize) {
1562 unsigned NumElts = DstTy.getNumElements();
1563 unsigned EltSize = EltTy.getSizeInBits();
1565 if (MaxSize % EltSize == 0) {
1566 return std::pair(
1567 0, LLT::scalarOrVector(
1568 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1571 unsigned NumPieces = MemSize / MaxSize;
1573 // FIXME: Refine when odd breakdowns handled
1574 // The scalars will need to be re-legalized.
1575 if (NumPieces == 1 || NumPieces >= NumElts ||
1576 NumElts % NumPieces != 0)
1577 return std::pair(0, EltTy);
1579 return std::pair(0,
1580 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1583 // FIXME: We could probably handle weird extending loads better.
1584 if (DstTy.getSizeInBits() > MemSize)
1585 return std::pair(0, EltTy);
1587 unsigned EltSize = EltTy.getSizeInBits();
1588 unsigned DstSize = DstTy.getSizeInBits();
1589 if (!isPowerOf2_32(DstSize)) {
1590 // We're probably decomposing an odd sized store. Try to split
1591 // to the widest type. TODO: Account for alignment. As-is it
1592 // should be OK, since the new parts will be further legalized.
1593 unsigned FloorSize = llvm::bit_floor(DstSize);
1594 return std::pair(
1595 0, LLT::scalarOrVector(
1596 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1599 // May need relegalization for the scalars.
1600 return std::pair(0, EltTy);
1602 .minScalar(0, S32)
1603 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1604 .widenScalarToNextPow2(0)
1605 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1606 .lower();
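// With these rules and the default flags, e.g. a <6 x s16> global load
// (96 bits) is not directly selectable and is bitcast to <3 x s32> by the
// bitcastIf rule above, while oversized or extending accesses are narrowed or
// split by the narrowScalarIf/fewerElementsIf rules.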
1609 // FIXME: Unaligned accesses not lowered.
1610 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1611 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1612 {S32, GlobalPtr, S16, 2 * 8},
1613 {S32, LocalPtr, S8, 8},
1614 {S32, LocalPtr, S16, 16},
1615 {S32, PrivatePtr, S8, 8},
1616 {S32, PrivatePtr, S16, 16},
1617 {S32, ConstantPtr, S8, 8},
1618 {S32, ConstantPtr, S16, 2 * 8}})
1619 .legalIf(
1620 [=](const LegalityQuery &Query) -> bool {
1621 return isLoadStoreLegal(ST, Query);
1624 if (ST.hasFlatAddressSpace()) {
1625 ExtLoads.legalForTypesWithMemDesc(
1626 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1629 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1630 // 64-bits.
1632 // TODO: Should generalize bitcast action into coerce, which will also cover
1633 // inserting addrspacecasts.
1634 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1636 ExtLoads.clampScalar(0, S32, S32)
1637 .widenScalarToNextPow2(0)
1638 .lower();
1640 auto &Atomics = getActionDefinitionsBuilder(
1641 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1642 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1643 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1644 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1645 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1646 {S64, GlobalPtr}, {S64, LocalPtr},
1647 {S32, RegionPtr}, {S64, RegionPtr}});
1648 if (ST.hasFlatAddressSpace()) {
1649 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1652 // TODO: v2bf16 operations, and fat buffer pointer support.
1653 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1654 if (ST.hasLDSFPAtomicAddF32()) {
1655 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1656 if (ST.hasLdsAtomicAddF64())
1657 Atomic.legalFor({{S64, LocalPtr}});
1658 if (ST.hasAtomicDsPkAdd16Insts())
1659 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1661 if (ST.hasAtomicFaddInsts())
1662 Atomic.legalFor({{S32, GlobalPtr}});
1663 if (ST.hasFlatAtomicFaddF32Inst())
1664 Atomic.legalFor({{S32, FlatPtr}});
1666 if (ST.hasGFX90AInsts()) {
1667 // These are legal with some caveats, and should have undergone expansion in
1668 // the IR in most situations
1669 // TODO: Move atomic expansion into legalizer
1670 Atomic.legalFor({
1671 {S32, GlobalPtr},
1672 {S64, GlobalPtr},
1673 {S64, FlatPtr}
1677 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1678 ST.hasAtomicBufferGlobalPkAddF16Insts())
1679 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1680 if (ST.hasAtomicGlobalPkAddBF16Inst())
1681 Atomic.legalFor({{V2BF16, GlobalPtr}});
1682 if (ST.hasAtomicFlatPkAdd16Insts())
1683 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1686 // Most of the legalization work here is done by AtomicExpand. We could
1687 // probably use a simpler legality rule that just assumes anything is OK.
1688 auto &AtomicFMinFMax =
1689 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1690 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1692 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1693 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1694 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1695 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1696 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1697 AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1698 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1699 AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1701 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1702 // demarshalling
1703 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1704 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1705 {S32, FlatPtr}, {S64, FlatPtr}})
1706 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1707 {S32, RegionPtr}, {S64, RegionPtr}});
1708 // TODO: Pointer types, any 32-bit or 64-bit vector
1710 // Condition should be s32 for scalar, s1 for vector.
1711 getActionDefinitionsBuilder(G_SELECT)
1712 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1713 LocalPtr, FlatPtr, PrivatePtr,
1714 LLT::fixed_vector(2, LocalPtr),
1715 LLT::fixed_vector(2, PrivatePtr)},
1716 {S1, S32})
1717 .clampScalar(0, S16, S64)
1718 .scalarize(1)
1719 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1720 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1721 .clampMaxNumElements(0, S32, 2)
1722 .clampMaxNumElements(0, LocalPtr, 2)
1723 .clampMaxNumElements(0, PrivatePtr, 2)
1724 .scalarize(0)
1725 .widenScalarToNextPow2(0)
1726 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1728 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1729 // be more flexible with the shift amount type.
1730 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1731 .legalFor({{S32, S32}, {S64, S32}});
1732 if (ST.has16BitInsts()) {
1733 if (ST.hasVOP3PInsts()) {
1734 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1735 .clampMaxNumElements(0, S16, 2);
1736 } else
1737 Shifts.legalFor({{S16, S16}});
1739 // TODO: Support 16-bit shift amounts for all types
1740 Shifts.widenScalarIf(
1741 [=](const LegalityQuery &Query) {
1742 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1743 // 32-bit amount.
1744 const LLT ValTy = Query.Types[0];
1745 const LLT AmountTy = Query.Types[1];
1746 return ValTy.getSizeInBits() <= 16 &&
1747 AmountTy.getSizeInBits() < 16;
1748 }, changeTo(1, S16));
1749 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1750 Shifts.clampScalar(1, S32, S32);
1751 Shifts.widenScalarToNextPow2(0, 16);
1752 Shifts.clampScalar(0, S16, S64);
1754 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755 .minScalar(0, S16)
1756 .scalarize(0)
1757 .lower();
1758 } else {
1759 // Make sure we legalize the shift amount type first, as the general
1760 // expansion for the shifted type will produce much worse code if it hasn't
1761 // been truncated already.
1762 Shifts.clampScalar(1, S32, S32);
1763 Shifts.widenScalarToNextPow2(0, 32);
1764 Shifts.clampScalar(0, S32, S64);
1766 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1767 .minScalar(0, S32)
1768 .scalarize(0)
1769 .lower();
1771 Shifts.scalarize(0);
1773 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1774 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1775 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1776 unsigned IdxTypeIdx = 2;
1778 getActionDefinitionsBuilder(Op)
1779 .customIf([=](const LegalityQuery &Query) {
1780 const LLT EltTy = Query.Types[EltTypeIdx];
1781 const LLT VecTy = Query.Types[VecTypeIdx];
1782 const LLT IdxTy = Query.Types[IdxTypeIdx];
1783 const unsigned EltSize = EltTy.getSizeInBits();
1784 const bool isLegalVecType =
1785 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1786 // Address space 8 pointers are 128-bit wide values, but the logic
1787 // below will try to bitcast them to 2N x s64, which will fail.
1788 // Therefore, as an intermediate step, wrap the extract/insert by
1789 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1790 // extraction result) so that it becomes a vector operation that can
1791 // be handled by the logic below.
1792 if (EltTy.isPointer() && EltSize > 64)
1793 return true;
1794 return (EltSize == 32 || EltSize == 64) &&
1795 VecTy.getSizeInBits() % 32 == 0 &&
1796 VecTy.getSizeInBits() <= MaxRegisterSize &&
1797 IdxTy.getSizeInBits() == 32 &&
1798 isLegalVecType;
1800 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1801 bitcastToVectorElement32(VecTypeIdx))
1802 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1803 .bitcastIf(
1804 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1805 [=](const LegalityQuery &Query) {
1806 // For > 64-bit element types, try to turn this into a 64-bit
1807 // element vector since we may be able to do better indexing
1808 // if this is scalar. If not, fall back to 32.
1809 const LLT EltTy = Query.Types[EltTypeIdx];
1810 const LLT VecTy = Query.Types[VecTypeIdx];
1811 const unsigned DstEltSize = EltTy.getSizeInBits();
1812 const unsigned VecSize = VecTy.getSizeInBits();
1814 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1815 return std::pair(
1816 VecTypeIdx,
1817 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1819 .clampScalar(EltTypeIdx, S32, S64)
1820 .clampScalar(VecTypeIdx, S32, S64)
1821 .clampScalar(IdxTypeIdx, S32, S32)
1822 .clampMaxNumElements(VecTypeIdx, S32, 32)
1823 // TODO: Clamp elements for 64-bit vectors?
1824 .moreElementsIf(
1825 isIllegalRegisterType(VecTypeIdx),
1826 moreElementsToNextExistingRegClass(VecTypeIdx))
1827 // It should only be necessary with variable indexes.
1828 // As a last resort, lower to the stack
1829 .lower();
1832 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1833 .unsupportedIf([=](const LegalityQuery &Query) {
1834 const LLT &EltTy = Query.Types[1].getElementType();
1835 return Query.Types[0] != EltTy;
1838 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1839 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1840 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1842 // FIXME: Doesn't handle extract of illegal sizes.
1843 getActionDefinitionsBuilder(Op)
1844 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1845 .lowerIf([=](const LegalityQuery &Query) {
1846 // Sub-vector (or single element) insert and extract.
1847 // TODO: verify immediate offset here since lower only works with
1848 // whole elements.
1849 const LLT BigTy = Query.Types[BigTyIdx];
1850 return BigTy.isVector();
1852 // FIXME: Multiples of 16 should not be legal.
1853 .legalIf([=](const LegalityQuery &Query) {
1854 const LLT BigTy = Query.Types[BigTyIdx];
1855 const LLT LitTy = Query.Types[LitTyIdx];
1856 return (BigTy.getSizeInBits() % 32 == 0) &&
1857 (LitTy.getSizeInBits() % 16 == 0);
1859 .widenScalarIf(
1860 [=](const LegalityQuery &Query) {
1861 const LLT BigTy = Query.Types[BigTyIdx];
1862 return (BigTy.getScalarSizeInBits() < 16);
1864 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1865 .widenScalarIf(
1866 [=](const LegalityQuery &Query) {
1867 const LLT LitTy = Query.Types[LitTyIdx];
1868 return (LitTy.getScalarSizeInBits() < 16);
1870 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1871 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1872 .widenScalarToNextPow2(BigTyIdx, 32);
1876 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1877 .legalForCartesianProduct(AllS32Vectors, {S32})
1878 .legalForCartesianProduct(AllS64Vectors, {S64})
1879 .clampNumElements(0, V16S32, V32S32)
1880 .clampNumElements(0, V2S64, V16S64)
1881 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1882 .moreElementsIf(
1883 isIllegalRegisterType(0),
1884 moreElementsToNextExistingRegClass(0));
1886 if (ST.hasScalarPackInsts()) {
1887 BuildVector
1888 // FIXME: Should probably widen s1 vectors straight to s32
1889 .minScalarOrElt(0, S16)
1890 .minScalar(1, S16);
1892 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1893 .legalFor({V2S16, S32})
1894 .lower();
1895 } else {
1896 BuildVector.customFor({V2S16, S16});
1897 BuildVector.minScalarOrElt(0, S32);
1899 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1900 .customFor({V2S16, S32})
1901 .lower();
1904 BuildVector.legalIf(isRegisterType(0));
1906 // FIXME: Clamp maximum size
1907 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1908 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1909 .clampMaxNumElements(0, S32, 32)
1910 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1911 .clampMaxNumElements(0, S16, 64);
1913 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1915 // Merge/Unmerge
1916 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1917 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1918 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1920 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1921 const LLT Ty = Query.Types[TypeIdx];
1922 if (Ty.isVector()) {
1923 const LLT &EltTy = Ty.getElementType();
1924 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1925 return true;
1926 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1927 return true;
1929 return false;
1932 auto &Builder = getActionDefinitionsBuilder(Op)
1933 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1934 .lowerFor({{S16, V2S16}})
1935 .lowerIf([=](const LegalityQuery &Query) {
1936 const LLT BigTy = Query.Types[BigTyIdx];
1937 return BigTy.getSizeInBits() == 32;
1939 // Try to widen to s16 first for small types.
1940 // TODO: Only do this on targets with legal s16 shifts
1941 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1942 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1943 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1944 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1945 elementTypeIs(1, S16)),
1946 changeTo(1, V2S16))
1947 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1948 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1949 // valid.
1950 .clampScalar(LitTyIdx, S32, S512)
1951 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1952 // Break up vectors with weird elements into scalars
1953 .fewerElementsIf(
1954 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1955 scalarize(0))
1956 .fewerElementsIf(
1957 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1958 scalarize(1))
1959 .clampScalar(BigTyIdx, S32, MaxScalar);
1961 if (Op == G_MERGE_VALUES) {
1962 Builder.widenScalarIf(
1963 // TODO: Use 16-bit shifts if legal for 8-bit values?
1964 [=](const LegalityQuery &Query) {
1965 const LLT Ty = Query.Types[LitTyIdx];
1966 return Ty.getSizeInBits() < 32;
1968 changeTo(LitTyIdx, S32));
1971 Builder.widenScalarIf(
1972 [=](const LegalityQuery &Query) {
1973 const LLT Ty = Query.Types[BigTyIdx];
1974 return Ty.getSizeInBits() % 16 != 0;
1976 [=](const LegalityQuery &Query) {
1977 // Pick the next power of 2, or a multiple of 64 over 128.
1978 // Whichever is smaller.
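// Illustrative examples (not from the original comment): a 90-bit type
// widens to the next power of 2 (128), while a 300-bit type widens to 320
// (the next multiple of 64) rather than all the way up to 512.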
1979 const LLT &Ty = Query.Types[BigTyIdx];
1980 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1981 if (NewSizeInBits >= 256) {
1982 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1983 if (RoundedTo < NewSizeInBits)
1984 NewSizeInBits = RoundedTo;
1986 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1988 // Any vectors left are the wrong size. Scalarize them.
1989 .scalarize(0)
1990 .scalarize(1);
1993 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1994 // RegBankSelect.
1995 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1996 .legalFor({{S32}, {S64}});
1998 if (ST.hasVOP3PInsts()) {
1999 SextInReg.lowerFor({{V2S16}})
2000 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2001 // get more vector shift opportunities, since we'll get those when
2002 // expanded.
2003 .clampMaxNumElementsStrict(0, S16, 2);
2004 } else if (ST.has16BitInsts()) {
2005 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2006 } else {
2007 // Prefer to promote to s32 before lowering if we don't have 16-bit
2008 // shifts. This avoids a lot of intermediate truncate and extend operations.
2009 SextInReg.lowerFor({{S32}, {S64}});
2012 SextInReg
2013 .scalarize(0)
2014 .clampScalar(0, S32, S64)
2015 .lower();
2017 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2018 .scalarize(0)
2019 .lower();
2021 // TODO: Only try to form v2s16 with legal packed instructions.
2022 getActionDefinitionsBuilder(G_FSHR)
2023 .legalFor({{S32, S32}})
2024 .lowerFor({{V2S16, V2S16}})
2025 .clampMaxNumElementsStrict(0, S16, 2)
2026 .scalarize(0)
2027 .lower();
2029 if (ST.hasVOP3PInsts()) {
2030 getActionDefinitionsBuilder(G_FSHL)
2031 .lowerFor({{V2S16, V2S16}})
2032 .clampMaxNumElementsStrict(0, S16, 2)
2033 .scalarize(0)
2034 .lower();
2035 } else {
2036 getActionDefinitionsBuilder(G_FSHL)
2037 .scalarize(0)
2038 .lower();
2041 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2042 .legalFor({S64});
2044 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2046 getActionDefinitionsBuilder(G_FENCE)
2047 .alwaysLegal();
2049 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2050 .scalarize(0)
2051 .minScalar(0, S32)
2052 .lower();
2054 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2055 .legalFor({{S32, S32}, {S64, S32}})
2056 .clampScalar(1, S32, S32)
2057 .clampScalar(0, S32, S64)
2058 .widenScalarToNextPow2(0)
2059 .scalarize(0);
2061 getActionDefinitionsBuilder(
2062 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2063 G_FCOPYSIGN,
2065 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2066 G_READ_REGISTER, G_WRITE_REGISTER,
2068 G_SADDO, G_SSUBO})
2069 .lower();
2071 if (ST.hasIEEEMinMax()) {
2072 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2073 .legalFor(FPTypesPK16)
2074 .clampMaxNumElements(0, S16, 2)
2075 .scalarize(0);
2076 } else {
2077 // TODO: Implement
2078 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2081 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2082 .lower();
2084 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2086 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2087 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2088 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2089 .unsupported();
2091 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2093 getLegacyLegalizerInfo().computeTables();
2094 verify(*ST.getInstrInfo());
2097 bool AMDGPULegalizerInfo::legalizeCustom(
2098 LegalizerHelper &Helper, MachineInstr &MI,
2099 LostDebugLocObserver &LocObserver) const {
2100 MachineIRBuilder &B = Helper.MIRBuilder;
2101 MachineRegisterInfo &MRI = *B.getMRI();
2103 switch (MI.getOpcode()) {
2104 case TargetOpcode::G_ADDRSPACE_CAST:
2105 return legalizeAddrSpaceCast(MI, MRI, B);
2106 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2107 return legalizeFroundeven(MI, MRI, B);
2108 case TargetOpcode::G_FCEIL:
2109 return legalizeFceil(MI, MRI, B);
2110 case TargetOpcode::G_FREM:
2111 return legalizeFrem(MI, MRI, B);
2112 case TargetOpcode::G_INTRINSIC_TRUNC:
2113 return legalizeIntrinsicTrunc(MI, MRI, B);
2114 case TargetOpcode::G_SITOFP:
2115 return legalizeITOFP(MI, MRI, B, true);
2116 case TargetOpcode::G_UITOFP:
2117 return legalizeITOFP(MI, MRI, B, false);
2118 case TargetOpcode::G_FPTOSI:
2119 return legalizeFPTOI(MI, MRI, B, true);
2120 case TargetOpcode::G_FPTOUI:
2121 return legalizeFPTOI(MI, MRI, B, false);
2122 case TargetOpcode::G_FMINNUM:
2123 case TargetOpcode::G_FMAXNUM:
2124 case TargetOpcode::G_FMINNUM_IEEE:
2125 case TargetOpcode::G_FMAXNUM_IEEE:
2126 return legalizeMinNumMaxNum(Helper, MI);
2127 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2128 return legalizeExtractVectorElt(MI, MRI, B);
2129 case TargetOpcode::G_INSERT_VECTOR_ELT:
2130 return legalizeInsertVectorElt(MI, MRI, B);
2131 case TargetOpcode::G_FSIN:
2132 case TargetOpcode::G_FCOS:
2133 return legalizeSinCos(MI, MRI, B);
2134 case TargetOpcode::G_GLOBAL_VALUE:
2135 return legalizeGlobalValue(MI, MRI, B);
2136 case TargetOpcode::G_LOAD:
2137 case TargetOpcode::G_SEXTLOAD:
2138 case TargetOpcode::G_ZEXTLOAD:
2139 return legalizeLoad(Helper, MI);
2140 case TargetOpcode::G_STORE:
2141 return legalizeStore(Helper, MI);
2142 case TargetOpcode::G_FMAD:
2143 return legalizeFMad(MI, MRI, B);
2144 case TargetOpcode::G_FDIV:
2145 return legalizeFDIV(MI, MRI, B);
2146 case TargetOpcode::G_FFREXP:
2147 return legalizeFFREXP(MI, MRI, B);
2148 case TargetOpcode::G_FSQRT:
2149 return legalizeFSQRT(MI, MRI, B);
2150 case TargetOpcode::G_UDIV:
2151 case TargetOpcode::G_UREM:
2152 case TargetOpcode::G_UDIVREM:
2153 return legalizeUnsignedDIV_REM(MI, MRI, B);
2154 case TargetOpcode::G_SDIV:
2155 case TargetOpcode::G_SREM:
2156 case TargetOpcode::G_SDIVREM:
2157 return legalizeSignedDIV_REM(MI, MRI, B);
2158 case TargetOpcode::G_ATOMIC_CMPXCHG:
2159 return legalizeAtomicCmpXChg(MI, MRI, B);
2160 case TargetOpcode::G_FLOG2:
2161 return legalizeFlog2(MI, B);
2162 case TargetOpcode::G_FLOG:
2163 case TargetOpcode::G_FLOG10:
2164 return legalizeFlogCommon(MI, B);
2165 case TargetOpcode::G_FEXP2:
2166 return legalizeFExp2(MI, B);
2167 case TargetOpcode::G_FEXP:
2168 case TargetOpcode::G_FEXP10:
2169 return legalizeFExp(MI, B);
2170 case TargetOpcode::G_FPOW:
2171 return legalizeFPow(MI, B);
2172 case TargetOpcode::G_FFLOOR:
2173 return legalizeFFloor(MI, MRI, B);
2174 case TargetOpcode::G_BUILD_VECTOR:
2175 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2176 return legalizeBuildVector(MI, MRI, B);
2177 case TargetOpcode::G_MUL:
2178 return legalizeMul(Helper, MI);
2179 case TargetOpcode::G_CTLZ:
2180 case TargetOpcode::G_CTTZ:
2181 return legalizeCTLZ_CTTZ(MI, MRI, B);
2182 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2183 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2184 case TargetOpcode::G_STACKSAVE:
2185 return legalizeStackSave(MI, B);
2186 case TargetOpcode::G_GET_FPENV:
2187 return legalizeGetFPEnv(MI, MRI, B);
2188 case TargetOpcode::G_SET_FPENV:
2189 return legalizeSetFPEnv(MI, MRI, B);
2190 case TargetOpcode::G_TRAP:
2191 return legalizeTrap(MI, MRI, B);
2192 case TargetOpcode::G_DEBUGTRAP:
2193 return legalizeDebugTrap(MI, MRI, B);
2194 default:
2195 return false;
2198 llvm_unreachable("expected switch to return");
2201 Register AMDGPULegalizerInfo::getSegmentAperture(
2202 unsigned AS,
2203 MachineRegisterInfo &MRI,
2204 MachineIRBuilder &B) const {
2205 MachineFunction &MF = B.getMF();
2206 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2207 const LLT S32 = LLT::scalar(32);
2208 const LLT S64 = LLT::scalar(64);
2210 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2212 if (ST.hasApertureRegs()) {
2213 // Note: this register is somewhat broken. When used as a 32-bit operand,
2214 // it only returns zeroes. The real value is in the upper 32 bits.
2215 // Thus, we must emit an extract of the high 32 bits.
2216 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2217 ? AMDGPU::SRC_SHARED_BASE
2218 : AMDGPU::SRC_PRIVATE_BASE;
2219 // FIXME: It would be more natural to emit a COPY here, but then copy
2220 // coalescing would kick in and it would think it's okay to use the "HI"
2221 // subregister (instead of extracting the HI 32 bits) which is an artificial
2222 // (unusable) register.
2223 // Register TableGen definitions would need an overhaul to get rid of the
2224 // artificial "HI" aperture registers and prevent this kind of issue from
2225 // happening.
2226 Register Dst = MRI.createGenericVirtualRegister(S64);
2227 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2228 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2229 return B.buildUnmerge(S32, Dst).getReg(1);
2232 // TODO: can we be smarter about machine pointer info?
2233 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2234 Register LoadAddr = MRI.createGenericVirtualRegister(
2235 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2236 // For code object version 5, private_base and shared_base are passed through
2237 // implicit kernargs.
2238 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2239 AMDGPU::AMDHSA_COV5) {
2240 AMDGPUTargetLowering::ImplicitParameter Param =
2241 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2242 : AMDGPUTargetLowering::PRIVATE_BASE;
2243 uint64_t Offset =
2244 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2246 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2247 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2249 if (!loadInputValue(KernargPtrReg, B,
2250 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2251 return Register();
2253 MachineMemOperand *MMO = MF.getMachineMemOperand(
2254 PtrInfo,
2255 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2256 MachineMemOperand::MOInvariant,
2257 LLT::scalar(32), commonAlignment(Align(64), Offset));
2259 // Pointer address
2260 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2261 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2262 // Load address
2263 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2266 Register QueuePtr = MRI.createGenericVirtualRegister(
2267 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2269 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2270 return Register();
2272 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2273 // private_segment_aperture_base_hi.
2274 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2276 MachineMemOperand *MMO = MF.getMachineMemOperand(
2277 PtrInfo,
2278 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2279 MachineMemOperand::MOInvariant,
2280 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2282 B.buildPtrAdd(LoadAddr, QueuePtr,
2283 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2284 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2287 /// Return true if the value is a known valid address, such that a null check is
2288 /// not necessary.
2289 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2290 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2291 MachineInstr *Def = MRI.getVRegDef(Val);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2296 return true;
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2299 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2301 default:
2302 return false;
2305 return false;
2308 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2309 MachineInstr &MI, MachineRegisterInfo &MRI,
2310 MachineIRBuilder &B) const {
2311 MachineFunction &MF = B.getMF();
2313 // MI can either be a G_ADDRSPACE_CAST or a
2314 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2315 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2319 const LLT S32 = LLT::scalar(32);
2320 Register Dst = MI.getOperand(0).getReg();
2321 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2322 : MI.getOperand(1).getReg();
2323 LLT DstTy = MRI.getType(Dst);
2324 LLT SrcTy = MRI.getType(Src);
2325 unsigned DestAS = DstTy.getAddressSpace();
2326 unsigned SrcAS = SrcTy.getAddressSpace();
2328 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2329 // vector element.
2330 assert(!DstTy.isVector());
2332 const AMDGPUTargetMachine &TM
2333 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2335 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2337 return true;
2340 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2341 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2342 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2343 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2344 // G_ADDRSPACE_CAST we need to guess.
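// If so, the cast is just a truncation of the pointer; otherwise the
// lowering below guards it with a null check:
//   dst = (src != flat_null) ? lo32(src) : segment_null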
2345 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2346 // Extract low 32-bits of the pointer.
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2349 return true;
2352 unsigned NullVal = TM.getNullPointerValue(DestAS);
2354 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2355 auto FlatNull = B.buildConstant(SrcTy, 0);
2357 // Extract low 32-bits of the pointer.
2358 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2360 auto CmpRes =
2361 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2364 MI.eraseFromParent();
2365 return true;
2368 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2369 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2370 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2371 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2372 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2373 if (!ApertureReg.isValid())
2374 return false;
2376 // Coerce the type of the low half of the result so we can use
2377 // merge_values.
2378 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2380 // TODO: Should we allow mismatched types but matching sizes in merges to
2381 // avoid the ptrtoint?
2382 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2385 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2386 // G_ADDRSPACE_CAST we need to guess.
2387 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2388 castLocalOrPrivateToFlat(Dst);
2389 MI.eraseFromParent();
2390 return true;
2393 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2395 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2396 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2398 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2399 SegmentNull.getReg(0));
2401 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2403 MI.eraseFromParent();
2404 return true;
2407 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2408 SrcTy.getSizeInBits() == 64) {
2409 // Truncate.
2410 B.buildExtract(Dst, Src, 0);
2411 MI.eraseFromParent();
2412 return true;
2415 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2416 DstTy.getSizeInBits() == 64) {
2417 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2418 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2419 auto PtrLo = B.buildPtrToInt(S32, Src);
2420 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2421 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2422 MI.eraseFromParent();
2423 return true;
2426 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2427 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2429 LLVMContext &Ctx = MF.getFunction().getContext();
2430 Ctx.diagnose(InvalidAddrSpaceCast);
2431 B.buildUndef(Dst);
2432 MI.eraseFromParent();
2433 return true;
2436 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2437 MachineRegisterInfo &MRI,
2438 MachineIRBuilder &B) const {
2439 Register Src = MI.getOperand(1).getReg();
2440 LLT Ty = MRI.getType(Src);
2441 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2443 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2444 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
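// This is the usual 2^52 trick for round-to-nearest-even: adding
// copysign(2^52, src) pushes the value into the range where the ulp is 1, so
// the fractional bits are rounded away (ties to even), and subtracting the
// same constant back yields roundeven(src). E.g. 2.5 + 2^52 rounds to
// 2^52 + 2, giving 2.0. Inputs with |src| > 0x1.fffffffffffffp+51 are
// already integral and are passed through by the final select.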
2446 auto C1 = B.buildFConstant(Ty, C1Val);
2447 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2449 // TODO: Should this propagate fast-math-flags?
2450 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2451 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2453 auto C2 = B.buildFConstant(Ty, C2Val);
2454 auto Fabs = B.buildFAbs(Ty, Src);
2456 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2457 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2458 MI.eraseFromParent();
2459 return true;
2462 bool AMDGPULegalizerInfo::legalizeFceil(
2463 MachineInstr &MI, MachineRegisterInfo &MRI,
2464 MachineIRBuilder &B) const {
2466 const LLT S1 = LLT::scalar(1);
2467 const LLT S64 = LLT::scalar(64);
2469 Register Src = MI.getOperand(1).getReg();
2470 assert(MRI.getType(Src) == S64);
2472 // result = trunc(src)
2473 // if (src > 0.0 && src != result)
2474 // result += 1.0
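// For example: src = 2.3 gives trunc = 2.0, the condition holds, so the
// result is 3.0; src = -2.3 gives trunc = -2.0, the condition is false, so
// the result stays -2.0.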
2476 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2478 const auto Zero = B.buildFConstant(S64, 0.0);
2479 const auto One = B.buildFConstant(S64, 1.0);
2480 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2481 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2482 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2483 auto Add = B.buildSelect(S64, And, One, Zero);
2485 // TODO: Should this propagate fast-math-flags?
2486 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2487 MI.eraseFromParent();
2488 return true;
2491 bool AMDGPULegalizerInfo::legalizeFrem(
2492 MachineInstr &MI, MachineRegisterInfo &MRI,
2493 MachineIRBuilder &B) const {
2494 Register DstReg = MI.getOperand(0).getReg();
2495 Register Src0Reg = MI.getOperand(1).getReg();
2496 Register Src1Reg = MI.getOperand(2).getReg();
2497 auto Flags = MI.getFlags();
2498 LLT Ty = MRI.getType(DstReg);
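// frem(x, y) == x - trunc(x / y) * y, computed below as
// fma(-trunc(x / y), y, x).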
2500 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2501 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2502 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2503 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2504 MI.eraseFromParent();
2505 return true;
2508 static MachineInstrBuilder extractF64Exponent(Register Hi,
2509 MachineIRBuilder &B) {
2510 const unsigned FractBits = 52;
2511 const unsigned ExpBits = 11;
2512 LLT S32 = LLT::scalar(32);
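// The biased 11-bit exponent of an f64 sits in bits [62:52], i.e. bits
// [30:20] of the high dword; extract it with ubfe and subtract the IEEE-754
// double bias of 1023.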
2514 auto Const0 = B.buildConstant(S32, FractBits - 32);
2515 auto Const1 = B.buildConstant(S32, ExpBits);
2517 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2518 .addUse(Hi)
2519 .addUse(Const0.getReg(0))
2520 .addUse(Const1.getReg(0));
2522 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2525 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2526 MachineInstr &MI, MachineRegisterInfo &MRI,
2527 MachineIRBuilder &B) const {
2528 const LLT S1 = LLT::scalar(1);
2529 const LLT S32 = LLT::scalar(32);
2530 const LLT S64 = LLT::scalar(64);
2532 Register Src = MI.getOperand(1).getReg();
2533 assert(MRI.getType(Src) == S64);
2535 // TODO: Should this use extract since the low half is unused?
2536 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2537 Register Hi = Unmerge.getReg(1);
2539 // Extract the upper half, since this is where we will find the sign and
2540 // exponent.
2541 auto Exp = extractF64Exponent(Hi, B);
2543 const unsigned FractBits = 52;
2545 // Extract the sign bit.
2546 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2547 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2549 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2551 const auto Zero32 = B.buildConstant(S32, 0);
2553 // Extend back to 64-bits.
2554 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2556 auto Shr = B.buildAShr(S64, FractMask, Exp);
2557 auto Not = B.buildNot(S64, Shr);
2558 auto Tmp0 = B.buildAnd(S64, Src, Not);
2559 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2561 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2562 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2564 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2565 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2566 MI.eraseFromParent();
2567 return true;
2570 bool AMDGPULegalizerInfo::legalizeITOFP(
2571 MachineInstr &MI, MachineRegisterInfo &MRI,
2572 MachineIRBuilder &B, bool Signed) const {
2574 Register Dst = MI.getOperand(0).getReg();
2575 Register Src = MI.getOperand(1).getReg();
2577 const LLT S64 = LLT::scalar(64);
2578 const LLT S32 = LLT::scalar(32);
2580 assert(MRI.getType(Src) == S64);
2582 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2583 auto ThirtyTwo = B.buildConstant(S32, 32);
2585 if (MRI.getType(Dst) == S64) {
2586 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2587 : B.buildUITOFP(S64, Unmerge.getReg(1));
2589 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2590 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2592 // TODO: Should this propagate fast-math-flags?
2593 B.buildFAdd(Dst, LdExp, CvtLo);
2594 MI.eraseFromParent();
2595 return true;
2598 assert(MRI.getType(Dst) == S32);
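// For the f32 destination, normalize the 64-bit input by shifting it left so
// the most significant set bit lands at the top, convert the high 32 bits
// (with any nonzero low bits ORed in as a sticky bit so rounding stays
// correct), and scale the result back with ldexp by (32 - shift). In the
// signed case the shift amount is derived from the count of redundant sign
// bits (amdgcn.sffbh) rather than a plain ctlz.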
2600 auto One = B.buildConstant(S32, 1);
2602 MachineInstrBuilder ShAmt;
2603 if (Signed) {
2604 auto ThirtyOne = B.buildConstant(S32, 31);
2605 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2606 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2607 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2608 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2609 .addUse(Unmerge.getReg(1));
2610 auto LS2 = B.buildSub(S32, LS, One);
2611 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2612 } else
2613 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2614 auto Norm = B.buildShl(S64, Src, ShAmt);
2615 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2616 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2617 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2618 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2619 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2620 B.buildFLdexp(Dst, FVal, Scale);
2621 MI.eraseFromParent();
2622 return true;
2625 // TODO: Copied from DAG implementation. Verify logic and document how this
2626 // actually works.
2627 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2628 MachineRegisterInfo &MRI,
2629 MachineIRBuilder &B,
2630 bool Signed) const {
2632 Register Dst = MI.getOperand(0).getReg();
2633 Register Src = MI.getOperand(1).getReg();
2635 const LLT S64 = LLT::scalar(64);
2636 const LLT S32 = LLT::scalar(32);
2638 const LLT SrcLT = MRI.getType(Src);
2639 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2641 unsigned Flags = MI.getFlags();
2643 // The basic idea of converting a floating point number into a pair of 32-bit
2644 // integers is illustrated as follows:
2646 // tf := trunc(val);
2647 // hif := floor(tf * 2^-32);
2648 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2649 // hi := fptoi(hif);
2650 // lo := fptoi(lof);
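// Worked example (illustrative): val = 2^40 + 7 gives tf = 2^40 + 7,
// hif = 256 and lof = 7, so hi = 256 and lo = 7, which reassemble into the
// original 64-bit value.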
2652 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2653 MachineInstrBuilder Sign;
2654 if (Signed && SrcLT == S32) {
2655 // However, a 32-bit floating point number has only 23 bits mantissa and
2656 // it's not enough to hold all the significant bits of `lof` if val is
2657 // negative. To avoid the loss of precision, we need to take the absolute
2658 // value after truncating and flip the result back based on the original
2659 // signedness.
2660 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2661 Trunc = B.buildFAbs(S32, Trunc, Flags);
2663 MachineInstrBuilder K0, K1;
2664 if (SrcLT == S64) {
2665 K0 = B.buildFConstant(
2666 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2667 K1 = B.buildFConstant(
2668 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2669 } else {
2670 K0 = B.buildFConstant(
2671 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2672 K1 = B.buildFConstant(
2673 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2676 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2677 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2678 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2680 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2681 : B.buildFPTOUI(S32, FloorMul);
2682 auto Lo = B.buildFPTOUI(S32, Fma);
2684 if (Signed && SrcLT == S32) {
2685 // Flip the result based on the signedness, which is either all 0s or 1s.
2686 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2687 // r := xor({lo, hi}, sign) - sign;
2688 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2689 Sign);
2690 } else
2691 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2692 MI.eraseFromParent();
2694 return true;
2697 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2698 MachineInstr &MI) const {
2699 MachineFunction &MF = Helper.MIRBuilder.getMF();
2700 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2702 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2703 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2705 // With ieee_mode disabled, the instructions have the correct behavior
2706 // already for G_FMINNUM/G_FMAXNUM
2707 if (!MFI->getMode().IEEE)
2708 return !IsIEEEOp;
2710 if (IsIEEEOp)
2711 return true;
2713 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2716 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2717 MachineInstr &MI, MachineRegisterInfo &MRI,
2718 MachineIRBuilder &B) const {
2719 // TODO: Should move some of this into LegalizerHelper.
2721 // TODO: Promote dynamic indexing of s16 to s32
2723 Register Dst = MI.getOperand(0).getReg();
2724 Register Vec = MI.getOperand(1).getReg();
2726 LLT VecTy = MRI.getType(Vec);
2727 LLT EltTy = VecTy.getElementType();
2728 assert(EltTy == MRI.getType(Dst));
2730 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2731 // but we can't go directly to that logic because you can't bitcast a vector
2732 // of pointers to a vector of integers. Therefore, introduce an intermediate
2733 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2734 // drive the legalization forward.
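// For example, extracting an element from <2 x p8> (128-bit buffer resource
// pointers) becomes: ptrtoint to <2 x s128>, extract an s128 element, then
// inttoptr the result back to p8.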
2735 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2736 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2737 LLT IntVecTy = VecTy.changeElementType(IntTy);
2739 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2740 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2741 B.buildIntToPtr(Dst, IntElt);
2743 MI.eraseFromParent();
2744 return true;
2747 // FIXME: Artifact combiner probably should have replaced the truncated
2748 // constant before this, so we shouldn't need
2749 // getIConstantVRegValWithLookThrough.
2750 std::optional<ValueAndVReg> MaybeIdxVal =
2751 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2752 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2753 return true;
2754 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2756 if (IdxVal < VecTy.getNumElements()) {
2757 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2758 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2759 } else {
2760 B.buildUndef(Dst);
2763 MI.eraseFromParent();
2764 return true;
2767 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2768 MachineInstr &MI, MachineRegisterInfo &MRI,
2769 MachineIRBuilder &B) const {
2770 // TODO: Should move some of this into LegalizerHelper.
2772 // TODO: Promote dynamic indexing of s16 to s32
2774 Register Dst = MI.getOperand(0).getReg();
2775 Register Vec = MI.getOperand(1).getReg();
2776 Register Ins = MI.getOperand(2).getReg();
2778 LLT VecTy = MRI.getType(Vec);
2779 LLT EltTy = VecTy.getElementType();
2780 assert(EltTy == MRI.getType(Ins));
2782 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2783 // but we can't go directly to that logic because you can't bitcast a vector
2784 // of pointers to a vector of integers. Therefore, make the pointer vector
2785 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2786 // new value, and then inttoptr the result vector back. This will then allow
2787 // the rest of legalization to take over.
2788 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2789 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2790 LLT IntVecTy = VecTy.changeElementType(IntTy);
2792 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2793 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2794 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2795 MI.getOperand(3));
2796 B.buildIntToPtr(Dst, IntVecDest);
2797 MI.eraseFromParent();
2798 return true;
2801 // FIXME: Artifact combiner probably should have replaced the truncated
2802 // constant before this, so we shouldn't need
2803 // getIConstantVRegValWithLookThrough.
2804 std::optional<ValueAndVReg> MaybeIdxVal =
2805 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2806 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2807 return true;
2809 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2811 unsigned NumElts = VecTy.getNumElements();
2812 if (IdxVal < NumElts) {
2813 SmallVector<Register, 8> SrcRegs;
2814 for (unsigned i = 0; i < NumElts; ++i)
2815 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2816 B.buildUnmerge(SrcRegs, Vec);
2818 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2819 B.buildMergeLikeInstr(Dst, SrcRegs);
2820 } else {
2821 B.buildUndef(Dst);
2824 MI.eraseFromParent();
2825 return true;
2828 bool AMDGPULegalizerInfo::legalizeSinCos(
2829 MachineInstr &MI, MachineRegisterInfo &MRI,
2830 MachineIRBuilder &B) const {
2832 Register DstReg = MI.getOperand(0).getReg();
2833 Register SrcReg = MI.getOperand(1).getReg();
2834 LLT Ty = MRI.getType(DstReg);
2835 unsigned Flags = MI.getFlags();
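// The hardware sin/cos intrinsics expect their input pre-scaled by 1/(2*pi),
// i.e. they evaluate sin/cos of (2*pi * x); subtargets with a reduced trig
// range additionally take the fract of the scaled value first.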
2837 Register TrigVal;
2838 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2839 if (ST.hasTrigReducedRange()) {
2840 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2841 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2842 .addUse(MulVal.getReg(0))
2843 .setMIFlags(Flags)
2844 .getReg(0);
2845 } else
2846 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2848 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2849 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2850 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2851 .addUse(TrigVal)
2852 .setMIFlags(Flags);
2853 MI.eraseFromParent();
2854 return true;
2857 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2858 MachineIRBuilder &B,
2859 const GlobalValue *GV,
2860 int64_t Offset,
2861 unsigned GAFlags) const {
2862 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2863 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2864 // to the following code sequence:
2866 // For constant address space:
2867 // s_getpc_b64 s[0:1]
2868 // s_add_u32 s0, s0, $symbol
2869 // s_addc_u32 s1, s1, 0
2871 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2872 // a fixup or relocation is emitted to replace $symbol with a literal
2873 // constant, which is a pc-relative offset from the encoding of the $symbol
2874 // operand to the global variable.
2876 // For global address space:
2877 // s_getpc_b64 s[0:1]
2878 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2879 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2881 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2882 // fixups or relocations are emitted to replace $symbol@*@lo and
2883 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2884 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2885 // operand to the global variable.
2887 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2889 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2890 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2892 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2893 .addDef(PCReg);
2895 MIB.addGlobalAddress(GV, Offset, GAFlags);
2896 if (GAFlags == SIInstrInfo::MO_NONE)
2897 MIB.addImm(0);
2898 else
2899 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2901 if (!B.getMRI()->getRegClassOrNull(PCReg))
2902 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2904 if (PtrTy.getSizeInBits() == 32)
2905 B.buildExtract(DstReg, PCReg, 0);
2906 return true;
2909 // Emit an ABS32_LO / ABS32_HI relocation stub.
2910 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2911 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2912 MachineRegisterInfo &MRI) const {
2913 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2915 LLT S32 = LLT::scalar(32);
2917 // Use the destination directly if and only if we only store the lower
2918 // address part and no register class has been set.
2919 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2920 ? DstReg
2921 : MRI.createGenericVirtualRegister(S32);
2923 if (!MRI.getRegClassOrNull(AddrLo))
2924 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2926 // Write the lower half.
2927 B.buildInstr(AMDGPU::S_MOV_B32)
2928 .addDef(AddrLo)
2929 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2931 // If required, write the upper half as well.
2932 if (RequiresHighHalf) {
2933 assert(PtrTy.getSizeInBits() == 64 &&
2934 "Must provide a 64-bit pointer type!");
2936 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2937 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2939 B.buildInstr(AMDGPU::S_MOV_B32)
2940 .addDef(AddrHi)
2941 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2943 // Use the destination directly if and only if no register class has been
2944 // set.
2945 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2946 ? DstReg
2947 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2949 if (!MRI.getRegClassOrNull(AddrDst))
2950 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2952 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2954 // If we created a new register for the destination, cast the result into
2955 // the final output.
2956 if (AddrDst != DstReg)
2957 B.buildCast(DstReg, AddrDst);
2958 } else if (AddrLo != DstReg) {
2959 // If we created a new register for the destination, cast the result into
2960 // the final output.
2961 B.buildCast(DstReg, AddrLo);
2965 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2966 MachineInstr &MI, MachineRegisterInfo &MRI,
2967 MachineIRBuilder &B) const {
2968 Register DstReg = MI.getOperand(0).getReg();
2969 LLT Ty = MRI.getType(DstReg);
2970 unsigned AS = Ty.getAddressSpace();
2972 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2973 MachineFunction &MF = B.getMF();
2974 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2976 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2977 if (!MFI->isModuleEntryFunction() &&
2978 GV->getName() != "llvm.amdgcn.module.lds" &&
2979 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
2980 const Function &Fn = MF.getFunction();
2981 DiagnosticInfoUnsupported BadLDSDecl(
2982 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2983 DS_Warning);
2984 Fn.getContext().diagnose(BadLDSDecl);
2986 // We currently don't have a way to correctly allocate LDS objects that
2987 // aren't directly associated with a kernel. We do force inlining of
2988 // functions that use local objects. However, if these dead functions are
2989 // not eliminated, we don't want a compile time error. Just emit a warning
2990 // and a trap, since there should be no callable path here.
2991 B.buildTrap();
2992 B.buildUndef(DstReg);
2993 MI.eraseFromParent();
2994 return true;
2997 // TODO: We could emit code to handle the initialization somewhere.
2998 // We ignore the initializer for now and legalize it to allow selection.
2999 // Assembly emission will reject the initializer with an error anyway.
3000 const SITargetLowering *TLI = ST.getTargetLowering();
3001 if (!TLI->shouldUseLDSConstAddress(GV)) {
3002 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3003 return true; // Leave in place;
3006 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3007 Type *Ty = GV->getValueType();
3008 // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
3009 // zero-sized type in other languages, to declare dynamic shared memory
3010 // whose size is not known at compile time. Such variables are
3011 // allocated by the runtime and placed directly after the statically
3012 // allocated ones, so they all share the same offset.
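// e.g. a HIP declaration such as `extern __shared__ float s[];`, whose
// size is supplied at kernel launch.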
3013 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3014 // Adjust alignment for that dynamic shared memory array.
3015 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3016 LLT S32 = LLT::scalar(32);
3017 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3018 B.buildIntToPtr(DstReg, Sz);
3019 MI.eraseFromParent();
3020 return true;
3024 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3025 *cast<GlobalVariable>(GV)));
3026 MI.eraseFromParent();
3027 return true;
3030 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3031 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3032 MI.eraseFromParent();
3033 return true;
3036 const SITargetLowering *TLI = ST.getTargetLowering();
3038 if (TLI->shouldEmitFixup(GV)) {
3039 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3040 MI.eraseFromParent();
3041 return true;
3044 if (TLI->shouldEmitPCReloc(GV)) {
3045 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3046 MI.eraseFromParent();
3047 return true;
3050 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3051 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3053 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3054 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3055 MachinePointerInfo::getGOT(MF),
3056 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3057 MachineMemOperand::MOInvariant,
3058 LoadTy, Align(8));
3060 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3062 if (Ty.getSizeInBits() == 32) {
3063 // Truncate if this is a 32-bit constant address.
3064 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3065 B.buildExtract(DstReg, Load, 0);
3066 } else
3067 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3069 MI.eraseFromParent();
3070 return true;
3073 static LLT widenToNextPowerOf2(LLT Ty) {
3074 if (Ty.isVector())
3075 return Ty.changeElementCount(
3076 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3077 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3080 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3081 MachineInstr &MI) const {
3082 MachineIRBuilder &B = Helper.MIRBuilder;
3083 MachineRegisterInfo &MRI = *B.getMRI();
3084 GISelChangeObserver &Observer = Helper.Observer;
3086 Register PtrReg = MI.getOperand(1).getReg();
3087 LLT PtrTy = MRI.getType(PtrReg);
3088 unsigned AddrSpace = PtrTy.getAddressSpace();
3090 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3091 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3092 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3093 Observer.changingInstr(MI);
3094 MI.getOperand(1).setReg(Cast.getReg(0));
3095 Observer.changedInstr(MI);
3096 return true;
3099 if (MI.getOpcode() != AMDGPU::G_LOAD)
3100 return false;
3102 Register ValReg = MI.getOperand(0).getReg();
3103 LLT ValTy = MRI.getType(ValReg);
3105 if (hasBufferRsrcWorkaround(ValTy)) {
3106 Observer.changingInstr(MI);
3107 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3108 Observer.changedInstr(MI);
3109 return true;
3112 MachineMemOperand *MMO = *MI.memoperands_begin();
3113 const unsigned ValSize = ValTy.getSizeInBits();
3114 const LLT MemTy = MMO->getMemoryType();
3115 const Align MemAlign = MMO->getAlign();
3116 const unsigned MemSize = MemTy.getSizeInBits();
3117 const uint64_t AlignInBits = 8 * MemAlign.value();
3119 // Widen non-power-of-2 loads to the alignment if needed
3120 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3121 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3123 // This was already the correct extending load result type, so just adjust
3124 // the memory type.
3125 if (WideMemSize == ValSize) {
3126 MachineFunction &MF = B.getMF();
3128 MachineMemOperand *WideMMO =
3129 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3130 Observer.changingInstr(MI);
3131 MI.setMemRefs(MF, {WideMMO});
3132 Observer.changedInstr(MI);
3133 return true;
3136 // Don't bother handling an edge case that should probably never be produced.
3137 if (ValSize > WideMemSize)
3138 return false;
3140 LLT WideTy = widenToNextPowerOf2(ValTy);
3142 Register WideLoad;
3143 if (!WideTy.isVector()) {
3144 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3145 B.buildTrunc(ValReg, WideLoad).getReg(0);
3146 } else {
3147 // Extract the subvector.
3149 if (isRegisterType(ValTy)) {
3150 // If this a case where G_EXTRACT is legal, use it.
3151 // (e.g. <3 x s32> -> <4 x s32>)
3152 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3153 B.buildExtract(ValReg, WideLoad, 0);
3154 } else {
3155 // For cases where the widened type isn't a nice register value, unmerge
3156 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3157 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3158 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3162 MI.eraseFromParent();
3163 return true;
3166 return false;
3169 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3170 MachineInstr &MI) const {
3171 MachineIRBuilder &B = Helper.MIRBuilder;
3172 MachineRegisterInfo &MRI = *B.getMRI();
3173 GISelChangeObserver &Observer = Helper.Observer;
3175 Register DataReg = MI.getOperand(0).getReg();
3176 LLT DataTy = MRI.getType(DataReg);
3178 if (hasBufferRsrcWorkaround(DataTy)) {
3179 Observer.changingInstr(MI);
3180 castBufferRsrcArgToV4I32(MI, B, 0);
3181 Observer.changedInstr(MI);
3182 return true;
3184 return false;
3187 bool AMDGPULegalizerInfo::legalizeFMad(
3188 MachineInstr &MI, MachineRegisterInfo &MRI,
3189 MachineIRBuilder &B) const {
3190 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3191 assert(Ty.isScalar());
3193 MachineFunction &MF = B.getMF();
3194 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3196 // TODO: Always legal with future ftz flag.
3197 // FIXME: Do we need just output?
3198 if (Ty == LLT::float32() &&
3199 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3200 return true;
3201 if (Ty == LLT::float16() &&
3202 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3203 return true;
3205 MachineIRBuilder HelperBuilder(MI);
3206 GISelObserverWrapper DummyObserver;
3207 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3208 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3211 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3212 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3213 Register DstReg = MI.getOperand(0).getReg();
3214 Register PtrReg = MI.getOperand(1).getReg();
3215 Register CmpVal = MI.getOperand(2).getReg();
3216 Register NewVal = MI.getOperand(3).getReg();
3218 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3219 "this should not have been custom lowered");
3221 LLT ValTy = MRI.getType(CmpVal);
3222 LLT VecTy = LLT::fixed_vector(2, ValTy);
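// Pack {new, cmp} into a two-element vector; this is the input marshalling
// expected by the target cmpswap instructions (see the comment on the
// G_ATOMIC_CMPXCHG rules above).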
3224 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3226 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3227 .addDef(DstReg)
3228 .addUse(PtrReg)
3229 .addUse(PackedVal)
3230 .setMemRefs(MI.memoperands());
3232 MI.eraseFromParent();
3233 return true;
3236 /// Return true if it's known that \p Src can never be an f32 denormal value.
3237 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3238 Register Src) {
3239 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3240 switch (DefMI->getOpcode()) {
3241 case TargetOpcode::G_INTRINSIC: {
3242 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3243 case Intrinsic::amdgcn_frexp_mant:
3244 return true;
3245 default:
3246 break;
3249 break;
3251 case TargetOpcode::G_FFREXP: {
3252 if (DefMI->getOperand(0).getReg() == Src)
3253 return true;
3254 break;
3256 case TargetOpcode::G_FPEXT: {
3257 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3259 default:
3260 return false;
3263 return false;
3266 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3267 if (Flags & MachineInstr::FmAfn)
3268 return true;
3269 const auto &Options = MF.getTarget().Options;
3270 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3273 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3274 unsigned Flags) {
3275 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3276 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3277 DenormalMode::PreserveSign;
3280 std::pair<Register, Register>
3281 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3282 unsigned Flags) const {
3283 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3284 return {};
3286 const LLT F32 = LLT::scalar(32);
3287 auto SmallestNormal = B.buildFConstant(
3288 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3289 auto IsLtSmallestNormal =
3290 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3292 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3293 auto One = B.buildFConstant(F32, 1.0);
3294 auto ScaleFactor =
3295 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3296 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3298 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3301 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3302 MachineIRBuilder &B) const {
3303 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3304 // If we have to handle denormals, scale up the input and adjust the result.
3306 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3307 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
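// This works because log2(x * 2^32) == log2(x) + 32, and scaling a denormal by
// a power of two is exact, so subtracting 32.0 afterwards undoes the scaling.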
3309 Register Dst = MI.getOperand(0).getReg();
3310 Register Src = MI.getOperand(1).getReg();
3311 LLT Ty = B.getMRI()->getType(Dst);
3312 unsigned Flags = MI.getFlags();
3314 if (Ty == LLT::scalar(16)) {
3315 const LLT F32 = LLT::scalar(32);
3316 // Nothing in half is a denormal when promoted to f32.
3317 auto Ext = B.buildFPExt(F32, Src, Flags);
3318 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3319 .addUse(Ext.getReg(0))
3320 .setMIFlags(Flags);
3321 B.buildFPTrunc(Dst, Log2, Flags);
3322 MI.eraseFromParent();
3323 return true;
3326 assert(Ty == LLT::scalar(32));
3328 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3329 if (!ScaledInput) {
3330 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3331 .addUse(Src)
3332 .setMIFlags(Flags);
3333 MI.eraseFromParent();
3334 return true;
3337 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3338 .addUse(ScaledInput)
3339 .setMIFlags(Flags);
3341 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3342 auto Zero = B.buildFConstant(Ty, 0.0);
3343 auto ResultOffset =
3344 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3345 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3347 MI.eraseFromParent();
3348 return true;
3351 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3352 Register Z, unsigned Flags) {
3353 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3354 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3357 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3358 MachineIRBuilder &B) const {
3359 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3360 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3362 MachineRegisterInfo &MRI = *B.getMRI();
3363 Register Dst = MI.getOperand(0).getReg();
3364 Register X = MI.getOperand(1).getReg();
3365 unsigned Flags = MI.getFlags();
3366 const LLT Ty = MRI.getType(X);
3367 MachineFunction &MF = B.getMF();
3369 const LLT F32 = LLT::scalar(32);
3370 const LLT F16 = LLT::scalar(16);
3372 const AMDGPUTargetMachine &TM =
3373 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3375 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3376 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3377 if (Ty == F16 && !ST.has16BitInsts()) {
3378 Register LogVal = MRI.createGenericVirtualRegister(F32);
3379 auto PromoteSrc = B.buildFPExt(F32, X);
3380 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3381 B.buildFPTrunc(Dst, LogVal);
3382 } else {
3383 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3386 MI.eraseFromParent();
3387 return true;
3390 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3391 if (ScaledInput)
3392 X = ScaledInput;
3394 auto Y =
3395 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3397 Register R;
3398 if (ST.hasFastFMAF32()) {
3399 // c+cc are ln(2)/ln(10) to more than 49 bits
3400 const float c_log10 = 0x1.344134p-2f;
3401 const float cc_log10 = 0x1.09f79ep-26f;
3403 // c + cc is ln(2) to more than 49 bits
3404 const float c_log = 0x1.62e42ep-1f;
3405 const float cc_log = 0x1.efa39ep-25f;
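// Each constant is split into a head (c) and a tail (cc) whose sum carries
// ln(2) or log10(2) to ~49 bits. The FMA chain below multiplies by the head,
// then folds the head's rounding error and the tail term back into the result,
// giving better accuracy than a single f32 multiply by the constant.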
3407 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3408 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3410 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3411 auto NegR = B.buildFNeg(Ty, R, Flags);
3412 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3413 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3414 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3415 } else {
3416 // ch+ct is ln(2)/ln(10) to more than 36 bits
3417 const float ch_log10 = 0x1.344000p-2f;
3418 const float ct_log10 = 0x1.3509f6p-18f;
3420 // ch + ct is ln(2) to more than 36 bits
3421 const float ch_log = 0x1.62e000p-1f;
3422 const float ct_log = 0x1.0bfbe8p-15f;
3424 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3425 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3427 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3428 auto YH = B.buildAnd(Ty, Y, MaskConst);
3429 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3430 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3432 Register Mad0 =
3433 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3434 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3435 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3438 const bool IsFiniteOnly =
3439 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3440 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3442 if (!IsFiniteOnly) {
3443 // Expand isfinite(x) => fabs(x) < inf
3444 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3445 auto Fabs = B.buildFAbs(Ty, Y);
3446 auto IsFinite =
3447 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3448 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3451 if (ScaledInput) {
3452 auto Zero = B.buildFConstant(Ty, 0.0);
3453 auto ShiftK =
3454 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3455 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3456 B.buildFSub(Dst, R, Shift, Flags);
3457 } else {
3458 B.buildCopy(Dst, R);
3461 MI.eraseFromParent();
3462 return true;
3465 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3466 Register Src, bool IsLog10,
3467 unsigned Flags) const {
3468 const double Log2BaseInverted =
3469 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3471 LLT Ty = B.getMRI()->getType(Dst);
3473 if (Ty == LLT::scalar(32)) {
3474 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3475 if (ScaledInput) {
3476 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3477 .addUse(Src)
3478 .setMIFlags(Flags);
3479 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3480 auto Zero = B.buildFConstant(Ty, 0.0);
3481 auto ResultOffset =
3482 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3483 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3485 if (ST.hasFastFMAF32())
3486 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3487 else {
3488 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3489 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3492 return true;
3496 auto Log2Operand = Ty == LLT::scalar(16)
3497 ? B.buildFLog2(Ty, Src, Flags)
3498 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3499 .addUse(Src)
3500 .setMIFlags(Flags);
3501 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3502 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3503 return true;
3506 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3507 MachineIRBuilder &B) const {
3508 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3509 // If we have to handle denormals, scale up the input and adjust the result.
3511 Register Dst = MI.getOperand(0).getReg();
3512 Register Src = MI.getOperand(1).getReg();
3513 unsigned Flags = MI.getFlags();
3514 LLT Ty = B.getMRI()->getType(Dst);
3515 const LLT F16 = LLT::scalar(16);
3516 const LLT F32 = LLT::scalar(32);
3518 if (Ty == F16) {
3519 // Nothing in half is a denormal when promoted to f32.
3520 auto Ext = B.buildFPExt(F32, Src, Flags);
3521 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3522 .addUse(Ext.getReg(0))
3523 .setMIFlags(Flags);
3524 B.buildFPTrunc(Dst, Exp2, Flags);
3525 MI.eraseFromParent();
3526 return true;
3529 assert(Ty == F32);
3531 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3532 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3533 .addUse(Src)
3534 .setMIFlags(Flags);
3535 MI.eraseFromParent();
3536 return true;
3539 // bool needs_scaling = x < -0x1.f80000p+6f;
3540 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
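// The scaling is exact: exp2(x + 64) * 2^-64 == exp2(x). Adding 64 before the
// hardware exp2 keeps the intermediate result in the normal range; the final
// multiply by 0x1.0p-64f then produces the (possibly denormal) result with an
// ordinary f32 multiply.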
3542 // -nextafter(128.0, -1)
3543 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3544 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3545 RangeCheckConst, Flags);
3547 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3548 auto Zero = B.buildFConstant(Ty, 0.0);
3549 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3550 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3552 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3553 .addUse(AddInput.getReg(0))
3554 .setMIFlags(Flags);
3556 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3557 auto One = B.buildFConstant(Ty, 1.0);
3558 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3559 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3560 MI.eraseFromParent();
3561 return true;
3564 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3565 Register X, unsigned Flags) const {
3566 LLT Ty = B.getMRI()->getType(Dst);
3567 LLT F32 = LLT::scalar(32);
3569 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3570 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3571 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3573 if (Ty == F32) {
3574 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3575 .addUse(Mul.getReg(0))
3576 .setMIFlags(Flags);
3577 } else {
3578 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3581 return true;
3584 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3585 auto NeedsScaling =
3586 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3587 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3588 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3589 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3591 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3592 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3594 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3595 .addUse(ExpInput.getReg(0))
3596 .setMIFlags(Flags);
3598 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3599 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3600 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3601 return true;
3604 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3605 MachineIRBuilder &B) const {
3606 Register Dst = MI.getOperand(0).getReg();
3607 Register X = MI.getOperand(1).getReg();
3608 const unsigned Flags = MI.getFlags();
3609 MachineFunction &MF = B.getMF();
3610 MachineRegisterInfo &MRI = *B.getMRI();
3611 LLT Ty = MRI.getType(Dst);
3612 const LLT F16 = LLT::scalar(16);
3613 const LLT F32 = LLT::scalar(32);
3614 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3616 if (Ty == F16) {
3617 // v_exp_f16 (fmul x, log2e)
3618 if (allowApproxFunc(MF, Flags)) {
3619 // TODO: Does this really require fast?
3620 legalizeFExpUnsafe(B, Dst, X, Flags);
3621 MI.eraseFromParent();
3622 return true;
3625 // exp(f16 x) ->
3626 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3628 // Nothing in half is a denormal when promoted to f32.
3629 auto Ext = B.buildFPExt(F32, X, Flags);
3630 Register Lowered = MRI.createGenericVirtualRegister(F32);
3631 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3632 B.buildFPTrunc(Dst, Lowered, Flags);
3633 MI.eraseFromParent();
3634 return true;
3637 assert(Ty == F32);
3639 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3640 // library behavior. Also, is known-not-daz source sufficient?
3641 if (allowApproxFunc(MF, Flags)) {
3642 legalizeFExpUnsafe(B, Dst, X, Flags);
3643 MI.eraseFromParent();
3644 return true;
3647 // Algorithm:
3649 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3651 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3652 // n = 64*m + j, 0 <= j < 64
3654 // e^x = 2^((64*m + j + f)/64)
3655 // = (2^m) * (2^(j/64)) * 2^(f/64)
3656 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3658 // f = x*(64/ln(2)) - n
3659 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3661 // e^x = (2^m) * (2^(j/64)) * e^r
3663 // (2^(j/64)) is precomputed
3665 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3666 // e^r = 1 + q
3668 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3670 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
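// In the sequence emitted below, PH + PL approximates x * log2(e) (or
// x * log2(10) for G_FEXP10) in roughly double-f32 precision, E is
// roundeven(PH), and the result is ldexp(exp2((PH - E) + PL), (int)E),
// followed by the underflow/overflow clamps at the end.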
3671 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3672 Register PH, PL;
3674 if (ST.hasFastFMAF32()) {
3675 const float c_exp = numbers::log2ef;
3676 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3677 const float c_exp10 = 0x1.a934f0p+1f;
3678 const float cc_exp10 = 0x1.2f346ep-24f;
3680 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3681 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3682 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3683 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3685 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3686 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3687 } else {
3688 const float ch_exp = 0x1.714000p+0f;
3689 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3691 const float ch_exp10 = 0x1.a92000p+1f;
3692 const float cl_exp10 = 0x1.4f0978p-11f;
3694 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3695 auto XH = B.buildAnd(Ty, X, MaskConst);
3696 auto XL = B.buildFSub(Ty, X, XH, Flags);
3698 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3699 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3701 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3702 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3704 Register Mad0 =
3705 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3706 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3709 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3711 // It is unsafe to contract this fsub into the PH multiply.
3712 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3713 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3714 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3716 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3717 .addUse(A.getReg(0))
3718 .setMIFlags(Flags);
3719 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3721 auto UnderflowCheckConst =
3722 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3723 auto Zero = B.buildFConstant(Ty, 0.0);
3724 auto Underflow =
3725 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3727 R = B.buildSelect(Ty, Underflow, Zero, R);
3729 const auto &Options = MF.getTarget().Options;
3731 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3732 auto OverflowCheckConst =
3733 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3735 auto Overflow =
3736 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3737 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3738 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3741 B.buildCopy(Dst, R);
3742 MI.eraseFromParent();
3743 return true;
3746 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3747 MachineIRBuilder &B) const {
3748 Register Dst = MI.getOperand(0).getReg();
3749 Register Src0 = MI.getOperand(1).getReg();
3750 Register Src1 = MI.getOperand(2).getReg();
3751 unsigned Flags = MI.getFlags();
3752 LLT Ty = B.getMRI()->getType(Dst);
3753 const LLT F16 = LLT::float16();
3754 const LLT F32 = LLT::float32();
3756 if (Ty == F32) {
3757 auto Log = B.buildFLog2(F32, Src0, Flags);
3758 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3759 .addUse(Log.getReg(0))
3760 .addUse(Src1)
3761 .setMIFlags(Flags);
3762 B.buildFExp2(Dst, Mul, Flags);
3763 } else if (Ty == F16) {
3764 // There's no f16 fmul_legacy, so we need to convert for it.
3765 auto Log = B.buildFLog2(F16, Src0, Flags);
3766 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3767 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3768 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3769 .addUse(Ext0.getReg(0))
3770 .addUse(Ext1.getReg(0))
3771 .setMIFlags(Flags);
3772 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3773 } else
3774 return false;
3776 MI.eraseFromParent();
3777 return true;
3780 // Find a source register, ignoring any possible source modifiers.
3781 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3782 Register ModSrc = OrigSrc;
3783 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3784 ModSrc = SrcFNeg->getOperand(1).getReg();
3785 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3786 ModSrc = SrcFAbs->getOperand(1).getReg();
3787 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3788 ModSrc = SrcFAbs->getOperand(1).getReg();
3789 return ModSrc;
3792 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3793 MachineRegisterInfo &MRI,
3794 MachineIRBuilder &B) const {
3796 const LLT S1 = LLT::scalar(1);
3797 const LLT F64 = LLT::float64();
3798 Register Dst = MI.getOperand(0).getReg();
3799 Register OrigSrc = MI.getOperand(1).getReg();
3800 unsigned Flags = MI.getFlags();
3801 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3802 "this should not have been custom lowered");
3804 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3805 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3806 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3807 // V_FRACT bug is:
3808 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3810 // Convert floor(x) to (x - fract(x))
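// For example, with x = -0.5: fract(-0.5) = 0.5, so x - fract(x) = -1.0,
// which matches floor(-0.5).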
3812 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3813 .addUse(OrigSrc)
3814 .setMIFlags(Flags);
3816 // Give source modifier matching some assistance before obscuring a foldable
3817 // pattern.
3819 // TODO: We can avoid the neg on the fract? The input sign to fract
3820 // shouldn't matter?
3821 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3823 auto Const =
3824 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
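// 0x3fefffffffffffff is the bit pattern of the largest double below 1.0
// (1 - 2^-53), i.e. the 0.99999999999999999 clamp from the comment above.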
3826 Register Min = MRI.createGenericVirtualRegister(F64);
3828 // We don't need to concern ourselves with the snan handling difference, so
3829 // use the one which will directly select.
3830 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3831 if (MFI->getMode().IEEE)
3832 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3833 else
3834 B.buildFMinNum(Min, Fract, Const, Flags);
3836 Register CorrectedFract = Min;
3837 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3838 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3839 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3842 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3843 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3845 MI.eraseFromParent();
3846 return true;
3849 // Turn an illegal packed v2s16 build vector into bit operations.
3850 // TODO: This should probably be a bitcast action in LegalizerHelper.
3851 bool AMDGPULegalizerInfo::legalizeBuildVector(
3852 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3853 Register Dst = MI.getOperand(0).getReg();
3854 const LLT S32 = LLT::scalar(32);
3855 const LLT S16 = LLT::scalar(16);
3856 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3858 Register Src0 = MI.getOperand(1).getReg();
3859 Register Src1 = MI.getOperand(2).getReg();
3861 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3862 assert(MRI.getType(Src0) == S32);
3863 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3864 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3867 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3868 B.buildBitcast(Dst, Merge);
3870 MI.eraseFromParent();
3871 return true;
3874 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3876 // Source and accumulation registers must all be 32-bits.
3878 // TODO: When the multiply is uniform, we should produce a code sequence
3879 // that is better suited to instruction selection on the SALU. Instead of
3880 // the outer loop going over parts of the result, the outer loop should go
3881 // over parts of one of the factors. This should result in instruction
3882 // selection that makes full use of S_ADDC_U32 instructions.
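// Sketch for the 2-part (64-bit) case: Accum[0] receives lo(Src0[0]*Src1[0]),
// and Accum[1] receives hi(Src0[0]*Src1[0]) + lo(Src0[0]*Src1[1]) +
// lo(Src0[1]*Src1[0]); ordinary schoolbook multiplication, with partial
// products that fall entirely outside the result width discarded.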
3883 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3884 MutableArrayRef<Register> Accum,
3885 ArrayRef<Register> Src0,
3886 ArrayRef<Register> Src1,
3887 bool UsePartialMad64_32,
3888 bool SeparateOddAlignedProducts) const {
3889 // Use (possibly empty) vectors of S1 registers to represent the set of
3890 // carries from one pair of positions to the next.
3891 using Carry = SmallVector<Register, 2>;
3893 MachineIRBuilder &B = Helper.MIRBuilder;
3894 GISelKnownBits &KB = *Helper.getKnownBits();
3896 const LLT S1 = LLT::scalar(1);
3897 const LLT S32 = LLT::scalar(32);
3898 const LLT S64 = LLT::scalar(64);
3900 Register Zero32;
3901 Register Zero64;
3903 auto getZero32 = [&]() -> Register {
3904 if (!Zero32)
3905 Zero32 = B.buildConstant(S32, 0).getReg(0);
3906 return Zero32;
3908 auto getZero64 = [&]() -> Register {
3909 if (!Zero64)
3910 Zero64 = B.buildConstant(S64, 0).getReg(0);
3911 return Zero64;
3914 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3915 for (unsigned i = 0; i < Src0.size(); ++i) {
3916 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3917 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3920 // Merge the given carries into the 32-bit LocalAccum, which is modified
3921 // in-place.
3923 // Returns the carry-out, which is a single S1 register or null.
3924 auto mergeCarry =
3925 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3926 if (CarryIn.empty())
3927 return Register();
3929 bool HaveCarryOut = true;
3930 Register CarryAccum;
3931 if (CarryIn.size() == 1) {
3932 if (!LocalAccum) {
3933 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3934 return Register();
3937 CarryAccum = getZero32();
3938 } else {
3939 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3940 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3941 CarryAccum =
3942 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3943 .getReg(0);
3946 if (!LocalAccum) {
3947 LocalAccum = getZero32();
3948 HaveCarryOut = false;
3952 auto Add =
3953 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3954 LocalAccum = Add.getReg(0);
3955 return HaveCarryOut ? Add.getReg(1) : Register();
3958 // Build a multiply-add chain to compute
3960 // LocalAccum + (partial products at DstIndex)
3961 // + (opportunistic subset of CarryIn)
3963 // LocalAccum is an array of one or two 32-bit registers that are updated
3964 // in-place. The incoming registers may be null.
3966 // In some edge cases, carry-ins can be consumed "for free". In that case,
3967 // the consumed carry bits are removed from CarryIn in-place.
3968 auto buildMadChain =
3969 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3970 -> Carry {
3971 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3972 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3974 Carry CarryOut;
3975 unsigned j0 = 0;
3977 // Use plain 32-bit multiplication for the most significant part of the
3978 // result by default.
3979 if (LocalAccum.size() == 1 &&
3980 (!UsePartialMad64_32 || !CarryIn.empty())) {
3981 do {
3982 // Skip multiplication if one of the operands is 0
3983 unsigned j1 = DstIndex - j0;
3984 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3985 ++j0;
3986 continue;
3988 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3989 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3990 LocalAccum[0] = Mul.getReg(0);
3991 } else {
3992 if (CarryIn.empty()) {
3993 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3994 } else {
3995 LocalAccum[0] =
3996 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3997 .getReg(0);
3998 CarryIn.pop_back();
4001 ++j0;
4002 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4005 // Build full 64-bit multiplies.
4006 if (j0 <= DstIndex) {
4007 bool HaveSmallAccum = false;
4008 Register Tmp;
4010 if (LocalAccum[0]) {
4011 if (LocalAccum.size() == 1) {
4012 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4013 HaveSmallAccum = true;
4014 } else if (LocalAccum[1]) {
4015 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4016 HaveSmallAccum = false;
4017 } else {
4018 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4019 HaveSmallAccum = true;
4021 } else {
4022 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4023 Tmp = getZero64();
4024 HaveSmallAccum = true;
4027 do {
4028 unsigned j1 = DstIndex - j0;
4029 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4030 ++j0;
4031 continue;
4033 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4034 {Src0[j0], Src1[j1], Tmp});
4035 Tmp = Mad.getReg(0);
4036 if (!HaveSmallAccum)
4037 CarryOut.push_back(Mad.getReg(1));
4038 HaveSmallAccum = false;
4040 ++j0;
4041 } while (j0 <= DstIndex);
4043 auto Unmerge = B.buildUnmerge(S32, Tmp);
4044 LocalAccum[0] = Unmerge.getReg(0);
4045 if (LocalAccum.size() > 1)
4046 LocalAccum[1] = Unmerge.getReg(1);
4049 return CarryOut;
4052 // Outer multiply loop, iterating over destination parts from least
4053 // significant to most significant parts.
4055 // The columns of the following diagram correspond to the destination parts
4056 // affected by one iteration of the outer loop (ignoring boundary
4057 // conditions).
4059 //   Dest index relative to 2 * i:      1 0 -1
4060 //                                      ------
4061 //   Carries from previous iteration:     e o
4062 //   Even-aligned partial product sum:  E E .
4063 //   Odd-aligned partial product sum:    O O
4065 // 'o' is OddCarry, 'e' is EvenCarry.
4066 // EE and OO are computed from partial products via buildMadChain and use
4067 // accumulation where possible and appropriate.
4069 Register SeparateOddCarry;
4070 Carry EvenCarry;
4071 Carry OddCarry;
4073 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4074 Carry OddCarryIn = std::move(OddCarry);
4075 Carry EvenCarryIn = std::move(EvenCarry);
4076 OddCarry.clear();
4077 EvenCarry.clear();
4079 // Partial products at offset 2 * i.
4080 if (2 * i < Accum.size()) {
4081 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4082 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4085 // Partial products at offset 2 * i - 1.
4086 if (i > 0) {
4087 if (!SeparateOddAlignedProducts) {
4088 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4089 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4090 } else {
4091 bool IsHighest = 2 * i >= Accum.size();
4092 Register SeparateOddOut[2];
4093 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4094 .take_front(IsHighest ? 1 : 2);
4095 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4097 MachineInstr *Lo;
4099 if (i == 1) {
4100 if (!IsHighest)
4101 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4102 else
4103 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4104 } else {
4105 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4106 SeparateOddCarry);
4108 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4110 if (!IsHighest) {
4111 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4112 Lo->getOperand(1).getReg());
4113 Accum[2 * i] = Hi.getReg(0);
4114 SeparateOddCarry = Hi.getReg(1);
4119 // Add in the carries from the previous iteration
4120 if (i > 0) {
4121 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4122 EvenCarryIn.push_back(CarryOut);
4124 if (2 * i < Accum.size()) {
4125 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4126 OddCarry.push_back(CarryOut);
4132 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4134 // TODO: If the multiply is followed by an addition, we should attempt to
4135 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4136 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4137 MachineInstr &MI) const {
4138 assert(ST.hasMad64_32());
4139 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4141 MachineIRBuilder &B = Helper.MIRBuilder;
4142 MachineRegisterInfo &MRI = *B.getMRI();
4144 Register DstReg = MI.getOperand(0).getReg();
4145 Register Src0 = MI.getOperand(1).getReg();
4146 Register Src1 = MI.getOperand(2).getReg();
4148 LLT Ty = MRI.getType(DstReg);
4149 assert(Ty.isScalar());
4151 unsigned Size = Ty.getSizeInBits();
4152 unsigned NumParts = Size / 32;
4153 assert((Size % 32) == 0);
4154 assert(NumParts >= 2);
4156 // Whether to use MAD_64_32 for partial products whose high half is
4157 // discarded. This avoids some ADD instructions but risks false dependency
4158 // stalls on some subtargets in some cases.
4159 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4161 // Whether to compute odd-aligned partial products separately. This is
4162 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4163 // in an even-aligned VGPR.
4164 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4166 LLT S32 = LLT::scalar(32);
4167 SmallVector<Register, 2> Src0Parts, Src1Parts;
4168 for (unsigned i = 0; i < NumParts; ++i) {
4169 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4170 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4172 B.buildUnmerge(Src0Parts, Src0);
4173 B.buildUnmerge(Src1Parts, Src1);
4175 SmallVector<Register, 2> AccumRegs(NumParts);
4176 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4177 SeparateOddAlignedProducts);
4179 B.buildMergeLikeInstr(DstReg, AccumRegs);
4180 MI.eraseFromParent();
4181 return true;
4184 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4185 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4186 // case with a single min instruction instead of a compare+select.
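// The fixup relies on ffbh/ffbl returning -1 (all ones) when no bits are set,
// so the umin with the source bit width below yields the ctlz/cttz result of
// "bit width" that is required for a zero input.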
4187 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4188 MachineRegisterInfo &MRI,
4189 MachineIRBuilder &B) const {
4190 Register Dst = MI.getOperand(0).getReg();
4191 Register Src = MI.getOperand(1).getReg();
4192 LLT DstTy = MRI.getType(Dst);
4193 LLT SrcTy = MRI.getType(Src);
4195 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4196 ? AMDGPU::G_AMDGPU_FFBH_U32
4197 : AMDGPU::G_AMDGPU_FFBL_B32;
4198 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4199 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4201 MI.eraseFromParent();
4202 return true;
4205 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4206 MachineRegisterInfo &MRI,
4207 MachineIRBuilder &B) const {
4208 Register Dst = MI.getOperand(0).getReg();
4209 Register Src = MI.getOperand(1).getReg();
4210 LLT SrcTy = MRI.getType(Src);
4211 TypeSize NumBits = SrcTy.getSizeInBits();
4213 assert(NumBits < 32u);
4215 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4216 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4217 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4218 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4219 B.buildTrunc(Dst, Ctlz);
4220 MI.eraseFromParent();
4221 return true;
4224 // Check that this is a G_XOR x, -1
4225 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4226 if (MI.getOpcode() != TargetOpcode::G_XOR)
4227 return false;
4228 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4229 return ConstVal && *ConstVal == -1;
4232 // Return the use branch instruction, otherwise null if the usage is invalid.
4233 static MachineInstr *
4234 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4235 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4236 Register CondDef = MI.getOperand(0).getReg();
4237 if (!MRI.hasOneNonDBGUse(CondDef))
4238 return nullptr;
4240 MachineBasicBlock *Parent = MI.getParent();
4241 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4243 if (isNot(MRI, *UseMI)) {
4244 Register NegatedCond = UseMI->getOperand(0).getReg();
4245 if (!MRI.hasOneNonDBGUse(NegatedCond))
4246 return nullptr;
4248 // We're deleting the def of this value, so we need to remove it.
4249 eraseInstr(*UseMI, MRI);
4251 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4252 Negated = true;
4255 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4256 return nullptr;
4258 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4259 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4260 if (Next == Parent->end()) {
4261 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4262 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4263 return nullptr;
4264 UncondBrTarget = &*NextMBB;
4265 } else {
4266 if (Next->getOpcode() != AMDGPU::G_BR)
4267 return nullptr;
4268 Br = &*Next;
4269 UncondBrTarget = Br->getOperand(0).getMBB();
4272 return UseMI;
4275 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4276 const ArgDescriptor *Arg,
4277 const TargetRegisterClass *ArgRC,
4278 LLT ArgTy) const {
4279 MCRegister SrcReg = Arg->getRegister();
4280 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4281 assert(DstReg.isVirtual() && "Virtual register expected");
4283 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4284 *ArgRC, B.getDebugLoc(), ArgTy);
4285 if (Arg->isMasked()) {
4286 // TODO: Should we try to emit this once in the entry block?
4287 const LLT S32 = LLT::scalar(32);
4288 const unsigned Mask = Arg->getMask();
4289 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4291 Register AndMaskSrc = LiveIn;
4293 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4294 // 0.
4295 if (Shift != 0) {
4296 auto ShiftAmt = B.buildConstant(S32, Shift);
4297 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4300 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4301 } else {
4302 B.buildCopy(DstReg, LiveIn);
4305 return true;
4308 bool AMDGPULegalizerInfo::loadInputValue(
4309 Register DstReg, MachineIRBuilder &B,
4310 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4311 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4312 const ArgDescriptor *Arg = nullptr;
4313 const TargetRegisterClass *ArgRC;
4314 LLT ArgTy;
4316 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4317 const ArgDescriptor WorkGroupIDX =
4318 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4319 // If GridZ is not programmed in an entry function then the hardware will set
4320 // it to all zeros, so there is no need to mask the GridY value in the low
4321 // order bits.
4322 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4323 AMDGPU::TTMP7,
4324 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4325 const ArgDescriptor WorkGroupIDZ =
4326 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
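// With architected SGPRs the workgroup IDs live in TTMP registers: X in TTMP9,
// and Y/Z packed into the low/high 16 bits of TTMP7, which is what the masks
// above encode.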
4327 if (ST.hasArchitectedSGPRs() &&
4328 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4329 switch (ArgType) {
4330 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4331 Arg = &WorkGroupIDX;
4332 ArgRC = &AMDGPU::SReg_32RegClass;
4333 ArgTy = LLT::scalar(32);
4334 break;
4335 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4336 Arg = &WorkGroupIDY;
4337 ArgRC = &AMDGPU::SReg_32RegClass;
4338 ArgTy = LLT::scalar(32);
4339 break;
4340 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4341 Arg = &WorkGroupIDZ;
4342 ArgRC = &AMDGPU::SReg_32RegClass;
4343 ArgTy = LLT::scalar(32);
4344 break;
4345 default:
4346 break;
4350 if (!Arg)
4351 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4353 if (!Arg) {
4354 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4355 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4356 // case the pointer argument may be missing and we use null.
4357 B.buildConstant(DstReg, 0);
4358 return true;
4361 // It's undefined behavior if a function marked with the amdgpu-no-*
4362 // attributes uses the corresponding intrinsic.
4363 B.buildUndef(DstReg);
4364 return true;
4367 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4368 return false; // TODO: Handle these
4369 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4372 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4373 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4374 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4375 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4376 return false;
4378 MI.eraseFromParent();
4379 return true;
4382 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4383 int64_t C) {
4384 B.buildConstant(MI.getOperand(0).getReg(), C);
4385 MI.eraseFromParent();
4386 return true;
4389 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4390 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4391 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4392 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4393 if (MaxID == 0)
4394 return replaceWithConstant(B, MI, 0);
4396 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4397 const ArgDescriptor *Arg;
4398 const TargetRegisterClass *ArgRC;
4399 LLT ArgTy;
4400 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4402 Register DstReg = MI.getOperand(0).getReg();
4403 if (!Arg) {
4404 // It's undefined behavior if a function marked with the amdgpu-no-*
4405 // attributes uses the corresponding intrinsic.
4406 B.buildUndef(DstReg);
4407 MI.eraseFromParent();
4408 return true;
4411 if (Arg->isMasked()) {
4412 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4413 // masking operations anyway.
4415 // TODO: We could assert the top bit is 0 for the source copy.
4416 if (!loadInputValue(DstReg, B, ArgType))
4417 return false;
4418 } else {
4419 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4420 if (!loadInputValue(TmpReg, B, ArgType))
4421 return false;
4422 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4425 MI.eraseFromParent();
4426 return true;
4429 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4430 int64_t Offset) const {
4431 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4432 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4434 // TODO: If we passed in the base kernel offset we could have a better
4435 // alignment than 4, but we don't really need it.
4436 if (!loadInputValue(KernArgReg, B,
4437 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4438 llvm_unreachable("failed to find kernarg segment ptr");
4440 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4441 // TODO: Should get nuw
4442 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4445 /// Legalize a value that's loaded from kernel arguments. This is only used by
4446 /// legacy intrinsics.
4447 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4448 MachineIRBuilder &B,
4449 uint64_t Offset,
4450 Align Alignment) const {
4451 Register DstReg = MI.getOperand(0).getReg();
4453 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4454 "unexpected kernarg parameter type");
4456 Register Ptr = getKernargParameterPtr(B, Offset);
4457 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4458 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4459 MachineMemOperand::MODereferenceable |
4460 MachineMemOperand::MOInvariant);
4461 MI.eraseFromParent();
4462 return true;
4465 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4466 MachineRegisterInfo &MRI,
4467 MachineIRBuilder &B) const {
4468 Register Dst = MI.getOperand(0).getReg();
4469 LLT DstTy = MRI.getType(Dst);
4470 LLT S16 = LLT::scalar(16);
4471 LLT S32 = LLT::scalar(32);
4472 LLT S64 = LLT::scalar(64);
4474 if (DstTy == S16)
4475 return legalizeFDIV16(MI, MRI, B);
4476 if (DstTy == S32)
4477 return legalizeFDIV32(MI, MRI, B);
4478 if (DstTy == S64)
4479 return legalizeFDIV64(MI, MRI, B);
4481 return false;
4484 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4485 Register DstDivReg,
4486 Register DstRemReg,
4487 Register X,
4488 Register Y) const {
4489 const LLT S1 = LLT::scalar(1);
4490 const LLT S32 = LLT::scalar(32);
4492 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4493 // algorithm used here.
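// In outline: compute z ~= 2^32 / y from a scaled f32 reciprocal, refine it
// with one Newton-Raphson style step, form the quotient estimate
// q = umulh(x, z) and remainder r = x - q * y, then apply the two conditional
// corrections emitted below since the estimate may fall slightly short.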
4495 // Initial estimate of inv(y).
4496 auto FloatY = B.buildUITOFP(S32, Y);
4497 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4498 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4499 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4500 auto Z = B.buildFPTOUI(S32, ScaledY);
4502 // One round of UNR.
4503 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4504 auto NegYZ = B.buildMul(S32, NegY, Z);
4505 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4507 // Quotient/remainder estimate.
4508 auto Q = B.buildUMulH(S32, X, Z);
4509 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4511 // First quotient/remainder refinement.
4512 auto One = B.buildConstant(S32, 1);
4513 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4514 if (DstDivReg)
4515 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4516 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4518 // Second quotient/remainder refinement.
4519 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4520 if (DstDivReg)
4521 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4523 if (DstRemReg)
4524 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4527 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4529 // Return lo, hi of result
4531 // %cvt.lo = G_UITOFP Val.lo
4532 // %cvt.hi = G_UITOFP Val.hi
4533 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4534 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4535 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4536 // %mul2 = G_FMUL %mul1, 2**(-32)
4537 // %trunc = G_INTRINSIC_TRUNC %mul2
4538 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4539 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
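// The f32 bit patterns used below are: 0x4f800000 = 2^32, 0x2f800000 = 2^-32,
// 0xcf800000 = -2^32, and 0x5f7ffffc = 2^64 * (1 - 2^-22), a value just
// below 2^64.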
4540 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4541 Register Val) {
4542 const LLT S32 = LLT::scalar(32);
4543 auto Unmerge = B.buildUnmerge(S32, Val);
4545 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4546 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4548 auto Mad = B.buildFMAD(
4549 S32, CvtHi, // 2**32
4550 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4552 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4553 auto Mul1 = B.buildFMul(
4554 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4556 // 2**(-32)
4557 auto Mul2 = B.buildFMul(
4558 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4559 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4561 // -(2**32)
4562 auto Mad2 = B.buildFMAD(
4563 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4564 Mul1);
4566 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4567 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4569 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4572 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4573 Register DstDivReg,
4574 Register DstRemReg,
4575 Register Numer,
4576 Register Denom) const {
4577 const LLT S32 = LLT::scalar(32);
4578 const LLT S64 = LLT::scalar(64);
4579 const LLT S1 = LLT::scalar(1);
4580 Register RcpLo, RcpHi;
4582 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4584 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4586 auto Zero64 = B.buildConstant(S64, 0);
4587 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4589 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4590 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4592 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4593 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4594 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4596 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4597 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4598 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4600 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4601 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4602 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4603 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4604 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4606 auto Zero32 = B.buildConstant(S32, 0);
4607 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4608 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4609 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4611 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4612 Register NumerLo = UnmergeNumer.getReg(0);
4613 Register NumerHi = UnmergeNumer.getReg(1);
4615 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4616 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4617 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4618 Register Mul3_Lo = UnmergeMul3.getReg(0);
4619 Register Mul3_Hi = UnmergeMul3.getReg(1);
4620 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4621 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4622 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4623 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4625 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4626 Register DenomLo = UnmergeDenom.getReg(0);
4627 Register DenomHi = UnmergeDenom.getReg(1);
4629 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4630 auto C1 = B.buildSExt(S32, CmpHi);
4632 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4633 auto C2 = B.buildSExt(S32, CmpLo);
4635 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4636 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4638 // TODO: Here and below, portions of the code could be enclosed in if/endif.
4639 // Currently the control flow is unconditional and we have 4 selects after
4640 // the potential endif to substitute for PHIs.
4642 // if C3 != 0 ...
4643 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4644 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4645 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4646 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4648 auto One64 = B.buildConstant(S64, 1);
4649 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4651 auto C4 =
4652 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4653 auto C5 =
4654 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4655 auto C6 = B.buildSelect(
4656 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4658 // if (C6 != 0)
4659 auto Add4 = B.buildAdd(S64, Add3, One64);
4660 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4662 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4663 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4664 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4666 // endif C6
4667 // endif C3
4669 if (DstDivReg) {
4670 auto Sel1 = B.buildSelect(
4671 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4672 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4673 Sel1, MulHi3);
4676 if (DstRemReg) {
4677 auto Sel2 = B.buildSelect(
4678 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4679 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4680 Sel2, Sub1);
4684 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4685 MachineRegisterInfo &MRI,
4686 MachineIRBuilder &B) const {
4687 Register DstDivReg, DstRemReg;
4688 switch (MI.getOpcode()) {
4689 default:
4690 llvm_unreachable("Unexpected opcode!");
4691 case AMDGPU::G_UDIV: {
4692 DstDivReg = MI.getOperand(0).getReg();
4693 break;
4695 case AMDGPU::G_UREM: {
4696 DstRemReg = MI.getOperand(0).getReg();
4697 break;
4699 case AMDGPU::G_UDIVREM: {
4700 DstDivReg = MI.getOperand(0).getReg();
4701 DstRemReg = MI.getOperand(1).getReg();
4702 break;
4706 const LLT S64 = LLT::scalar(64);
4707 const LLT S32 = LLT::scalar(32);
4708 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4709 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4710 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4711 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4713 if (Ty == S32)
4714 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4715 else if (Ty == S64)
4716 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4717 else
4718 return false;
4720 MI.eraseFromParent();
4721 return true;
4724 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4725 MachineRegisterInfo &MRI,
4726 MachineIRBuilder &B) const {
4727 const LLT S64 = LLT::scalar(64);
4728 const LLT S32 = LLT::scalar(32);
4730 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4731 if (Ty != S32 && Ty != S64)
4732 return false;
4734 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4735 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4736 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4738 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4739 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4740 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4742 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4743 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4745 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4746 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4748 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4749 switch (MI.getOpcode()) {
4750 default:
4751 llvm_unreachable("Unexpected opcode!");
4752 case AMDGPU::G_SDIV: {
4753 DstDivReg = MI.getOperand(0).getReg();
4754 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4755 break;
4757 case AMDGPU::G_SREM: {
4758 DstRemReg = MI.getOperand(0).getReg();
4759 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4760 break;
4762 case AMDGPU::G_SDIVREM: {
4763 DstDivReg = MI.getOperand(0).getReg();
4764 DstRemReg = MI.getOperand(1).getReg();
4765 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4766 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4767 break;
4771 if (Ty == S32)
4772 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4773 else
4774 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4776 if (DstDivReg) {
4777 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4778 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4779 B.buildSub(DstDivReg, SignXor, Sign);
4782 if (DstRemReg) {
4783 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4784 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4785 B.buildSub(DstRemReg, SignXor, Sign);
4788 MI.eraseFromParent();
4789 return true;
4792 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4793 MachineRegisterInfo &MRI,
4794 MachineIRBuilder &B) const {
4795 Register Res = MI.getOperand(0).getReg();
4796 Register LHS = MI.getOperand(1).getReg();
4797 Register RHS = MI.getOperand(2).getReg();
4798 uint16_t Flags = MI.getFlags();
4799 LLT ResTy = MRI.getType(Res);
4801 const MachineFunction &MF = B.getMF();
4802 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4803 MF.getTarget().Options.UnsafeFPMath;
4805 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4806 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4807 return false;
4809 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4810 // the CI documentation, have a worst case error of 1 ulp.
4811 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4812 // use it as long as we aren't trying to use denormals.
4814 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp worst case error.
4816 // 1 / x -> RCP(x)
4817 if (CLHS->isExactlyValue(1.0)) {
4818 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4819 .addUse(RHS)
4820 .setMIFlags(Flags);
4822 MI.eraseFromParent();
4823 return true;
4826 // -1 / x -> RCP( FNEG(x) )
4827 if (CLHS->isExactlyValue(-1.0)) {
4828 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4829 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4830 .addUse(FNeg.getReg(0))
4831 .setMIFlags(Flags);
4833 MI.eraseFromParent();
4834 return true;
4838 // For f16 require afn or arcp.
4839 // For f32 require afn.
4840 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4841 !MI.getFlag(MachineInstr::FmArcp)))
4842 return false;
4844 // x / y -> x * (1.0 / y)
4845 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4846 .addUse(RHS)
4847 .setMIFlags(Flags);
4848 B.buildFMul(Res, LHS, RCP, Flags);
4850 MI.eraseFromParent();
4851 return true;
4854 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4855 MachineRegisterInfo &MRI,
4856 MachineIRBuilder &B) const {
4857 Register Res = MI.getOperand(0).getReg();
4858 Register X = MI.getOperand(1).getReg();
4859 Register Y = MI.getOperand(2).getReg();
4860 uint16_t Flags = MI.getFlags();
4861 LLT ResTy = MRI.getType(Res);
4863 const MachineFunction &MF = B.getMF();
4864 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4865 MI.getFlag(MachineInstr::FmAfn);
4867 if (!AllowInaccurateRcp)
4868 return false;
4870 auto NegY = B.buildFNeg(ResTy, Y);
4871 auto One = B.buildFConstant(ResTy, 1.0);
4873 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4874 .addUse(Y)
4875 .setMIFlags(Flags);
4877 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4878 R = B.buildFMA(ResTy, Tmp0, R, R);
4880 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4881 R = B.buildFMA(ResTy, Tmp1, R, R);
4883 auto Ret = B.buildFMul(ResTy, X, R);
4884 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4886 B.buildFMA(Res, Tmp2, R, Ret);
4887 MI.eraseFromParent();
4888 return true;
4891 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4892 MachineRegisterInfo &MRI,
4893 MachineIRBuilder &B) const {
4894 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4895 return true;
4897 Register Res = MI.getOperand(0).getReg();
4898 Register LHS = MI.getOperand(1).getReg();
4899 Register RHS = MI.getOperand(2).getReg();
4901 uint16_t Flags = MI.getFlags();
4903 LLT S16 = LLT::scalar(16);
4904 LLT S32 = LLT::scalar(32);
4906 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4907 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4908 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4909 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4910 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4911 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
4912 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4913 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4914 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4915 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4916 // q16.u = opx(V_CVT_F16_F32, q32.u);
4917 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4919 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4920 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4921 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4922 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4923 .addUse(RHSExt.getReg(0))
4924 .setMIFlags(Flags);
4925 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
4926 MachineInstrBuilder Err;
4927 if (ST.hasMadMacF32Insts()) {
4928 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4929 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
4930 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4931 } else {
4932 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4933 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
4934 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4936 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
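// 0xff800000 keeps only the sign and exponent bits of the f32 correction term
// (dropping its mantissa) before it is added back into the quotient.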
4937 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4938 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
4939 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
4940 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4941 .addUse(RDst.getReg(0))
4942 .addUse(RHS)
4943 .addUse(LHS)
4944 .setMIFlags(Flags);
4946 MI.eraseFromParent();
4947 return true;
4950 static constexpr unsigned SPDenormModeBitField =
4951 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4953 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4954 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4955 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4956 const GCNSubtarget &ST,
4957 SIModeRegisterDefaults Mode) {
4958 // Set SP denorm mode to this value.
4959 unsigned SPDenormMode =
4960 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4962 if (ST.hasDenormModeInst()) {
4963 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4964 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
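// The 4-bit S_DENORM_MODE immediate packs the FP32 setting in bits [1:0] and
// the FP64/FP16 setting in bits [3:2], hence the shift below.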
4966 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4967 B.buildInstr(AMDGPU::S_DENORM_MODE)
4968 .addImm(NewDenormModeValue);
4970 } else {
4971 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4972 .addImm(SPDenormMode)
4973 .addImm(SPDenormModeBitField);
4977 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4978 MachineRegisterInfo &MRI,
4979 MachineIRBuilder &B) const {
4980 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4981 return true;
4983 Register Res = MI.getOperand(0).getReg();
4984 Register LHS = MI.getOperand(1).getReg();
4985 Register RHS = MI.getOperand(2).getReg();
4986 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4987 SIModeRegisterDefaults Mode = MFI->getMode();
4989 uint16_t Flags = MI.getFlags();
4991 LLT S32 = LLT::scalar(32);
4992 LLT S1 = LLT::scalar(1);
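// Standard f32 division expansion: div_scale pre-scales the operands to keep
// the intermediates in range, the FMA chain refines the reciprocal and the
// quotient, and div_fmas/div_fixup apply the final correction and handle
// special inputs (zero, infinity, NaN).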
4994 auto One = B.buildFConstant(S32, 1.0f);
4996 auto DenominatorScaled =
4997 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4998 .addUse(LHS)
4999 .addUse(RHS)
5000 .addImm(0)
5001 .setMIFlags(Flags);
5002 auto NumeratorScaled =
5003 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5004 .addUse(LHS)
5005 .addUse(RHS)
5006 .addImm(1)
5007 .setMIFlags(Flags);
5009 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5010 .addUse(DenominatorScaled.getReg(0))
5011 .setMIFlags(Flags);
5012 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5014 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5015 const bool HasDynamicDenormals =
5016 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5017 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5019 Register SavedSPDenormMode;
5020 if (!PreservesDenormals) {
5021 if (HasDynamicDenormals) {
5022 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5023 B.buildInstr(AMDGPU::S_GETREG_B32)
5024 .addDef(SavedSPDenormMode)
5025 .addImm(SPDenormModeBitField);
5027 toggleSPDenormMode(true, B, ST, Mode);
5030 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5031 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5032 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5033 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5034 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5035 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5037 if (!PreservesDenormals) {
5038 if (HasDynamicDenormals) {
5039 assert(SavedSPDenormMode);
5040 B.buildInstr(AMDGPU::S_SETREG_B32)
5041 .addReg(SavedSPDenormMode)
5042 .addImm(SPDenormModeBitField);
5043 } else
5044 toggleSPDenormMode(false, B, ST, Mode);
5047 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5048 .addUse(Fma4.getReg(0))
5049 .addUse(Fma1.getReg(0))
5050 .addUse(Fma3.getReg(0))
5051 .addUse(NumeratorScaled.getReg(1))
5052 .setMIFlags(Flags);
5054 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5055 .addUse(Fmas.getReg(0))
5056 .addUse(RHS)
5057 .addUse(LHS)
5058 .setMIFlags(Flags);
5060 MI.eraseFromParent();
5061 return true;
5064 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5065 MachineRegisterInfo &MRI,
5066 MachineIRBuilder &B) const {
5067 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5068 return true;
5070 Register Res = MI.getOperand(0).getReg();
5071 Register LHS = MI.getOperand(1).getReg();
5072 Register RHS = MI.getOperand(2).getReg();
5074 uint16_t Flags = MI.getFlags();
5076 LLT S64 = LLT::scalar(64);
5077 LLT S1 = LLT::scalar(1);
5079 auto One = B.buildFConstant(S64, 1.0);
5081 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5082 .addUse(LHS)
5083 .addUse(RHS)
5084 .addImm(0)
5085 .setMIFlags(Flags);
5087 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5089 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5090 .addUse(DivScale0.getReg(0))
5091 .setMIFlags(Flags);
5093 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5094 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5095 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5097 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5098 .addUse(LHS)
5099 .addUse(RHS)
5100 .addImm(1)
5101 .setMIFlags(Flags);
5103 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5104 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5105 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5107 Register Scale;
5108 if (!ST.hasUsableDivScaleConditionOutput()) {
5109 // Workaround a hardware bug on SI where the condition output from div_scale
5110 // is not usable.
5112 LLT S32 = LLT::scalar(32);
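// Reconstruct the missing condition output by comparing the high dwords of
// the inputs against the div_scale results to detect whether scaling occurred.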
5114 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5115 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5116 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5117 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5119 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5120 Scale1Unmerge.getReg(1));
5121 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5122 Scale0Unmerge.getReg(1));
5123 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5124 } else {
5125 Scale = DivScale1.getReg(1);
5128 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5129 .addUse(Fma4.getReg(0))
5130 .addUse(Fma3.getReg(0))
5131 .addUse(Mul.getReg(0))
5132 .addUse(Scale)
5133 .setMIFlags(Flags);
5135 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5136 .addUse(Fmas.getReg(0))
5137 .addUse(RHS)
5138 .addUse(LHS)
5139 .setMIFlags(Flags);
5141 MI.eraseFromParent();
5142 return true;
5145 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5146 MachineRegisterInfo &MRI,
5147 MachineIRBuilder &B) const {
5148 Register Res0 = MI.getOperand(0).getReg();
5149 Register Res1 = MI.getOperand(1).getReg();
5150 Register Val = MI.getOperand(2).getReg();
5151 uint16_t Flags = MI.getFlags();
5153 LLT Ty = MRI.getType(Res0);
5154 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5156 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5157 .addUse(Val)
5158 .setMIFlags(Flags);
5159 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5160 .addUse(Val)
5161 .setMIFlags(Flags);
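// On subtargets with the fract bug, the frexp instructions don't give the
// expected results for infinities and NaNs, so explicitly select mant = x and
// exp = 0 for non-finite inputs.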
5163 if (ST.hasFractBug()) {
5164 auto Fabs = B.buildFAbs(Ty, Val);
5165 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5166 auto IsFinite =
5167 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5168 auto Zero = B.buildConstant(InstrExpTy, 0);
5169 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5170 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5173 B.buildCopy(Res0, Mant);
5174 B.buildSExtOrTrunc(Res1, Exp);
5176 MI.eraseFromParent();
5177 return true;
5180 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5181 MachineRegisterInfo &MRI,
5182 MachineIRBuilder &B) const {
5183 Register Res = MI.getOperand(0).getReg();
5184 Register LHS = MI.getOperand(2).getReg();
5185 Register RHS = MI.getOperand(3).getReg();
5186 uint16_t Flags = MI.getFlags();
5188 LLT S32 = LLT::scalar(32);
5189 LLT S1 = LLT::scalar(1);
5191 auto Abs = B.buildFAbs(S32, RHS, Flags);
5192 const APFloat C0Val(1.0f);
5194 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5195 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5196 auto C2 = B.buildFConstant(S32, 1.0f);
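// If |den| is very large (> 0x1p+96), pre-scale it by 0x1p-32 so the rcp does
// not underflow, then multiply the result by the same factor to compensate.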
5198 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5199 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5201 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5203 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5204 .addUse(Mul0.getReg(0))
5205 .setMIFlags(Flags);
5207 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5209 B.buildFMul(Res, Sel, Mul1, Flags);
5211 MI.eraseFromParent();
5212 return true;
5215 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5216 MachineRegisterInfo &MRI,
5217 MachineIRBuilder &B) const {
5218 // Bypass the correct expansion that a standard promotion through G_FSQRT would
5219 // get. The f32 op is accurate enough for the f16 case.
5220 unsigned Flags = MI.getFlags();
5221 assert(!ST.has16BitInsts());
5222 const LLT F32 = LLT::scalar(32);
5223 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5224 auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5225 .addUse(Ext.getReg(0))
5226 .setMIFlags(Flags);
5227 B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5228 MI.eraseFromParent();
5229 return true;
5232 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5233 MachineRegisterInfo &MRI,
5234 MachineIRBuilder &B) const {
5235 MachineFunction &MF = B.getMF();
5236 Register Dst = MI.getOperand(0).getReg();
5237 Register X = MI.getOperand(1).getReg();
5238 const unsigned Flags = MI.getFlags();
5239 const LLT S1 = LLT::scalar(1);
5240 const LLT F32 = LLT::scalar(32);
5241 const LLT I32 = LLT::scalar(32);
5243 if (allowApproxFunc(MF, Flags)) {
5244 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5245 .addUse(X)
5246 .setMIFlags(Flags);
5247 MI.eraseFromParent();
5248 return true;
5251 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5252 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5253 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5254 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5255 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5257 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5258 if (needsDenormHandlingF32(MF, X, Flags)) {
5259 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5260 .addUse(SqrtX.getReg(0))
5261 .setMIFlags(Flags);
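// Refine the estimate by testing the residuals of its neighbouring
// representable values (raw bits -1 / +1) and stepping the result down or up
// accordingly.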
5263 auto NegOne = B.buildConstant(I32, -1);
5264 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5266 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5267 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5269 auto PosOne = B.buildConstant(I32, 1);
5270 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5272 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5273 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5275 auto Zero = B.buildFConstant(F32, 0.0f);
5276 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5278 SqrtS =
5279 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5281 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5282 SqrtS =
5283 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5284 } else {
5285 auto SqrtR =
5286 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5287 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5289 auto Half = B.buildFConstant(F32, 0.5f);
5290 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5291 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5292 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5293 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5294 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5295 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5296 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5297 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5300 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5302 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5304 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5306 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5307 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5309 MI.eraseFromParent();
5310 return true;
5313 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5314 MachineRegisterInfo &MRI,
5315 MachineIRBuilder &B) const {
5316 // For double type, the SQRT and RSQ instructions don't have the required
5317 // precision, so we apply Goldschmidt's algorithm to improve the result:
5319 // y0 = rsq(x)
5320 // g0 = x * y0
5321 // h0 = 0.5 * y0
5323 // r0 = 0.5 - h0 * g0
5324 // g1 = g0 * r0 + g0
5325 // h1 = h0 * r0 + h0
5327 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5328 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5329 // h2 = h1 * r1 + h1
5331 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5332 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5334 // sqrt(x) = g3
5336 const LLT S1 = LLT::scalar(1);
5337 const LLT S32 = LLT::scalar(32);
5338 const LLT F64 = LLT::scalar(64);
5340 Register Dst = MI.getOperand(0).getReg();
5341 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5343 Register X = MI.getOperand(1).getReg();
5344 unsigned Flags = MI.getFlags();
5346 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5348 auto ZeroInt = B.buildConstant(S32, 0);
5349 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
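// sqrt halves the exponent, so the ldexp(x, 256) scale-up applied here is
// undone by the ldexp(result, -128) scale-down further below.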
5351 // Scale up input if it is too small.
5352 auto ScaleUpFactor = B.buildConstant(S32, 256);
5353 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5354 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5356 auto SqrtY =
5357 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5359 auto Half = B.buildFConstant(F64, 0.5);
5360 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5361 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5363 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5364 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5366 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5367 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5369 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5370 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5372 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5374 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5375 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5377 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5379 // Scale down the result.
5380 auto ScaleDownFactor = B.buildConstant(S32, -128);
5381 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5382 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5384 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5385 // with finite only or nsz because rsq(+/-0) = +/-inf
5387 // TODO: Check for DAZ and expand to subnormals
5388 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5390 // If x is +INF, +0, or -0, use its original value
5391 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5393 MI.eraseFromParent();
5394 return true;
5397 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5398 MachineRegisterInfo &MRI,
5399 MachineIRBuilder &B) const {
5400 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5401 if (Ty == LLT::scalar(32))
5402 return legalizeFSQRTF32(MI, MRI, B);
5403 if (Ty == LLT::scalar(64))
5404 return legalizeFSQRTF64(MI, MRI, B);
5405 if (Ty == LLT::scalar(16))
5406 return legalizeFSQRTF16(MI, MRI, B);
5407 return false;
5410 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5411 // FIXME: Why do we handle this one but not other removed instructions?
5413 // Reciprocal square root. The clamp prevents infinite results, clamping
5414 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5415 // +-max_float.
5416 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5417 MachineRegisterInfo &MRI,
5418 MachineIRBuilder &B) const {
5419 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5420 return true;
5422 Register Dst = MI.getOperand(0).getReg();
5423 Register Src = MI.getOperand(2).getReg();
5424 auto Flags = MI.getFlags();
5426 LLT Ty = MRI.getType(Dst);
5428 const fltSemantics *FltSemantics;
5429 if (Ty == LLT::scalar(32))
5430 FltSemantics = &APFloat::IEEEsingle();
5431 else if (Ty == LLT::scalar(64))
5432 FltSemantics = &APFloat::IEEEdouble();
5433 else
5434 return false;
5436 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5437 .addUse(Src)
5438 .setMIFlags(Flags);
5440 // We don't need to concern ourselves with the snan handling difference, since
5441 // the rsq has already quieted the input (or not); use the variant that will directly select.
5442 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5443 const bool UseIEEE = MFI->getMode().IEEE;
5445 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5446 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5447 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5449 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5451 if (UseIEEE)
5452 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5453 else
5454 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5455 MI.eraseFromParent();
5456 return true;
5459 // TODO: Fix pointer type handling
5460 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5461 MachineInstr &MI,
5462 Intrinsic::ID IID) const {
5464 MachineIRBuilder &B = Helper.MIRBuilder;
5465 MachineRegisterInfo &MRI = *B.getMRI();
5467 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5468 IID == Intrinsic::amdgcn_permlanex16;
5469 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5470 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5472 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5473 Register Src2, LLT VT) -> Register {
5474 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5475 switch (IID) {
5476 case Intrinsic::amdgcn_readfirstlane:
5477 case Intrinsic::amdgcn_permlane64:
5478 return LaneOp.getReg(0);
5479 case Intrinsic::amdgcn_readlane:
5480 case Intrinsic::amdgcn_set_inactive:
5481 case Intrinsic::amdgcn_set_inactive_chain_arg:
5482 return LaneOp.addUse(Src1).getReg(0);
5483 case Intrinsic::amdgcn_writelane:
5484 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5485 case Intrinsic::amdgcn_permlane16:
5486 case Intrinsic::amdgcn_permlanex16: {
5487 Register Src3 = MI.getOperand(5).getReg();
5488 int64_t Src4 = MI.getOperand(6).getImm();
5489 int64_t Src5 = MI.getOperand(7).getImm();
5490 return LaneOp.addUse(Src1)
5491 .addUse(Src2)
5492 .addUse(Src3)
5493 .addImm(Src4)
5494 .addImm(Src5)
5495 .getReg(0);
5497 case Intrinsic::amdgcn_mov_dpp8:
5498 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5499 case Intrinsic::amdgcn_update_dpp:
5500 return LaneOp.addUse(Src1)
5501 .addImm(MI.getOperand(4).getImm())
5502 .addImm(MI.getOperand(5).getImm())
5503 .addImm(MI.getOperand(6).getImm())
5504 .addImm(MI.getOperand(7).getImm())
5505 .getReg(0);
5506 default:
5507 llvm_unreachable("unhandled lane op");
5511 Register DstReg = MI.getOperand(0).getReg();
5512 Register Src0 = MI.getOperand(2).getReg();
5513 Register Src1, Src2;
5514 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5515 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5516 Src1 = MI.getOperand(3).getReg();
5517 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5518 Src2 = MI.getOperand(4).getReg();
5522 LLT Ty = MRI.getType(DstReg);
5523 unsigned Size = Ty.getSizeInBits();
5525 unsigned SplitSize = 32;
5526 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5527 ST.hasDPALU_DPP() &&
5528 AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5529 SplitSize = 64;
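// Values wider than SplitSize are broken into SplitSize-bit pieces below;
// each piece gets its own lane-op intrinsic and the results are re-merged.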
5531 if (Size == SplitSize) {
5532 // Already legal
5533 return true;
5536 if (Size < 32) {
5537 Src0 = B.buildAnyExt(LLT::scalar(32), Src0).getReg(0);
5539 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5540 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5542 if (IID == Intrinsic::amdgcn_writelane)
5543 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5545 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, LLT::scalar(32));
5546 B.buildTrunc(DstReg, LaneOpDst);
5547 MI.eraseFromParent();
5548 return true;
5551 if (Size % SplitSize != 0)
5552 return false;
5554 LLT PartialResTy = LLT::scalar(SplitSize);
5555 if (Ty.isVector()) {
5556 LLT EltTy = Ty.getElementType();
5557 unsigned EltSize = EltTy.getSizeInBits();
5558 if (EltSize == SplitSize) {
5559 PartialResTy = EltTy;
5560 } else if (EltSize == 16 || EltSize == 32) {
5561 unsigned NElem = SplitSize / EltSize;
5562 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5564 // Handle all other cases via S32/S64 pieces;
5567 SmallVector<Register, 4> PartialRes;
5568 unsigned NumParts = Size / SplitSize;
5569 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5570 MachineInstrBuilder Src1Parts, Src2Parts;
5572 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5573 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5575 if (IID == Intrinsic::amdgcn_writelane)
5576 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5578 for (unsigned i = 0; i < NumParts; ++i) {
5579 Src0 = Src0Parts.getReg(i);
5581 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5582 Src1 = Src1Parts.getReg(i);
5584 if (IID == Intrinsic::amdgcn_writelane)
5585 Src2 = Src2Parts.getReg(i);
5587 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5590 B.buildMergeLikeInstr(DstReg, PartialRes);
5591 MI.eraseFromParent();
5592 return true;
5595 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5596 MachineRegisterInfo &MRI,
5597 MachineIRBuilder &B) const {
5598 uint64_t Offset =
5599 ST.getTargetLowering()->getImplicitParameterOffset(
5600 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5601 LLT DstTy = MRI.getType(DstReg);
5602 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5604 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5605 if (!loadInputValue(KernargPtrReg, B,
5606 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5607 return false;
5609 // FIXME: This should be nuw
5610 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5611 return true;
5614 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5615 /// bits of the pointer and replace them with the stride argument, then
5616 /// merge_values everything together. In the common case of a raw buffer (the
5617 /// stride component is 0), we can just AND off the upper half.
5618 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5619 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5620 Register Result = MI.getOperand(0).getReg();
5621 Register Pointer = MI.getOperand(2).getReg();
5622 Register Stride = MI.getOperand(3).getReg();
5623 Register NumRecords = MI.getOperand(4).getReg();
5624 Register Flags = MI.getOperand(5).getReg();
5626 LLT S32 = LLT::scalar(32);
5628 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5629 auto Unmerge = B.buildUnmerge(S32, Pointer);
5630 Register LowHalf = Unmerge.getReg(0);
5631 Register HighHalf = Unmerge.getReg(1);
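// The resulting descriptor words are {ptr[31:0], ptr[47:32] | stride << 16,
// NumRecords, Flags}; the mask below clears the bits the stride will occupy.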
5633 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5634 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5636 MachineInstrBuilder NewHighHalf = Masked;
5637 std::optional<ValueAndVReg> StrideConst =
5638 getIConstantVRegValWithLookThrough(Stride, MRI);
5639 if (!StrideConst || !StrideConst->Value.isZero()) {
5640 MachineInstrBuilder ShiftedStride;
5641 if (StrideConst) {
5642 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5643 uint32_t ShiftedStrideVal = StrideVal << 16;
5644 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5645 } else {
5646 auto ExtStride = B.buildAnyExt(S32, Stride);
5647 auto ShiftConst = B.buildConstant(S32, 16);
5648 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5650 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5652 Register NewHighHalfReg = NewHighHalf.getReg(0);
5653 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5654 MI.eraseFromParent();
5655 return true;
5658 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5659 MachineRegisterInfo &MRI,
5660 MachineIRBuilder &B) const {
5661 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5662 if (!MFI->isEntryFunction()) {
5663 return legalizePreloadedArgIntrin(MI, MRI, B,
5664 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5667 Register DstReg = MI.getOperand(0).getReg();
5668 if (!getImplicitArgPtr(DstReg, MRI, B))
5669 return false;
5671 MI.eraseFromParent();
5672 return true;
5675 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5676 MachineRegisterInfo &MRI,
5677 MachineIRBuilder &B) const {
5678 Function &F = B.getMF().getFunction();
5679 std::optional<uint32_t> KnownSize =
5680 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5681 if (KnownSize.has_value())
5682 B.buildConstant(DstReg, *KnownSize);
5683 return false;
5686 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5687 MachineRegisterInfo &MRI,
5688 MachineIRBuilder &B) const {
5690 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5691 if (!MFI->isEntryFunction()) {
5692 return legalizePreloadedArgIntrin(MI, MRI, B,
5693 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5696 Register DstReg = MI.getOperand(0).getReg();
5697 if (!getLDSKernelId(DstReg, MRI, B))
5698 return false;
5700 MI.eraseFromParent();
5701 return true;
5704 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5705 MachineRegisterInfo &MRI,
5706 MachineIRBuilder &B,
5707 unsigned AddrSpace) const {
5708 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5709 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5710 Register Hi32 = Unmerge.getReg(1);
5712 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5713 MI.eraseFromParent();
5714 return true;
5717 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5718 // offset (the offset that is included in bounds checking and swizzling, to be
5719 // split between the instruction's voffset and immoffset fields) and soffset
5720 // (the offset that is excluded from bounds checking and swizzling, to go in
5721 // the instruction's soffset field). This function takes the first kind of
5722 // offset and figures out how to split it between voffset and immoffset.
5723 std::pair<Register, unsigned>
5724 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5725 Register OrigOffset) const {
5726 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5727 Register BaseReg;
5728 unsigned ImmOffset;
5729 const LLT S32 = LLT::scalar(32);
5730 MachineRegisterInfo &MRI = *B.getMRI();
5732 std::tie(BaseReg, ImmOffset) =
5733 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5735 // If BaseReg is a pointer, convert it to int.
5736 if (MRI.getType(BaseReg).isPointer())
5737 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5739 // If the immediate value is too big for the immoffset field, put only bits
5740 // that would normally fit in the immoffset field. The remaining value that
5741 // is copied/added for the voffset field is a large power of 2, and it
5742 // stands more chance of being CSEd with the copy/add for another similar
5743 // load/store.
5744 // However, do not do that rounding down if the part left for the VGPR would
5745 // be negative, as it appears to be illegal to have a negative offset in the
5746 // VGPR, even if adding the immediate offset makes it positive.
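// For example, with a hypothetical 12-bit immoffset field (MaxImm = 4095), an
// incoming offset of 5000 splits into a voffset add of 4096 and immoffset 904.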
5747 unsigned Overflow = ImmOffset & ~MaxImm;
5748 ImmOffset -= Overflow;
5749 if ((int32_t)Overflow < 0) {
5750 Overflow += ImmOffset;
5751 ImmOffset = 0;
5754 if (Overflow != 0) {
5755 if (!BaseReg) {
5756 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5757 } else {
5758 auto OverflowVal = B.buildConstant(S32, Overflow);
5759 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5763 if (!BaseReg)
5764 BaseReg = B.buildConstant(S32, 0).getReg(0);
5766 return std::pair(BaseReg, ImmOffset);
5769 /// Handle register layout difference for f16 images for some subtargets.
5770 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5771 MachineRegisterInfo &MRI,
5772 Register Reg,
5773 bool ImageStore) const {
5774 const LLT S16 = LLT::scalar(16);
5775 const LLT S32 = LLT::scalar(32);
5776 LLT StoreVT = MRI.getType(Reg);
5777 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5779 if (ST.hasUnpackedD16VMem()) {
5780 auto Unmerge = B.buildUnmerge(S16, Reg);
5782 SmallVector<Register, 4> WideRegs;
5783 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5784 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5786 int NumElts = StoreVT.getNumElements();
5788 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5789 .getReg(0);
5792 if (ImageStore && ST.hasImageStoreD16Bug()) {
5793 if (StoreVT.getNumElements() == 2) {
5794 SmallVector<Register, 4> PackedRegs;
5795 Reg = B.buildBitcast(S32, Reg).getReg(0);
5796 PackedRegs.push_back(Reg);
5797 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5798 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5799 .getReg(0);
5802 if (StoreVT.getNumElements() == 3) {
5803 SmallVector<Register, 4> PackedRegs;
5804 auto Unmerge = B.buildUnmerge(S16, Reg);
5805 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5806 PackedRegs.push_back(Unmerge.getReg(I));
5807 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5808 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5809 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5812 if (StoreVT.getNumElements() == 4) {
5813 SmallVector<Register, 4> PackedRegs;
5814 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5815 auto Unmerge = B.buildUnmerge(S32, Reg);
5816 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5817 PackedRegs.push_back(Unmerge.getReg(I));
5818 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5819 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5820 .getReg(0);
5823 llvm_unreachable("invalid data type");
5826 if (StoreVT == LLT::fixed_vector(3, S16)) {
5827 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5828 .getReg(0);
5830 return Reg;
5833 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5834 Register VData, LLT MemTy,
5835 bool IsFormat) const {
5836 MachineRegisterInfo *MRI = B.getMRI();
5837 LLT Ty = MRI->getType(VData);
5839 const LLT S16 = LLT::scalar(16);
5841 // Fixup stores whose data is itself a buffer resource (p8); such values are rewritten as v4i32.
5842 if (hasBufferRsrcWorkaround(Ty))
5843 return castBufferRsrcToV4I32(VData, B);
5845 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5846 Ty = getBitcastRegisterType(Ty);
5847 VData = B.buildBitcast(Ty, VData).getReg(0);
5849 // Fixup illegal register types for i8 and i16 stores.
5850 if (Ty == LLT::scalar(8) || Ty == S16) {
5851 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5852 return AnyExt;
5855 if (Ty.isVector()) {
5856 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5857 if (IsFormat)
5858 return handleD16VData(B, *MRI, VData);
5862 return VData;
5865 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5866 LegalizerHelper &Helper,
5867 bool IsTyped,
5868 bool IsFormat) const {
5869 MachineIRBuilder &B = Helper.MIRBuilder;
5870 MachineRegisterInfo &MRI = *B.getMRI();
5872 Register VData = MI.getOperand(1).getReg();
5873 LLT Ty = MRI.getType(VData);
5874 LLT EltTy = Ty.getScalarType();
5875 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5876 const LLT S32 = LLT::scalar(32);
5878 MachineMemOperand *MMO = *MI.memoperands_begin();
5879 const int MemSize = MMO->getSize().getValue();
5880 LLT MemTy = MMO->getMemoryType();
5882 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5884 castBufferRsrcArgToV4I32(MI, B, 2);
5885 Register RSrc = MI.getOperand(2).getReg();
5887 unsigned ImmOffset;
5889 // The typed intrinsics add an immediate after the registers.
5890 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5892 // The struct intrinsic variants add one additional operand over raw.
5893 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5894 Register VIndex;
5895 int OpOffset = 0;
5896 if (HasVIndex) {
5897 VIndex = MI.getOperand(3).getReg();
5898 OpOffset = 1;
5899 } else {
5900 VIndex = B.buildConstant(S32, 0).getReg(0);
5903 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5904 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5906 unsigned Format = 0;
5907 if (IsTyped) {
5908 Format = MI.getOperand(5 + OpOffset).getImm();
5909 ++OpOffset;
5912 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5914 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5916 unsigned Opc;
5917 if (IsTyped) {
5918 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5919 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5920 } else if (IsFormat) {
5921 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5922 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5923 } else {
5924 switch (MemSize) {
5925 case 1:
5926 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5927 break;
5928 case 2:
5929 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5930 break;
5931 default:
5932 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5933 break;
5937 auto MIB = B.buildInstr(Opc)
5938 .addUse(VData) // vdata
5939 .addUse(RSrc) // rsrc
5940 .addUse(VIndex) // vindex
5941 .addUse(VOffset) // voffset
5942 .addUse(SOffset) // soffset
5943 .addImm(ImmOffset); // offset(imm)
5945 if (IsTyped)
5946 MIB.addImm(Format);
5948 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5949 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5950 .addMemOperand(MMO);
5952 MI.eraseFromParent();
5953 return true;
5956 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5957 Register VIndex, Register VOffset, Register SOffset,
5958 unsigned ImmOffset, unsigned Format,
5959 unsigned AuxiliaryData, MachineMemOperand *MMO,
5960 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5961 auto MIB = B.buildInstr(Opc)
5962 .addDef(LoadDstReg) // vdata
5963 .addUse(RSrc) // rsrc
5964 .addUse(VIndex) // vindex
5965 .addUse(VOffset) // voffset
5966 .addUse(SOffset) // soffset
5967 .addImm(ImmOffset); // offset(imm)
5969 if (IsTyped)
5970 MIB.addImm(Format);
5972 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5973 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5974 .addMemOperand(MMO);
5977 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5978 LegalizerHelper &Helper,
5979 bool IsFormat,
5980 bool IsTyped) const {
5981 MachineIRBuilder &B = Helper.MIRBuilder;
5982 MachineRegisterInfo &MRI = *B.getMRI();
5983 GISelChangeObserver &Observer = Helper.Observer;
5985 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5986 MachineMemOperand *MMO = *MI.memoperands_begin();
5987 const LLT MemTy = MMO->getMemoryType();
5988 const LLT S32 = LLT::scalar(32);
5990 Register Dst = MI.getOperand(0).getReg();
5992 Register StatusDst;
5993 int OpOffset = 0;
5994 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5995 bool IsTFE = MI.getNumExplicitDefs() == 2;
5996 if (IsTFE) {
5997 StatusDst = MI.getOperand(1).getReg();
5998 ++OpOffset;
6001 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6002 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6004 // The typed intrinsics add an immediate after the registers.
6005 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6007 // The struct intrinsic variants add one additional operand over raw.
6008 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6009 Register VIndex;
6010 if (HasVIndex) {
6011 VIndex = MI.getOperand(3 + OpOffset).getReg();
6012 ++OpOffset;
6013 } else {
6014 VIndex = B.buildConstant(S32, 0).getReg(0);
6017 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6018 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6020 unsigned Format = 0;
6021 if (IsTyped) {
6022 Format = MI.getOperand(5 + OpOffset).getImm();
6023 ++OpOffset;
6026 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6027 unsigned ImmOffset;
6029 LLT Ty = MRI.getType(Dst);
6030 // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6031 // logic doesn't have to handle that case.
6032 if (hasBufferRsrcWorkaround(Ty)) {
6033 Observer.changingInstr(MI);
6034 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6035 Observer.changedInstr(MI);
6036 Dst = MI.getOperand(0).getReg();
6037 B.setInsertPt(B.getMBB(), MI);
6039 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6040 Ty = getBitcastRegisterType(Ty);
6041 Observer.changingInstr(MI);
6042 Helper.bitcastDst(MI, Ty, 0);
6043 Observer.changedInstr(MI);
6044 Dst = MI.getOperand(0).getReg();
6045 B.setInsertPt(B.getMBB(), MI);
6048 LLT EltTy = Ty.getScalarType();
6049 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6050 const bool Unpacked = ST.hasUnpackedD16VMem();
6052 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6054 unsigned Opc;
6056 // TODO: Support TFE for typed and narrow loads.
6057 if (IsTyped) {
6058 if (IsTFE)
6059 return false;
6060 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6061 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6062 } else if (IsFormat) {
6063 if (IsD16) {
6064 if (IsTFE)
6065 return false;
6066 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6067 } else {
6068 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6069 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6071 } else {
6072 switch (MemTy.getSizeInBits()) {
6073 case 8:
6074 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6075 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6076 break;
6077 case 16:
6078 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6079 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6080 break;
6081 default:
6082 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6083 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6084 break;
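// TFE loads return an extra status dword, so load NumValueDWords + 1 dwords
// into a wide temporary and unmerge the value and status pieces afterwards.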
6088 if (IsTFE) {
6089 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6090 unsigned NumLoadDWords = NumValueDWords + 1;
6091 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6092 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6093 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6094 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6095 if (MemTy.getSizeInBits() < 32) {
6096 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6097 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6098 B.buildTrunc(Dst, ExtDst);
6099 } else if (NumValueDWords == 1) {
6100 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6101 } else {
6102 SmallVector<Register, 5> LoadElts;
6103 for (unsigned I = 0; I != NumValueDWords; ++I)
6104 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6105 LoadElts.push_back(StatusDst);
6106 B.buildUnmerge(LoadElts, LoadDstReg);
6107 LoadElts.truncate(NumValueDWords);
6108 B.buildMergeLikeInstr(Dst, LoadElts);
6110 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6111 (IsD16 && !Ty.isVector())) {
6112 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6113 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6114 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6115 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6116 B.buildTrunc(Dst, LoadDstReg);
6117 } else if (Unpacked && IsD16 && Ty.isVector()) {
6118 LLT UnpackedTy = Ty.changeElementSize(32);
6119 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6120 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6121 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6122 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6123 // FIXME: G_TRUNC should work, but legalization currently fails
6124 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6125 SmallVector<Register, 4> Repack;
6126 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6127 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6128 B.buildMergeLikeInstr(Dst, Repack);
6129 } else {
6130 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6131 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6134 MI.eraseFromParent();
6135 return true;
6138 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6139 switch (IntrID) {
6140 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6142 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6145 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6147 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6148 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6149 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6150 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6151 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6152 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6153 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6155 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6156 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6157 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6158 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6159 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6160 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6162 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6163 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6164 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6165 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6166 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6167 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6168 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6169 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6170 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6171 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6172 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6173 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6174 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6175 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6176 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6177 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6178 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6179 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6180 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6181 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6182 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6183 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6184 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6185 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6186 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6187 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6188 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6189 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6190 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6192 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6193 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6194 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6195 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6197 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6198 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6199 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6200 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6201 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6202 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6203 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6205 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6207 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6210 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6212 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6215 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6217 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6220 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6221 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6222 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6223 default:
6224 llvm_unreachable("unhandled atomic opcode");
6228 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6229 MachineIRBuilder &B,
6230 Intrinsic::ID IID) const {
6231 const bool IsCmpSwap =
6232 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6233 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6234 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6235 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6237 Register Dst = MI.getOperand(0).getReg();
6238 // Since we don't have 128-bit atomics, we don't need to handle the case of
6239 // p8 arguments to the atomic itself.
6240 Register VData = MI.getOperand(2).getReg();
6242 Register CmpVal;
6243 int OpOffset = 0;
6245 if (IsCmpSwap) {
6246 CmpVal = MI.getOperand(3).getReg();
6247 ++OpOffset;
6250 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6251 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6252 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6254 // The struct intrinsic variants add one additional operand over raw.
6255 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6256 Register VIndex;
6257 if (HasVIndex) {
6258 VIndex = MI.getOperand(4 + OpOffset).getReg();
6259 ++OpOffset;
6260 } else {
6261 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6264 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6265 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6266 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6268 MachineMemOperand *MMO = *MI.memoperands_begin();
6270 unsigned ImmOffset;
6271 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6273 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6274 .addDef(Dst)
6275 .addUse(VData); // vdata
6277 if (IsCmpSwap)
6278 MIB.addReg(CmpVal);
6280 MIB.addUse(RSrc) // rsrc
6281 .addUse(VIndex) // vindex
6282 .addUse(VOffset) // voffset
6283 .addUse(SOffset) // soffset
6284 .addImm(ImmOffset) // offset(imm)
6285 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6286 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6287 .addMemOperand(MMO);
6289 MI.eraseFromParent();
6290 return true;
6293 /// Pack the 16-bit address and gradient operands of \p MI into dword-sized
6294 /// registers of <2 x s16> elements, appending the results to \p PackedAddrs.
6295 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6296 SmallVectorImpl<Register> &PackedAddrs,
6297 unsigned ArgOffset,
6298 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6299 bool IsA16, bool IsG16) {
6300 const LLT S16 = LLT::scalar(16);
6301 const LLT V2S16 = LLT::fixed_vector(2, 16);
6302 auto EndIdx = Intr->VAddrEnd;
6304 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6305 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6306 if (!SrcOp.isReg())
6307 continue; // _L to _LZ may have eliminated this.
6309 Register AddrReg = SrcOp.getReg();
6311 if ((I < Intr->GradientStart) ||
6312 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6313 (I >= Intr->CoordStart && !IsA16)) {
6314 if ((I < Intr->GradientStart) && IsA16 &&
6315 (B.getMRI()->getType(AddrReg) == S16)) {
6316 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6317 // Special handling of bias when A16 is on. Bias is of type half but
6318 // occupies a full 32-bit slot.
6319 PackedAddrs.push_back(
6320 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6321 .getReg(0));
6322 } else {
6323 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6324 "Bias needs to be converted to 16 bit in A16 mode");
6325 // Handle any gradient or coordinate operands that should not be packed
6326 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6327 PackedAddrs.push_back(AddrReg);
6329 } else {
6330 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6331 // derivatives dx/dh and dx/dv are packed with undef.
6332 if (((I + 1) >= EndIdx) ||
6333 ((Intr->NumGradients / 2) % 2 == 1 &&
6334 (I == static_cast<unsigned>(Intr->GradientStart +
6335 (Intr->NumGradients / 2) - 1) ||
6336 I == static_cast<unsigned>(Intr->GradientStart +
6337 Intr->NumGradients - 1))) ||
6338 // Check for _L to _LZ optimization
6339 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6340 PackedAddrs.push_back(
6341 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6342 .getReg(0));
6343 } else {
6344 PackedAddrs.push_back(
6345 B.buildBuildVector(
6346 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6347 .getReg(0));
6348 ++I;
6354 /// Convert from separate vaddr components to a single vector address register,
6355 /// and replace the remaining operands with $noreg.
6356 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6357 int DimIdx, int NumVAddrs) {
6358 const LLT S32 = LLT::scalar(32);
6359 (void)S32;
6360 SmallVector<Register, 8> AddrRegs;
6361 for (int I = 0; I != NumVAddrs; ++I) {
6362 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6363 if (SrcOp.isReg()) {
6364 AddrRegs.push_back(SrcOp.getReg());
6365 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6369 int NumAddrRegs = AddrRegs.size();
6370 if (NumAddrRegs != 1) {
6371 auto VAddr =
6372 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6373 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6376 for (int I = 1; I != NumVAddrs; ++I) {
6377 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6378 if (SrcOp.isReg())
6379 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6383 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6385 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
6386 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6387 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6388 /// registers.
6390 /// We don't want to directly select image instructions just yet, but also want
6391 /// to expose all register repacking to the legalizer/combiners. We also don't
6392 /// want a selected instruction entering RegBankSelect. In order to avoid
6393 /// defining a multitude of intermediate image instructions, directly hack on
6394 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6395 /// padding now-unnecessary arguments with $noreg.
6396 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6397 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6398 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6400 const MachineFunction &MF = *MI.getMF();
6401 const unsigned NumDefs = MI.getNumExplicitDefs();
6402 const unsigned ArgOffset = NumDefs + 1;
6403 bool IsTFE = NumDefs == 2;
6404 // We are only processing the operands of d16 image operations on subtargets
6405 // that use the unpacked register layout, or need to repack the TFE result.
6407 // TODO: Do we need to guard against already legalized intrinsics?
6408 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6409 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6411 MachineRegisterInfo *MRI = B.getMRI();
6412 const LLT S32 = LLT::scalar(32);
6413 const LLT S16 = LLT::scalar(16);
6414 const LLT V2S16 = LLT::fixed_vector(2, 16);
6416 unsigned DMask = 0;
6417 Register VData;
6418 LLT Ty;
6420 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6421 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6422 Ty = MRI->getType(VData);
6425 const bool IsAtomicPacked16Bit =
6426 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6427 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6429 // Check for 16 bit addresses and pack if true.
6430 LLT GradTy =
6431 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6432 LLT AddrTy =
6433 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6434 const bool IsG16 =
6435 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6436 const bool IsA16 = AddrTy == S16;
6437 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6439 int DMaskLanes = 0;
6440 if (!BaseOpcode->Atomic) {
6441 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6442 if (BaseOpcode->Gather4) {
6443 DMaskLanes = 4;
6444 } else if (DMask != 0) {
6445 DMaskLanes = llvm::popcount(DMask);
6446 } else if (!IsTFE && !BaseOpcode->Store) {
6447 // If dmask is 0, this is a no-op load. This can be eliminated.
6448 B.buildUndef(MI.getOperand(0));
6449 MI.eraseFromParent();
6450 return true;
6454 Observer.changingInstr(MI);
6455 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6457 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6458 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6459 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6460 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6461 unsigned NewOpcode = LoadOpcode;
6462 if (BaseOpcode->Store)
6463 NewOpcode = StoreOpcode;
6464 else if (BaseOpcode->NoReturn)
6465 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6467 // Track that we legalized this
6468 MI.setDesc(B.getTII().get(NewOpcode));
6470 // Expecting to get an error flag since TFE is on - and dmask is 0. Force
6471 // dmask to be at least 1, otherwise the instruction will fail.
6472 if (IsTFE && DMask == 0) {
6473 DMask = 0x1;
6474 DMaskLanes = 1;
6475 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6478 if (BaseOpcode->Atomic) {
6479 Register VData0 = MI.getOperand(2).getReg();
6480 LLT Ty = MRI->getType(VData0);
6482 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6483 if (Ty.isVector() && !IsAtomicPacked16Bit)
6484 return false;
6486 if (BaseOpcode->AtomicX2) {
6487 Register VData1 = MI.getOperand(3).getReg();
6488 // The two values are packed in one register.
6489 LLT PackedTy = LLT::fixed_vector(2, Ty);
6490 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6491 MI.getOperand(2).setReg(Concat.getReg(0));
6492 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6496 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6498 // Rewrite the addressing register layout before doing anything else.
6499 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6500 // 16 bit gradients are supported, but are tied to the A16 control
6501 // so both gradients and addresses must be 16 bit
6502 return false;
6505 if (IsA16 && !ST.hasA16()) {
6506 // A16 not supported
6507 return false;
6510 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6511 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6513 if (IsA16 || IsG16) {
6514 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6515 // instructions expect VGPR_32
6516 SmallVector<Register, 4> PackedRegs;
6518 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6520 // See also below in the non-a16 branch
6521 const bool UseNSA = ST.hasNSAEncoding() &&
6522 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6523 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6524 const bool UsePartialNSA =
6525 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6527 if (UsePartialNSA) {
6528 // Pack registers that would go over NSAMaxSize into last VAddr register
6529 LLT PackedAddrTy =
6530 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6531 auto Concat = B.buildConcatVectors(
6532 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6533 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6534 PackedRegs.resize(NSAMaxSize);
6535 } else if (!UseNSA && PackedRegs.size() > 1) {
6536 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6537 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6538 PackedRegs[0] = Concat.getReg(0);
6539 PackedRegs.resize(1);
6542 const unsigned NumPacked = PackedRegs.size();
6543 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6544 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6545 if (!SrcOp.isReg()) {
6546 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6547 continue;
6550 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6552 if (I - Intr->VAddrStart < NumPacked)
6553 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6554 else
6555 SrcOp.setReg(AMDGPU::NoRegister);
6557 } else {
6558 // If the register allocator cannot place the address registers contiguously
6559 // without introducing moves, then using the non-sequential address encoding
6560 // is always preferable, since it saves VALU instructions and is usually a
6561 // wash in terms of code size or even better.
6563 // However, we currently have no way of hinting to the register allocator
6564 // that MIMG addresses should be placed contiguously when it is possible to
6565 // do so, so force non-NSA for the common 2-address case as a heuristic.
6567 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6568 // allocation when possible.
6570 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6571 // set of the remaining addresses.
6572 const bool UseNSA = ST.hasNSAEncoding() &&
6573 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6574 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6575 const bool UsePartialNSA =
6576 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6578 if (UsePartialNSA) {
6579 convertImageAddrToPacked(B, MI,
6580 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6581 Intr->NumVAddrs - NSAMaxSize + 1);
6582 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6583 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6584 Intr->NumVAddrs);
6588 int Flags = 0;
6589 if (IsA16)
6590 Flags |= 1;
6591 if (IsG16)
6592 Flags |= 2;
6593 MI.addOperand(MachineOperand::CreateImm(Flags));
6595 if (BaseOpcode->NoReturn) { // No TFE for stores?
6596 // TODO: Handle dmask trim
6597 if (!Ty.isVector() || !IsD16)
6598 return true;
6600 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6601 if (RepackedReg != VData) {
6602 MI.getOperand(1).setReg(RepackedReg);
6605 return true;
6608 Register DstReg = MI.getOperand(0).getReg();
6609 const LLT EltTy = Ty.getScalarType();
6610 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6612 // Confirm that the return type is large enough for the dmask specified
6613 if (NumElts < DMaskLanes)
6614 return false;
6616 if (NumElts > 4 || DMaskLanes > 4)
6617 return false;
6619   // Image atomic instructions use DMask to specify how many bits the
6620   // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6621   // DMaskLanes defaults to '0' for image atomics.
6622   // We must make sure that atomic variants (especially packed ones) are not
6623   // truncated from v2s16 or v4s16 to s16.
6625   // changeElementCount will be needed for image loads, where Ty is always scalar.
6626 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6627 const LLT AdjustedTy =
6628 DMaskLanes == 0
6629 ? Ty
6630 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6632   // The raw dword-aligned data component of the load. The only legal cases
6633   // where this matters should be when using the packed D16 format, for
6634   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6635 LLT RoundedTy;
6637 // S32 vector to cover all data, plus TFE result element.
6638 LLT TFETy;
6640 // Register type to use for each loaded component. Will be S32 or V2S16.
6641 LLT RegTy;
6643 if (IsD16 && ST.hasUnpackedD16VMem()) {
6644 RoundedTy =
6645 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6646 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6647 RegTy = S32;
6648 } else {
6649 unsigned EltSize = EltTy.getSizeInBits();
6650 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6651 unsigned RoundedSize = 32 * RoundedElts;
6652 RoundedTy = LLT::scalarOrVector(
6653 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6654 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6655 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6658 // The return type does not need adjustment.
6659 // TODO: Should we change s16 case to s32 or <2 x s16>?
6660 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6661 return true;
6663 Register Dst1Reg;
6665 // Insert after the instruction.
6666 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6668 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6669 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6670 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6671 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6673 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6675 MI.getOperand(0).setReg(NewResultReg);
6677 // In the IR, TFE is supposed to be used with a 2 element struct return
6678 // type. The instruction really returns these two values in one contiguous
6679 // register, with one additional dword beyond the loaded data. Rewrite the
6680 // return type to use a single register result.
6682 if (IsTFE) {
6683 Dst1Reg = MI.getOperand(1).getReg();
6684 if (MRI->getType(Dst1Reg) != S32)
6685 return false;
6687 // TODO: Make sure the TFE operand bit is set.
6688 MI.removeOperand(1);
6690 // Handle the easy case that requires no repack instructions.
6691 if (Ty == S32) {
6692 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6693 return true;
6697 // Now figure out how to copy the new result register back into the old
6698 // result.
6699 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6701 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6703 if (ResultNumRegs == 1) {
6704 assert(!IsTFE);
6705 ResultRegs[0] = NewResultReg;
6706 } else {
6707 // We have to repack into a new vector of some kind.
6708 for (int I = 0; I != NumDataRegs; ++I)
6709 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6710 B.buildUnmerge(ResultRegs, NewResultReg);
6712 // Drop the final TFE element to get the data part. The TFE result is
6713 // directly written to the right place already.
6714 if (IsTFE)
6715 ResultRegs.resize(NumDataRegs);
6718 // For an s16 scalar result, we form an s32 result with a truncate regardless
6719 // of packed vs. unpacked.
6720 if (IsD16 && !Ty.isVector()) {
6721 B.buildTrunc(DstReg, ResultRegs[0]);
6722 return true;
6725 // Avoid a build/concat_vector of 1 entry.
6726 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6727 B.buildBitcast(DstReg, ResultRegs[0]);
6728 return true;
6731 assert(Ty.isVector());
6733 if (IsD16) {
6734 // For packed D16 results with TFE enabled, all the data components are
6735 // S32. Cast back to the expected type.
6737   // TODO: We don't really need to load s32 elements. We would only need one
6738   // cast for the TFE result if a multiple of v2s16 was used.
6739 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6740 for (Register &Reg : ResultRegs)
6741 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6742 } else if (ST.hasUnpackedD16VMem()) {
6743 for (Register &Reg : ResultRegs)
6744 Reg = B.buildTrunc(S16, Reg).getReg(0);
6748 auto padWithUndef = [&](LLT Ty, int NumElts) {
6749 if (NumElts == 0)
6750 return;
6751 Register Undef = B.buildUndef(Ty).getReg(0);
6752 for (int I = 0; I != NumElts; ++I)
6753 ResultRegs.push_back(Undef);
6756 // Pad out any elements eliminated due to the dmask.
6757 LLT ResTy = MRI->getType(ResultRegs[0]);
6758 if (!ResTy.isVector()) {
6759 padWithUndef(ResTy, NumElts - ResultRegs.size());
6760 B.buildBuildVector(DstReg, ResultRegs);
6761 return true;
6764 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6765 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6767 // Deal with the one annoying legal case.
6768 const LLT V3S16 = LLT::fixed_vector(3, 16);
6769 if (Ty == V3S16) {
6770 if (IsTFE) {
6771 if (ResultRegs.size() == 1) {
6772 NewResultReg = ResultRegs[0];
6773 } else if (ResultRegs.size() == 2) {
6774 LLT V4S16 = LLT::fixed_vector(4, 16);
6775 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6776 } else {
6777 return false;
6781 if (MRI->getType(DstReg).getNumElements() <
6782 MRI->getType(NewResultReg).getNumElements()) {
6783 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6784 } else {
6785 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6787 return true;
6790 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6791 B.buildConcatVectors(DstReg, ResultRegs);
6792 return true;
6795 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6796 MachineInstr &MI) const {
6797 MachineIRBuilder &B = Helper.MIRBuilder;
6798 GISelChangeObserver &Observer = Helper.Observer;
6800 Register OrigDst = MI.getOperand(0).getReg();
6801 Register Dst;
6802 LLT Ty = B.getMRI()->getType(OrigDst);
6803 unsigned Size = Ty.getSizeInBits();
6804 MachineFunction &MF = B.getMF();
6805 unsigned Opc = 0;
6806 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6807 assert(Size == 8 || Size == 16);
6808 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6809 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6810   // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6811   // destination register.
6812 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6813 } else {
6814 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6815 Dst = OrigDst;
6818 Observer.changingInstr(MI);
6820 // Handle needing to s.buffer.load() a p8 value.
6821 if (hasBufferRsrcWorkaround(Ty)) {
6822 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6823 B.setInsertPt(B.getMBB(), MI);
6825 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6826 Ty = getBitcastRegisterType(Ty);
6827 Helper.bitcastDst(MI, Ty, 0);
6828 B.setInsertPt(B.getMBB(), MI);
6831 // FIXME: We don't really need this intermediate instruction. The intrinsic
6832 // should be fixed to have a memory operand. Since it's readnone, we're not
6833 // allowed to add one.
6834 MI.setDesc(B.getTII().get(Opc));
6835 MI.removeOperand(1); // Remove intrinsic ID
6837 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6838 const unsigned MemSize = (Size + 7) / 8;
6839 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6840 getTypeForLLT(Ty, MF.getFunction().getContext()));
6841 MachineMemOperand *MMO = MF.getMachineMemOperand(
6842 MachinePointerInfo(),
6843 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6844 MachineMemOperand::MOInvariant,
6845 MemSize, MemAlign);
6846 MI.addMemOperand(MF, MMO);
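  // For sub-dword loads the new instruction produces a 32-bit result; truncate
  // it back to the original narrow destination.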
6847 if (Dst != OrigDst) {
6848 MI.getOperand(0).setReg(Dst);
6849 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6850 B.buildTrunc(OrigDst, Dst);
6853 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6854 // always be legal. We may need to restore this to a 96-bit result if it turns
6855 // out this needs to be converted to a vector load during RegBankSelect.
6856 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6857 if (Ty.isVector())
6858 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6859 else
6860 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6863 Observer.changedInstr(MI);
6864 return true;
6867 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6868 MachineInstr &MI) const {
6869 MachineIRBuilder &B = Helper.MIRBuilder;
6870 GISelChangeObserver &Observer = Helper.Observer;
6871 Observer.changingInstr(MI);
6872 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6873 MI.removeOperand(0); // Remove intrinsic ID
6874 castBufferRsrcArgToV4I32(MI, B, 0);
6875 Observer.changedInstr(MI);
6876 return true;
6879 // TODO: Move to selection
6880 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6881 MachineRegisterInfo &MRI,
6882 MachineIRBuilder &B) const {
6883 if (!ST.isTrapHandlerEnabled() ||
6884 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6885 return legalizeTrapEndpgm(MI, MRI, B);
6887 return ST.supportsGetDoorbellID() ?
6888 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6891 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6892 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6893 const DebugLoc &DL = MI.getDebugLoc();
6894 MachineBasicBlock &BB = B.getMBB();
6895 MachineFunction *MF = BB.getParent();
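  // If the trap is already the last instruction of a block with no successors,
  // just emit the endpgm in place.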
6897 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6898 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6899 .addImm(0);
6900 MI.eraseFromParent();
6901 return true;
6904 // We need a block split to make the real endpgm a terminator. We also don't
6905 // want to break phis in successor blocks, so we can't just delete to the
6906 // end of the block.
6907 BB.splitAt(MI, false /*UpdateLiveIns*/);
6908 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6909 MF->push_back(TrapBB);
6910 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6911 .addImm(0);
6912 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6913 .addMBB(TrapBB);
6915 BB.addSuccessor(TrapBB);
6916 MI.eraseFromParent();
6917 return true;
6920 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6921 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6922 MachineFunction &MF = B.getMF();
6923 const LLT S64 = LLT::scalar(64);
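  // The trap handler ABI expects the queue pointer in SGPR0_SGPR1.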
6925 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6926 // For code object version 5, queue_ptr is passed through implicit kernarg.
6927 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6928 AMDGPU::AMDHSA_COV5) {
6929 AMDGPUTargetLowering::ImplicitParameter Param =
6930 AMDGPUTargetLowering::QUEUE_PTR;
6931 uint64_t Offset =
6932 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6934 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6935 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6937 if (!loadInputValue(KernargPtrReg, B,
6938 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6939 return false;
6941 // TODO: can we be smarter about machine pointer info?
6942 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6943 MachineMemOperand *MMO = MF.getMachineMemOperand(
6944 PtrInfo,
6945 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6946 MachineMemOperand::MOInvariant,
6947 LLT::scalar(64), commonAlignment(Align(64), Offset));
6949 // Pointer address
6950 Register LoadAddr = MRI.createGenericVirtualRegister(
6951 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6952 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6953 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6954 // Load address
6955 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6956 B.buildCopy(SGPR01, Temp);
6957 B.buildInstr(AMDGPU::S_TRAP)
6958 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6959 .addReg(SGPR01, RegState::Implicit);
6960 MI.eraseFromParent();
6961 return true;
6964   // Pass the queue pointer to the trap handler as input, and insert the trap instruction.
6965 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6966 Register LiveIn =
6967 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6968 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6969 return false;
6971 B.buildCopy(SGPR01, LiveIn);
6972 B.buildInstr(AMDGPU::S_TRAP)
6973 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6974 .addReg(SGPR01, RegState::Implicit);
6976 MI.eraseFromParent();
6977 return true;
6980 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6981 MachineRegisterInfo &MRI,
6982 MachineIRBuilder &B) const {
6983 // We need to simulate the 's_trap 2' instruction on targets that run in
6984 // PRIV=1 (where it is treated as a nop).
6985 if (ST.hasPrivEnabledTrap2NopBug()) {
6986 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6987 MI.getDebugLoc());
6988 MI.eraseFromParent();
6989 return true;
6992 B.buildInstr(AMDGPU::S_TRAP)
6993 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6994 MI.eraseFromParent();
6995 return true;
6998 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6999 MachineRegisterInfo &MRI,
7000 MachineIRBuilder &B) const {
7001   // If this is a non-HSA path or the trap handler is disabled, report a
7002   // warning accordingly.
7003 if (!ST.isTrapHandlerEnabled() ||
7004 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7005 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
7006 "debugtrap handler not supported",
7007 MI.getDebugLoc(), DS_Warning);
7008 LLVMContext &Ctx = B.getMF().getFunction().getContext();
7009 Ctx.diagnose(NoTrap);
7010 } else {
7011 // Insert debug-trap instruction
7012 B.buildInstr(AMDGPU::S_TRAP)
7013 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7016 MI.eraseFromParent();
7017 return true;
7020 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
7021 MachineIRBuilder &B) const {
7022 MachineRegisterInfo &MRI = *B.getMRI();
7023 const LLT S16 = LLT::scalar(16);
7024 const LLT S32 = LLT::scalar(32);
7025 const LLT V2S16 = LLT::fixed_vector(2, 16);
7026 const LLT V3S32 = LLT::fixed_vector(3, 32);
7028 Register DstReg = MI.getOperand(0).getReg();
7029 Register NodePtr = MI.getOperand(2).getReg();
7030 Register RayExtent = MI.getOperand(3).getReg();
7031 Register RayOrigin = MI.getOperand(4).getReg();
7032 Register RayDir = MI.getOperand(5).getReg();
7033 Register RayInvDir = MI.getOperand(6).getReg();
7034 Register TDescr = MI.getOperand(7).getReg();
7036 if (!ST.hasGFX10_AEncoding()) {
7037 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
7038 "intrinsic not supported on subtarget",
7039 MI.getDebugLoc());
7040 B.getMF().getFunction().getContext().diagnose(BadIntrin);
7041 return false;
7044 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7045 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7046 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7047 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7048 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7049 const unsigned NumVDataDwords = 4;
7050 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7051 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7052 const bool UseNSA =
7053 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7055 const unsigned BaseOpcodes[2][2] = {
7056 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7057 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7058 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7059 int Opcode;
7060 if (UseNSA) {
7061 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7062 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7063 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7064 : AMDGPU::MIMGEncGfx10NSA,
7065 NumVDataDwords, NumVAddrDwords);
7066 } else {
7067 assert(!IsGFX12Plus);
7068 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7069 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7070 : AMDGPU::MIMGEncGfx10Default,
7071 NumVDataDwords, NumVAddrDwords);
7073 assert(Opcode != -1);
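  // The vaddr operands are: the node pointer (one or two dwords), the ray
  // extent, then the ray origin, direction and inverse direction, with the
  // direction vectors packed more tightly when A16 is used.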
7075 SmallVector<Register, 12> Ops;
7076 if (UseNSA && IsGFX11Plus) {
7077 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7078 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7079 auto Merged = B.buildMergeLikeInstr(
7080 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7081 Ops.push_back(Merged.getReg(0));
7084 Ops.push_back(NodePtr);
7085 Ops.push_back(RayExtent);
7086 packLanes(RayOrigin);
7088 if (IsA16) {
7089 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7090 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7091 auto MergedDir = B.buildMergeLikeInstr(
7092 V3S32,
7093 {B.buildBitcast(
7094 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7095 UnmergeRayDir.getReg(0)}))
7096 .getReg(0),
7097 B.buildBitcast(
7098 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7099 UnmergeRayDir.getReg(1)}))
7100 .getReg(0),
7101 B.buildBitcast(
7102 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7103 UnmergeRayDir.getReg(2)}))
7104 .getReg(0)});
7105 Ops.push_back(MergedDir.getReg(0));
7106 } else {
7107 packLanes(RayDir);
7108 packLanes(RayInvDir);
7110 } else {
7111 if (Is64) {
7112 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7113 Ops.push_back(Unmerge.getReg(0));
7114 Ops.push_back(Unmerge.getReg(1));
7115 } else {
7116 Ops.push_back(NodePtr);
7118 Ops.push_back(RayExtent);
7120 auto packLanes = [&Ops, &S32, &B](Register Src) {
7121 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7122 Ops.push_back(Unmerge.getReg(0));
7123 Ops.push_back(Unmerge.getReg(1));
7124 Ops.push_back(Unmerge.getReg(2));
7127 packLanes(RayOrigin);
7128 if (IsA16) {
7129 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7130 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7131 Register R1 = MRI.createGenericVirtualRegister(S32);
7132 Register R2 = MRI.createGenericVirtualRegister(S32);
7133 Register R3 = MRI.createGenericVirtualRegister(S32);
7134 B.buildMergeLikeInstr(R1,
7135 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7136 B.buildMergeLikeInstr(
7137 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7138 B.buildMergeLikeInstr(
7139 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7140 Ops.push_back(R1);
7141 Ops.push_back(R2);
7142 Ops.push_back(R3);
7143 } else {
7144 packLanes(RayDir);
7145 packLanes(RayInvDir);
7149 if (!UseNSA) {
7150     // Build a single vector containing all the operands prepared so far.
7151 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7152 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7153 Ops.clear();
7154 Ops.push_back(MergedOps);
7157 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7158 .addDef(DstReg)
7159 .addImm(Opcode);
7161 for (Register R : Ops) {
7162 MIB.addUse(R);
7165 MIB.addUse(TDescr)
7166 .addImm(IsA16 ? 1 : 0)
7167 .cloneMemRefs(MI);
7169 MI.eraseFromParent();
7170 return true;
7173 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7174 MachineIRBuilder &B) const {
7175 const SITargetLowering *TLI = ST.getTargetLowering();
7176 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7177 Register DstReg = MI.getOperand(0).getReg();
7178 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7179 MI.eraseFromParent();
7180 return true;
7183 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7184 MachineIRBuilder &B) const {
7185 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7186 if (!ST.hasArchitectedSGPRs())
7187 return false;
7188 LLT S32 = LLT::scalar(32);
7189 Register DstReg = MI.getOperand(0).getReg();
7190 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7191 auto LSB = B.buildConstant(S32, 25);
7192 auto Width = B.buildConstant(S32, 5);
7193 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7194 MI.eraseFromParent();
7195 return true;
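// Hardware register fields used when legalizing G_GET_FPENV / G_SET_FPENV: the
// low 23 bits of the MODE register and the low 5 bits of the trap-status
// (TRAPSTS) register.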
7198 static constexpr unsigned FPEnvModeBitField =
7199 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7201 static constexpr unsigned FPEnvTrapBitField =
7202 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7204 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7205 MachineRegisterInfo &MRI,
7206 MachineIRBuilder &B) const {
7207 Register Src = MI.getOperand(0).getReg();
7208 if (MRI.getType(Src) != S64)
7209 return false;
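  // Read the mode and trap-status fields with s_getreg and merge them into the
  // 64-bit FP environment value.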
7211 auto ModeReg =
7212 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7213 /*HasSideEffects=*/true, /*isConvergent=*/false)
7214 .addImm(FPEnvModeBitField);
7215 auto TrapReg =
7216 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7217 /*HasSideEffects=*/true, /*isConvergent=*/false)
7218 .addImm(FPEnvTrapBitField);
7219 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7220 MI.eraseFromParent();
7221 return true;
7224 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7225 MachineRegisterInfo &MRI,
7226 MachineIRBuilder &B) const {
7227 Register Src = MI.getOperand(0).getReg();
7228 if (MRI.getType(Src) != S64)
7229 return false;
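  // Split the 64-bit FP environment value and write the two halves back with
  // s_setreg.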
7231 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7232 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7233 /*HasSideEffects=*/true, /*isConvergent=*/false)
7234 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7235 .addReg(Unmerge.getReg(0));
7236 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7237 /*HasSideEffects=*/true, /*isConvergent=*/false)
7238 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7239 .addReg(Unmerge.getReg(1));
7240 MI.eraseFromParent();
7241 return true;
7244 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7245 MachineInstr &MI) const {
7246 MachineIRBuilder &B = Helper.MIRBuilder;
7247 MachineRegisterInfo &MRI = *B.getMRI();
7249   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7250 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7251 switch (IntrID) {
7252 case Intrinsic::amdgcn_if:
7253 case Intrinsic::amdgcn_else: {
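    // verifyCFIntrinsic finds the G_BRCOND that consumes this intrinsic's
    // boolean result; the pair is rewritten into an SI_IF / SI_ELSE pseudo
    // that branches to the unconditional target.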
7254 MachineInstr *Br = nullptr;
7255 MachineBasicBlock *UncondBrTarget = nullptr;
7256 bool Negated = false;
7257 if (MachineInstr *BrCond =
7258 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7259 const SIRegisterInfo *TRI
7260 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7262 Register Def = MI.getOperand(1).getReg();
7263 Register Use = MI.getOperand(3).getReg();
7265 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7267 if (Negated)
7268 std::swap(CondBrTarget, UncondBrTarget);
7270 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7271 if (IntrID == Intrinsic::amdgcn_if) {
7272 B.buildInstr(AMDGPU::SI_IF)
7273 .addDef(Def)
7274 .addUse(Use)
7275 .addMBB(UncondBrTarget);
7276 } else {
7277 B.buildInstr(AMDGPU::SI_ELSE)
7278 .addDef(Def)
7279 .addUse(Use)
7280 .addMBB(UncondBrTarget);
7283 if (Br) {
7284 Br->getOperand(0).setMBB(CondBrTarget);
7285 } else {
7286 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7287 // since we're swapping branch targets it needs to be reinserted.
7288 // FIXME: IRTranslator should probably not do this
7289 B.buildBr(*CondBrTarget);
7292 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7293 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7294 MI.eraseFromParent();
7295 BrCond->eraseFromParent();
7296 return true;
7299 return false;
7301 case Intrinsic::amdgcn_loop: {
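    // Same scheme as amdgcn_if/else: the intrinsic and its G_BRCOND are folded
    // into a single SI_LOOP pseudo.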
7302 MachineInstr *Br = nullptr;
7303 MachineBasicBlock *UncondBrTarget = nullptr;
7304 bool Negated = false;
7305 if (MachineInstr *BrCond =
7306 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7307 const SIRegisterInfo *TRI
7308 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7310 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7311 Register Reg = MI.getOperand(2).getReg();
7313 if (Negated)
7314 std::swap(CondBrTarget, UncondBrTarget);
7316 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7317 B.buildInstr(AMDGPU::SI_LOOP)
7318 .addUse(Reg)
7319 .addMBB(UncondBrTarget);
7321 if (Br)
7322 Br->getOperand(0).setMBB(CondBrTarget);
7323 else
7324 B.buildBr(*CondBrTarget);
7326 MI.eraseFromParent();
7327 BrCond->eraseFromParent();
7328 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7329 return true;
7332 return false;
7334 case Intrinsic::amdgcn_addrspacecast_nonnull:
7335 return legalizeAddrSpaceCast(MI, MRI, B);
7336 case Intrinsic::amdgcn_make_buffer_rsrc:
7337 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7338 case Intrinsic::amdgcn_kernarg_segment_ptr:
7339 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7340 // This only makes sense to call in a kernel, so just lower to null.
7341 B.buildConstant(MI.getOperand(0).getReg(), 0);
7342 MI.eraseFromParent();
7343 return true;
7346 return legalizePreloadedArgIntrin(
7347 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7348 case Intrinsic::amdgcn_implicitarg_ptr:
7349 return legalizeImplicitArgPtr(MI, MRI, B);
7350 case Intrinsic::amdgcn_workitem_id_x:
7351 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7352 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7353 case Intrinsic::amdgcn_workitem_id_y:
7354 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7355 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7356 case Intrinsic::amdgcn_workitem_id_z:
7357 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7358 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7359 case Intrinsic::amdgcn_workgroup_id_x:
7360 return legalizePreloadedArgIntrin(MI, MRI, B,
7361 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7362 case Intrinsic::amdgcn_workgroup_id_y:
7363 return legalizePreloadedArgIntrin(MI, MRI, B,
7364 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7365 case Intrinsic::amdgcn_workgroup_id_z:
7366 return legalizePreloadedArgIntrin(MI, MRI, B,
7367 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7368 case Intrinsic::amdgcn_wave_id:
7369 return legalizeWaveID(MI, B);
7370 case Intrinsic::amdgcn_lds_kernel_id:
7371 return legalizePreloadedArgIntrin(MI, MRI, B,
7372 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7373 case Intrinsic::amdgcn_dispatch_ptr:
7374 return legalizePreloadedArgIntrin(MI, MRI, B,
7375 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7376 case Intrinsic::amdgcn_queue_ptr:
7377 return legalizePreloadedArgIntrin(MI, MRI, B,
7378 AMDGPUFunctionArgInfo::QUEUE_PTR);
7379 case Intrinsic::amdgcn_implicit_buffer_ptr:
7380 return legalizePreloadedArgIntrin(
7381 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7382 case Intrinsic::amdgcn_dispatch_id:
7383 return legalizePreloadedArgIntrin(MI, MRI, B,
7384 AMDGPUFunctionArgInfo::DISPATCH_ID);
7385 case Intrinsic::r600_read_ngroups_x:
7386 // TODO: Emit error for hsa
7387 return legalizeKernargMemParameter(MI, B,
7388 SI::KernelInputOffsets::NGROUPS_X);
7389 case Intrinsic::r600_read_ngroups_y:
7390 return legalizeKernargMemParameter(MI, B,
7391 SI::KernelInputOffsets::NGROUPS_Y);
7392 case Intrinsic::r600_read_ngroups_z:
7393 return legalizeKernargMemParameter(MI, B,
7394 SI::KernelInputOffsets::NGROUPS_Z);
7395 case Intrinsic::r600_read_local_size_x:
7396 // TODO: Could insert G_ASSERT_ZEXT from s16
7397 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7398 case Intrinsic::r600_read_local_size_y:
7399 // TODO: Could insert G_ASSERT_ZEXT from s16
7400 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7401   case Intrinsic::r600_read_local_size_z:
7402     // TODO: Could insert G_ASSERT_ZEXT from s16
7403     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7404 case Intrinsic::r600_read_global_size_x:
7405 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7406 case Intrinsic::r600_read_global_size_y:
7407 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7408 case Intrinsic::r600_read_global_size_z:
7409 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7410 case Intrinsic::amdgcn_fdiv_fast:
7411 return legalizeFDIVFastIntrin(MI, MRI, B);
7412 case Intrinsic::amdgcn_is_shared:
7413 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7414 case Intrinsic::amdgcn_is_private:
7415 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7416 case Intrinsic::amdgcn_wavefrontsize: {
7417 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7418 MI.eraseFromParent();
7419 return true;
7421 case Intrinsic::amdgcn_s_buffer_load:
7422 return legalizeSBufferLoad(Helper, MI);
7423 case Intrinsic::amdgcn_raw_buffer_store:
7424 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7425 case Intrinsic::amdgcn_struct_buffer_store:
7426 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7427 return legalizeBufferStore(MI, Helper, false, false);
7428 case Intrinsic::amdgcn_raw_buffer_store_format:
7429 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7430 case Intrinsic::amdgcn_struct_buffer_store_format:
7431 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7432 return legalizeBufferStore(MI, Helper, false, true);
7433 case Intrinsic::amdgcn_raw_tbuffer_store:
7434 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7435 case Intrinsic::amdgcn_struct_tbuffer_store:
7436 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7437 return legalizeBufferStore(MI, Helper, true, true);
7438 case Intrinsic::amdgcn_raw_buffer_load:
7439 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7440 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7441 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7442 case Intrinsic::amdgcn_struct_buffer_load:
7443 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7444 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7445 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7446 return legalizeBufferLoad(MI, Helper, false, false);
7447 case Intrinsic::amdgcn_raw_buffer_load_format:
7448 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7449 case Intrinsic::amdgcn_struct_buffer_load_format:
7450 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7451 return legalizeBufferLoad(MI, Helper, true, false);
7452 case Intrinsic::amdgcn_raw_tbuffer_load:
7453 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7454 case Intrinsic::amdgcn_struct_tbuffer_load:
7455 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7456 return legalizeBufferLoad(MI, Helper, true, true);
7457 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7458 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7459 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7460 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7461 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7463 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7465 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7466 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7467 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7468 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7469 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7470 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7471 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7472 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7473 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7475 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7476 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7477 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7478 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7479 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7481 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7482 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7483 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7484 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7485 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7486 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7487 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7488 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7489 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7490 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7491 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7492 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7493 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7494 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7495 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7497 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7498 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7499 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7501 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7502 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7503 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7505 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7506 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7507 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7508 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7509 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7510 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7511 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7512 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7513 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7514 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7515 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7516 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7517 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7518 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7519 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7520 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7521 return legalizeBufferAtomic(MI, B, IntrID);
7522 case Intrinsic::amdgcn_rsq_clamp:
7523 return legalizeRsqClampIntrinsic(MI, MRI, B);
7524 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7525 return legalizeBVHIntrinsic(MI, B);
7526 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7527 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7529 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7530 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7532 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7533 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
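    // The sparsity index operand is legalized to 32 bits; extend narrower
    // indices.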
7534 Register Index = MI.getOperand(5).getReg();
7535 LLT S32 = LLT::scalar(32);
7536 if (MRI.getType(Index) != S32)
7537 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7538 return true;
7540 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7542 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7543 Register Index = MI.getOperand(7).getReg();
7544 LLT S32 = LLT::scalar(32);
7545 if (MRI.getType(Index) != S32)
7546 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7547 return true;
7549 case Intrinsic::amdgcn_fmed3: {
7550 GISelChangeObserver &Observer = Helper.Observer;
7552     // FIXME: This works around the inability of TableGen match combiners to
7553     // match intrinsics in patterns.
7554 Observer.changingInstr(MI);
7555 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7556 MI.removeOperand(1);
7557 Observer.changedInstr(MI);
7558 return true;
7560 case Intrinsic::amdgcn_readlane:
7561 case Intrinsic::amdgcn_writelane:
7562 case Intrinsic::amdgcn_readfirstlane:
7563 case Intrinsic::amdgcn_permlane16:
7564 case Intrinsic::amdgcn_permlanex16:
7565 case Intrinsic::amdgcn_permlane64:
7566 case Intrinsic::amdgcn_set_inactive:
7567 case Intrinsic::amdgcn_set_inactive_chain_arg:
7568 case Intrinsic::amdgcn_mov_dpp8:
7569 case Intrinsic::amdgcn_update_dpp:
7570 return legalizeLaneOp(Helper, MI, IntrID);
7571 case Intrinsic::amdgcn_s_buffer_prefetch_data:
7572 return legalizeSBufferPrefetch(Helper, MI);
7573 default: {
7574 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7575 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7576 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7577 return true;
7581 return true;