[ORC] Add std::tuple support to SimplePackedSerialization.
[llvm-project.git] / llvm / lib / Target / AMDGPU / AMDGPULegalizerInfo.cpp
blob71e120974c2e2299e800cf2955179ee81f24ea07
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #define DEBUG_TYPE "amdgpu-legalinfo"
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40 "amdgpu-global-isel-new-legality",
41 cl::desc("Use GlobalISel desired legality, rather than try to use"
42 "rules compatible with selection patterns"),
43 cl::init(false),
44 cl::ReallyHidden);
46 static constexpr unsigned MaxRegisterSize = 1024;
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50 unsigned NElts = Ty.getNumElements();
51 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57 unsigned Bits = Ty.getSizeInBits();
58 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59 return LLT::scalar(Pow2Bits);
62 /// \returs true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66 return [=](const LegalityQuery &Query) {
67 const LLT Ty = Query.Types[TypeIdx];
68 if (!Ty.isVector())
69 return false;
71 const LLT EltTy = Ty.getElementType();
72 const unsigned EltSize = EltTy.getSizeInBits();
73 return Ty.getNumElements() % 2 != 0 &&
74 EltSize > 1 && EltSize < 32 &&
75 Ty.getSizeInBits() % 32 != 0;
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80 return [=](const LegalityQuery &Query) {
81 const LLT Ty = Query.Types[TypeIdx];
82 return Ty.getSizeInBits() % 32 == 0;
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 const LLT EltTy = Ty.getScalarType();
90 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95 return [=](const LegalityQuery &Query) {
96 const LLT Ty = Query.Types[TypeIdx];
97 const LLT EltTy = Ty.getElementType();
98 return std::make_pair(TypeIdx,
99 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104 return [=](const LegalityQuery &Query) {
105 const LLT Ty = Query.Types[TypeIdx];
106 const LLT EltTy = Ty.getElementType();
107 unsigned Size = Ty.getSizeInBits();
108 unsigned Pieces = (Size + 63) / 64;
109 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110 return std::make_pair(
111 TypeIdx,
112 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119 return [=](const LegalityQuery &Query) {
120 const LLT Ty = Query.Types[TypeIdx];
122 const LLT EltTy = Ty.getElementType();
123 const int Size = Ty.getSizeInBits();
124 const int EltSize = EltTy.getSizeInBits();
125 const int NextMul32 = (Size + 31) / 32;
127 assert(EltSize < 32);
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
134 static LLT getBitcastRegisterType(const LLT Ty) {
135 const unsigned Size = Ty.getSizeInBits();
137 LLT CoercedTy;
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
185 static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
189 static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
194 static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
201 static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
208 return true;
211 // Any combination of 32 or 64-bit elements up the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
274 unsigned RegSize = Ty.getSizeInBits();
275 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289 #if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < Size)
292 MemSize = std::max(MemSize, Align);
293 #endif
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
321 assert(RegSize >= MemSize);
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
330 return true;
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // workaround this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note this case when the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 unsigned AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
489 const LLT CodePtr = FlatPtr;
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534 .legalFor({S32, S16, V2S16})
535 .clampScalar(0, S16, S32)
536 .clampMaxNumElements(0, S16, 2)
537 .scalarize(0)
538 .widenScalarToNextPow2(0, 32);
540 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541 .legalFor({S32, S16, V2S16}) // Clamp modifier
542 .minScalarOrElt(0, S16)
543 .clampMaxNumElements(0, S16, 2)
544 .scalarize(0)
545 .widenScalarToNextPow2(0, 32)
546 .lower();
547 } else if (ST.has16BitInsts()) {
548 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
549 .legalFor({S32, S16})
550 .clampScalar(0, S16, S32)
551 .scalarize(0)
552 .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
554 // Technically the saturating operations require clamp bit support, but this
555 // was introduced at the same time as 16-bit operations.
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557 .legalFor({S32, S16}) // Clamp modifier
558 .minScalar(0, S16)
559 .scalarize(0)
560 .widenScalarToNextPow2(0, 16)
561 .lower();
563 // We're just lowering this, but it helps get a better result to try to
564 // coerce to the desired type first.
565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566 .minScalar(0, S16)
567 .scalarize(0)
568 .lower();
569 } else {
570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
571 .legalFor({S32})
572 .clampScalar(0, S32, S32)
573 .scalarize(0);
575 if (ST.hasIntClamp()) {
576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577 .legalFor({S32}) // Clamp modifier.
578 .scalarize(0)
579 .minScalarOrElt(0, S32)
580 .lower();
581 } else {
582 // Clamp bit support was added in VI, along with 16-bit operations.
583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584 .minScalar(0, S32)
585 .scalarize(0)
586 .lower();
589 // FIXME: DAG expansion gets better results. The widening uses the smaller
590 // range values and goes for the min/max lowering directly.
591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592 .minScalar(0, S32)
593 .scalarize(0)
594 .lower();
597 getActionDefinitionsBuilder(
598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
599 .customFor({S32, S64})
600 .clampScalar(0, S32, S64)
601 .widenScalarToNextPow2(0, 32)
602 .scalarize(0);
604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
605 .legalFor({S32})
606 .maxScalarOrElt(0, S32);
608 if (ST.hasVOP3PInsts()) {
609 Mulh
610 .clampMaxNumElements(0, S8, 2)
611 .lowerFor({V2S8});
614 Mulh
615 .scalarize(0)
616 .lower();
618 // Report legal for any types we can handle anywhere. For the cases only legal
619 // on the SALU, RegBankSelect will be able to re-legalize.
620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
622 .clampScalar(0, S32, S64)
623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
625 .widenScalarToNextPow2(0)
626 .scalarize(0);
628 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630 .legalFor({{S32, S1}, {S32, S32}})
631 .minScalar(0, S32)
632 // TODO: .scalarize(0)
633 .lower();
635 getActionDefinitionsBuilder(G_BITCAST)
636 // Don't worry about the size constraint.
637 .legalIf(all(isRegisterType(0), isRegisterType(1)))
638 .lower();
641 getActionDefinitionsBuilder(G_CONSTANT)
642 .legalFor({S1, S32, S64, S16, GlobalPtr,
643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644 .legalIf(isPointer(0))
645 .clampScalar(0, S32, S64)
646 .widenScalarToNextPow2(0);
648 getActionDefinitionsBuilder(G_FCONSTANT)
649 .legalFor({S32, S64, S16})
650 .clampScalar(0, S16, S64);
652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
653 .legalIf(isRegisterType(0))
654 // s1 and s16 are special cases because they have legal operations on
655 // them, but don't really occupy registers in the normal way.
656 .legalFor({S1, S16})
657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
658 .clampScalarOrElt(0, S32, MaxScalar)
659 .widenScalarToNextPow2(0, 32)
660 .clampMaxNumElements(0, S32, 16);
662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
664 // If the amount is divergent, we have to do a wave reduction to get the
665 // maximum value, so this is expanded during RegBankSelect.
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
667 .legalFor({{PrivatePtr, S32}});
669 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670 .customIf(typeIsNot(0, PrivatePtr));
672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
674 auto &FPOpActions = getActionDefinitionsBuilder(
675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
676 .legalFor({S32, S64});
677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
678 .customFor({S32, S64});
679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
680 .customFor({S32, S64});
682 if (ST.has16BitInsts()) {
683 if (ST.hasVOP3PInsts())
684 FPOpActions.legalFor({S16, V2S16});
685 else
686 FPOpActions.legalFor({S16});
688 TrigActions.customFor({S16});
689 FDIVActions.customFor({S16});
692 auto &MinNumMaxNum = getActionDefinitionsBuilder({
693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
695 if (ST.hasVOP3PInsts()) {
696 MinNumMaxNum.customFor(FPTypesPK16)
697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
698 .clampMaxNumElements(0, S16, 2)
699 .clampScalar(0, S16, S64)
700 .scalarize(0);
701 } else if (ST.has16BitInsts()) {
702 MinNumMaxNum.customFor(FPTypes16)
703 .clampScalar(0, S16, S64)
704 .scalarize(0);
705 } else {
706 MinNumMaxNum.customFor(FPTypesBase)
707 .clampScalar(0, S32, S64)
708 .scalarize(0);
711 if (ST.hasVOP3PInsts())
712 FPOpActions.clampMaxNumElements(0, S16, 2);
714 FPOpActions
715 .scalarize(0)
716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
718 TrigActions
719 .scalarize(0)
720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
722 FDIVActions
723 .scalarize(0)
724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
726 getActionDefinitionsBuilder({G_FNEG, G_FABS})
727 .legalFor(FPTypesPK16)
728 .clampMaxNumElements(0, S16, 2)
729 .scalarize(0)
730 .clampScalar(0, S16, S64);
732 if (ST.has16BitInsts()) {
733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
734 .legalFor({S32, S64, S16})
735 .scalarize(0)
736 .clampScalar(0, S16, S64);
737 } else {
738 getActionDefinitionsBuilder(G_FSQRT)
739 .legalFor({S32, S64})
740 .scalarize(0)
741 .clampScalar(0, S32, S64);
743 if (ST.hasFractBug()) {
744 getActionDefinitionsBuilder(G_FFLOOR)
745 .customFor({S64})
746 .legalFor({S32, S64})
747 .scalarize(0)
748 .clampScalar(0, S32, S64);
749 } else {
750 getActionDefinitionsBuilder(G_FFLOOR)
751 .legalFor({S32, S64})
752 .scalarize(0)
753 .clampScalar(0, S32, S64);
757 getActionDefinitionsBuilder(G_FPTRUNC)
758 .legalFor({{S32, S64}, {S16, S32}})
759 .scalarize(0)
760 .lower();
762 getActionDefinitionsBuilder(G_FPEXT)
763 .legalFor({{S64, S32}, {S32, S16}})
764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
765 .scalarize(0);
767 getActionDefinitionsBuilder(G_FSUB)
768 // Use actual fsub instruction
769 .legalFor({S32})
770 // Must use fadd + fneg
771 .lowerFor({S64, S16, V2S16})
772 .scalarize(0)
773 .clampScalar(0, S32, S64);
775 // Whether this is legal depends on the floating point mode for the function.
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
777 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
778 FMad.customFor({S32, S16});
779 else if (ST.hasMadMacF32Insts())
780 FMad.customFor({S32});
781 else if (ST.hasMadF16())
782 FMad.customFor({S16});
783 FMad.scalarize(0)
784 .lower();
786 auto &FRem = getActionDefinitionsBuilder(G_FREM);
787 if (ST.has16BitInsts()) {
788 FRem.customFor({S16, S32, S64});
789 } else {
790 FRem.minScalar(0, S32)
791 .customFor({S32, S64});
793 FRem.scalarize(0);
795 // TODO: Do we need to clamp maximum bitwidth?
796 getActionDefinitionsBuilder(G_TRUNC)
797 .legalIf(isScalar(0))
798 .legalFor({{V2S16, V2S32}})
799 .clampMaxNumElements(0, S16, 2)
800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
801 // situations (like an invalid implicit use), we don't want to infinite loop
802 // in the legalizer.
803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
804 .alwaysLegal();
806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
808 {S32, S1}, {S64, S1}, {S16, S1}})
809 .scalarize(0)
810 .clampScalar(0, S32, S64)
811 .widenScalarToNextPow2(1, 32);
813 // TODO: Split s1->s64 during regbankselect for VALU.
814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
816 .lowerIf(typeIs(1, S1))
817 .customFor({{S32, S64}, {S64, S64}});
818 if (ST.has16BitInsts())
819 IToFP.legalFor({{S16, S16}});
820 IToFP.clampScalar(1, S32, S64)
821 .minScalar(0, S32)
822 .scalarize(0)
823 .widenScalarToNextPow2(1);
825 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
826 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
827 .customFor({{S64, S32}, {S64, S64}})
828 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
829 if (ST.has16BitInsts())
830 FPToI.legalFor({{S16, S16}});
831 else
832 FPToI.minScalar(1, S32);
834 FPToI.minScalar(0, S32)
835 .widenScalarToNextPow2(0, 32)
836 .scalarize(0)
837 .lower();
839 // Lower roundeven into G_FRINT
840 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
841 .scalarize(0)
842 .lower();
844 if (ST.has16BitInsts()) {
845 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
846 .legalFor({S16, S32, S64})
847 .clampScalar(0, S16, S64)
848 .scalarize(0);
849 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
850 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
851 .legalFor({S32, S64})
852 .clampScalar(0, S32, S64)
853 .scalarize(0);
854 } else {
855 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
856 .legalFor({S32})
857 .customFor({S64})
858 .clampScalar(0, S32, S64)
859 .scalarize(0);
862 getActionDefinitionsBuilder(G_PTR_ADD)
863 .legalIf(all(isPointer(0), sameSize(0, 1)))
864 .scalarize(0)
865 .scalarSameSizeAs(1, 0);
867 getActionDefinitionsBuilder(G_PTRMASK)
868 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
869 .scalarSameSizeAs(1, 0)
870 .scalarize(0);
872 auto &CmpBuilder =
873 getActionDefinitionsBuilder(G_ICMP)
874 // The compare output type differs based on the register bank of the output,
875 // so make both s1 and s32 legal.
877 // Scalar compares producing output in scc will be promoted to s32, as that
878 // is the allocatable register type that will be needed for the copy from
879 // scc. This will be promoted during RegBankSelect, and we assume something
880 // before that won't try to use s32 result types.
882 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
883 // bank.
884 .legalForCartesianProduct(
885 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
886 .legalForCartesianProduct(
887 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
888 if (ST.has16BitInsts()) {
889 CmpBuilder.legalFor({{S1, S16}});
892 CmpBuilder
893 .widenScalarToNextPow2(1)
894 .clampScalar(1, S32, S64)
895 .scalarize(0)
896 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
898 getActionDefinitionsBuilder(G_FCMP)
899 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
900 .widenScalarToNextPow2(1)
901 .clampScalar(1, S32, S64)
902 .scalarize(0);
904 // FIXME: fpow has a selection pattern that should move to custom lowering.
905 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
906 if (ST.has16BitInsts())
907 Exp2Ops.legalFor({S32, S16});
908 else
909 Exp2Ops.legalFor({S32});
910 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
911 Exp2Ops.scalarize(0);
913 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
914 if (ST.has16BitInsts())
915 ExpOps.customFor({{S32}, {S16}});
916 else
917 ExpOps.customFor({S32});
918 ExpOps.clampScalar(0, MinScalarFPTy, S32)
919 .scalarize(0);
921 getActionDefinitionsBuilder(G_FPOWI)
922 .clampScalar(0, MinScalarFPTy, S32)
923 .lower();
925 // The 64-bit versions produce 32-bit results, but only on the SALU.
926 getActionDefinitionsBuilder(G_CTPOP)
927 .legalFor({{S32, S32}, {S32, S64}})
928 .clampScalar(0, S32, S32)
929 .clampScalar(1, S32, S64)
930 .scalarize(0)
931 .widenScalarToNextPow2(0, 32)
932 .widenScalarToNextPow2(1, 32);
934 // The hardware instructions return a different result on 0 than the generic
935 // instructions expect. The hardware produces -1, but these produce the
936 // bitwidth.
937 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
938 .scalarize(0)
939 .clampScalar(0, S32, S32)
940 .clampScalar(1, S32, S64)
941 .widenScalarToNextPow2(0, 32)
942 .widenScalarToNextPow2(1, 32)
943 .custom();
945 // The 64-bit versions produce 32-bit results, but only on the SALU.
946 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
947 .legalFor({{S32, S32}, {S32, S64}})
948 .clampScalar(0, S32, S32)
949 .clampScalar(1, S32, S64)
950 .scalarize(0)
951 .widenScalarToNextPow2(0, 32)
952 .widenScalarToNextPow2(1, 32);
954 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
955 // RegBankSelect.
956 getActionDefinitionsBuilder(G_BITREVERSE)
957 .legalFor({S32, S64})
958 .clampScalar(0, S32, S64)
959 .scalarize(0)
960 .widenScalarToNextPow2(0);
962 if (ST.has16BitInsts()) {
963 getActionDefinitionsBuilder(G_BSWAP)
964 .legalFor({S16, S32, V2S16})
965 .clampMaxNumElements(0, S16, 2)
966 // FIXME: Fixing non-power-of-2 before clamp is workaround for
967 // narrowScalar limitation.
968 .widenScalarToNextPow2(0)
969 .clampScalar(0, S16, S32)
970 .scalarize(0);
972 if (ST.hasVOP3PInsts()) {
973 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
974 .legalFor({S32, S16, V2S16})
975 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
976 .clampMaxNumElements(0, S16, 2)
977 .minScalar(0, S16)
978 .widenScalarToNextPow2(0)
979 .scalarize(0)
980 .lower();
981 } else {
982 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
983 .legalFor({S32, S16})
984 .widenScalarToNextPow2(0)
985 .minScalar(0, S16)
986 .scalarize(0)
987 .lower();
989 } else {
990 // TODO: Should have same legality without v_perm_b32
991 getActionDefinitionsBuilder(G_BSWAP)
992 .legalFor({S32})
993 .lowerIf(scalarNarrowerThan(0, 32))
994 // FIXME: Fixing non-power-of-2 before clamp is workaround for
995 // narrowScalar limitation.
996 .widenScalarToNextPow2(0)
997 .maxScalar(0, S32)
998 .scalarize(0)
999 .lower();
1001 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1002 .legalFor({S32})
1003 .minScalar(0, S32)
1004 .widenScalarToNextPow2(0)
1005 .scalarize(0)
1006 .lower();
1009 getActionDefinitionsBuilder(G_INTTOPTR)
1010 // List the common cases
1011 .legalForCartesianProduct(AddrSpaces64, {S64})
1012 .legalForCartesianProduct(AddrSpaces32, {S32})
1013 .scalarize(0)
1014 // Accept any address space as long as the size matches
1015 .legalIf(sameSize(0, 1))
1016 .widenScalarIf(smallerThan(1, 0),
1017 [](const LegalityQuery &Query) {
1018 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1020 .narrowScalarIf(largerThan(1, 0),
1021 [](const LegalityQuery &Query) {
1022 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1025 getActionDefinitionsBuilder(G_PTRTOINT)
1026 // List the common cases
1027 .legalForCartesianProduct(AddrSpaces64, {S64})
1028 .legalForCartesianProduct(AddrSpaces32, {S32})
1029 .scalarize(0)
1030 // Accept any address space as long as the size matches
1031 .legalIf(sameSize(0, 1))
1032 .widenScalarIf(smallerThan(0, 1),
1033 [](const LegalityQuery &Query) {
1034 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036 .narrowScalarIf(
1037 largerThan(0, 1),
1038 [](const LegalityQuery &Query) {
1039 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1042 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1043 .scalarize(0)
1044 .custom();
1046 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1047 bool IsLoad) -> bool {
1048 const LLT DstTy = Query.Types[0];
1050 // Split vector extloads.
1051 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1052 unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1054 if (MemSize < DstTy.getSizeInBits())
1055 MemSize = std::max(MemSize, AlignBits);
1057 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058 return true;
1060 const LLT PtrTy = Query.Types[1];
1061 unsigned AS = PtrTy.getAddressSpace();
1062 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063 return true;
1065 // Catch weird sized loads that don't evenly divide into the access sizes
1066 // TODO: May be able to widen depending on alignment etc.
1067 unsigned NumRegs = (MemSize + 31) / 32;
1068 if (NumRegs == 3) {
1069 if (!ST.hasDwordx3LoadStores())
1070 return true;
1071 } else {
1072 // If the alignment allows, these should have been widened.
1073 if (!isPowerOf2_32(NumRegs))
1074 return true;
1077 if (AlignBits < MemSize) {
1078 const SITargetLowering *TLI = ST.getTargetLowering();
1079 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1080 Align(AlignBits / 8));
1083 return false;
1086 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1087 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1088 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1090 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1091 // LDS
1092 // TODO: Unsupported flat for SI.
1094 for (unsigned Op : {G_LOAD, G_STORE}) {
1095 const bool IsStore = Op == G_STORE;
1097 auto &Actions = getActionDefinitionsBuilder(Op);
1098 // Explicitly list some common cases.
1099 // TODO: Does this help compile time at all?
1100 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1101 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1102 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1103 {S64, GlobalPtr, S64, GlobalAlign32},
1104 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1105 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1106 {S32, GlobalPtr, S8, GlobalAlign8},
1107 {S32, GlobalPtr, S16, GlobalAlign16},
1109 {S32, LocalPtr, S32, 32},
1110 {S64, LocalPtr, S64, 32},
1111 {V2S32, LocalPtr, V2S32, 32},
1112 {S32, LocalPtr, S8, 8},
1113 {S32, LocalPtr, S16, 16},
1114 {V2S16, LocalPtr, S32, 32},
1116 {S32, PrivatePtr, S32, 32},
1117 {S32, PrivatePtr, S8, 8},
1118 {S32, PrivatePtr, S16, 16},
1119 {V2S16, PrivatePtr, S32, 32},
1121 {S32, ConstantPtr, S32, GlobalAlign32},
1122 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1123 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1124 {S64, ConstantPtr, S64, GlobalAlign32},
1125 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1126 Actions.legalIf(
1127 [=](const LegalityQuery &Query) -> bool {
1128 return isLoadStoreLegal(ST, Query);
1131 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1132 // 64-bits.
1134 // TODO: Should generalize bitcast action into coerce, which will also cover
1135 // inserting addrspacecasts.
1136 Actions.customIf(typeIs(1, Constant32Ptr));
1138 // Turn any illegal element vectors into something easier to deal
1139 // with. These will ultimately produce 32-bit scalar shifts to extract the
1140 // parts anyway.
1142 // For odd 16-bit element vectors, prefer to split those into pieces with
1143 // 16-bit vector parts.
1144 Actions.bitcastIf(
1145 [=](const LegalityQuery &Query) -> bool {
1146 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1147 Query.MMODescrs[0].MemoryTy);
1148 }, bitcastToRegisterType(0));
1150 if (!IsStore) {
1151 // Widen suitably aligned loads by loading extra bytes. The standard
1152 // legalization actions can't properly express widening memory operands.
1153 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1154 return shouldWidenLoad(ST, Query, G_LOAD);
1158 // FIXME: load/store narrowing should be moved to lower action
1159 Actions
1160 .narrowScalarIf(
1161 [=](const LegalityQuery &Query) -> bool {
1162 return !Query.Types[0].isVector() &&
1163 needToSplitMemOp(Query, Op == G_LOAD);
1165 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1166 const LLT DstTy = Query.Types[0];
1167 const LLT PtrTy = Query.Types[1];
1169 const unsigned DstSize = DstTy.getSizeInBits();
1170 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1172 // Split extloads.
1173 if (DstSize > MemSize)
1174 return std::make_pair(0, LLT::scalar(MemSize));
1176 if (!isPowerOf2_32(DstSize)) {
1177 // We're probably decomposing an odd sized store. Try to split
1178 // to the widest type. TODO: Account for alignment. As-is it
1179 // should be OK, since the new parts will be further legalized.
1180 unsigned FloorSize = PowerOf2Floor(DstSize);
1181 return std::make_pair(0, LLT::scalar(FloorSize));
1184 if (DstSize > 32 && (DstSize % 32 != 0)) {
1185 // FIXME: Need a way to specify non-extload of larger size if
1186 // suitably aligned.
1187 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1190 unsigned MaxSize = maxSizeForAddrSpace(ST,
1191 PtrTy.getAddressSpace(),
1192 Op == G_LOAD);
1193 if (MemSize > MaxSize)
1194 return std::make_pair(0, LLT::scalar(MaxSize));
1196 unsigned Align = Query.MMODescrs[0].AlignInBits;
1197 return std::make_pair(0, LLT::scalar(Align));
1199 .fewerElementsIf(
1200 [=](const LegalityQuery &Query) -> bool {
1201 return Query.Types[0].isVector() &&
1202 needToSplitMemOp(Query, Op == G_LOAD);
1204 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1205 const LLT DstTy = Query.Types[0];
1206 const LLT PtrTy = Query.Types[1];
1208 LLT EltTy = DstTy.getElementType();
1209 unsigned MaxSize = maxSizeForAddrSpace(ST,
1210 PtrTy.getAddressSpace(),
1211 Op == G_LOAD);
1213 // FIXME: Handle widened to power of 2 results better. This ends
1214 // up scalarizing.
1215 // FIXME: 3 element stores scalarized on SI
1217 // Split if it's too large for the address space.
1218 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1219 if (MemSize > MaxSize) {
1220 unsigned NumElts = DstTy.getNumElements();
1221 unsigned EltSize = EltTy.getSizeInBits();
1223 if (MaxSize % EltSize == 0) {
1224 return std::make_pair(
1225 0, LLT::scalarOrVector(
1226 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1229 unsigned NumPieces = MemSize / MaxSize;
1231 // FIXME: Refine when odd breakdowns handled
1232 // The scalars will need to be re-legalized.
1233 if (NumPieces == 1 || NumPieces >= NumElts ||
1234 NumElts % NumPieces != 0)
1235 return std::make_pair(0, EltTy);
1237 return std::make_pair(
1238 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1241 // FIXME: We could probably handle weird extending loads better.
1242 if (DstTy.getSizeInBits() > MemSize)
1243 return std::make_pair(0, EltTy);
1245 unsigned EltSize = EltTy.getSizeInBits();
1246 unsigned DstSize = DstTy.getSizeInBits();
1247 if (!isPowerOf2_32(DstSize)) {
1248 // We're probably decomposing an odd sized store. Try to split
1249 // to the widest type. TODO: Account for alignment. As-is it
1250 // should be OK, since the new parts will be further legalized.
1251 unsigned FloorSize = PowerOf2Floor(DstSize);
1252 return std::make_pair(
1253 0, LLT::scalarOrVector(
1254 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1257 // Need to split because of alignment.
1258 unsigned Align = Query.MMODescrs[0].AlignInBits;
1259 if (EltSize > Align &&
1260 (EltSize / Align < DstTy.getNumElements())) {
1261 return std::make_pair(
1262 0, LLT::fixed_vector(EltSize / Align, EltTy));
1265 // May need relegalization for the scalars.
1266 return std::make_pair(0, EltTy);
1268 .minScalar(0, S32)
1269 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1270 .widenScalarToNextPow2(0)
1271 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1272 .lower();
1275 // FIXME: Unaligned accesses not lowered.
1276 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1277 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1278 {S32, GlobalPtr, S16, 2 * 8},
1279 {S32, LocalPtr, S8, 8},
1280 {S32, LocalPtr, S16, 16},
1281 {S32, PrivatePtr, S8, 8},
1282 {S32, PrivatePtr, S16, 16},
1283 {S32, ConstantPtr, S8, 8},
1284 {S32, ConstantPtr, S16, 2 * 8}})
1285 .legalIf(
1286 [=](const LegalityQuery &Query) -> bool {
1287 return isLoadStoreLegal(ST, Query);
1290 if (ST.hasFlatAddressSpace()) {
1291 ExtLoads.legalForTypesWithMemDesc(
1292 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1295 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1296 // 64-bits.
1298 // TODO: Should generalize bitcast action into coerce, which will also cover
1299 // inserting addrspacecasts.
1300 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1302 ExtLoads.clampScalar(0, S32, S32)
1303 .widenScalarToNextPow2(0)
1304 .lower();
1306 auto &Atomics = getActionDefinitionsBuilder(
1307 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1308 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1309 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1310 G_ATOMICRMW_UMIN})
1311 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1312 {S64, GlobalPtr}, {S64, LocalPtr},
1313 {S32, RegionPtr}, {S64, RegionPtr}});
1314 if (ST.hasFlatAddressSpace()) {
1315 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1318 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1319 if (ST.hasLDSFPAtomics()) {
1320 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1321 if (ST.hasGFX90AInsts())
1322 Atomic.legalFor({{S64, LocalPtr}});
1324 if (ST.hasAtomicFaddInsts())
1325 Atomic.legalFor({{S32, GlobalPtr}});
1327 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1328 // demarshalling
1329 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1330 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1331 {S32, FlatPtr}, {S64, FlatPtr}})
1332 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1333 {S32, RegionPtr}, {S64, RegionPtr}});
1334 // TODO: Pointer types, any 32-bit or 64-bit vector
1336 // Condition should be s32 for scalar, s1 for vector.
1337 getActionDefinitionsBuilder(G_SELECT)
1338 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1339 LocalPtr, FlatPtr, PrivatePtr,
1340 LLT::fixed_vector(2, LocalPtr),
1341 LLT::fixed_vector(2, PrivatePtr)},
1342 {S1, S32})
1343 .clampScalar(0, S16, S64)
1344 .scalarize(1)
1345 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1346 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1347 .clampMaxNumElements(0, S32, 2)
1348 .clampMaxNumElements(0, LocalPtr, 2)
1349 .clampMaxNumElements(0, PrivatePtr, 2)
1350 .scalarize(0)
1351 .widenScalarToNextPow2(0)
1352 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1354 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1355 // be more flexible with the shift amount type.
1356 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1357 .legalFor({{S32, S32}, {S64, S32}});
1358 if (ST.has16BitInsts()) {
1359 if (ST.hasVOP3PInsts()) {
1360 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1361 .clampMaxNumElements(0, S16, 2);
1362 } else
1363 Shifts.legalFor({{S16, S16}});
1365 // TODO: Support 16-bit shift amounts for all types
1366 Shifts.widenScalarIf(
1367 [=](const LegalityQuery &Query) {
1368 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1369 // 32-bit amount.
1370 const LLT ValTy = Query.Types[0];
1371 const LLT AmountTy = Query.Types[1];
1372 return ValTy.getSizeInBits() <= 16 &&
1373 AmountTy.getSizeInBits() < 16;
1374 }, changeTo(1, S16));
1375 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1376 Shifts.clampScalar(1, S32, S32);
1377 Shifts.clampScalar(0, S16, S64);
1378 Shifts.widenScalarToNextPow2(0, 16);
1380 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1381 .minScalar(0, S16)
1382 .scalarize(0)
1383 .lower();
1384 } else {
1385 // Make sure we legalize the shift amount type first, as the general
1386 // expansion for the shifted type will produce much worse code if it hasn't
1387 // been truncated already.
1388 Shifts.clampScalar(1, S32, S32);
1389 Shifts.clampScalar(0, S32, S64);
1390 Shifts.widenScalarToNextPow2(0, 32);
1392 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1393 .minScalar(0, S32)
1394 .scalarize(0)
1395 .lower();
1397 Shifts.scalarize(0);
1399 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1400 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1401 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1402 unsigned IdxTypeIdx = 2;
1404 getActionDefinitionsBuilder(Op)
1405 .customIf([=](const LegalityQuery &Query) {
1406 const LLT EltTy = Query.Types[EltTypeIdx];
1407 const LLT VecTy = Query.Types[VecTypeIdx];
1408 const LLT IdxTy = Query.Types[IdxTypeIdx];
1409 const unsigned EltSize = EltTy.getSizeInBits();
1410 return (EltSize == 32 || EltSize == 64) &&
1411 VecTy.getSizeInBits() % 32 == 0 &&
1412 VecTy.getSizeInBits() <= MaxRegisterSize &&
1413 IdxTy.getSizeInBits() == 32;
1415 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1416 bitcastToVectorElement32(VecTypeIdx))
1417 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1418 .bitcastIf(
1419 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1420 [=](const LegalityQuery &Query) {
1421 // For > 64-bit element types, try to turn this into a 64-bit
1422 // element vector since we may be able to do better indexing
1423 // if this is scalar. If not, fall back to 32.
1424 const LLT EltTy = Query.Types[EltTypeIdx];
1425 const LLT VecTy = Query.Types[VecTypeIdx];
1426 const unsigned DstEltSize = EltTy.getSizeInBits();
1427 const unsigned VecSize = VecTy.getSizeInBits();
1429 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1430 return std::make_pair(
1431 VecTypeIdx,
1432 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1434 .clampScalar(EltTypeIdx, S32, S64)
1435 .clampScalar(VecTypeIdx, S32, S64)
1436 .clampScalar(IdxTypeIdx, S32, S32)
1437 .clampMaxNumElements(VecTypeIdx, S32, 32)
1438 // TODO: Clamp elements for 64-bit vectors?
1439 // It should only be necessary with variable indexes.
1440 // As a last resort, lower to the stack
1441 .lower();
1444 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1445 .unsupportedIf([=](const LegalityQuery &Query) {
1446 const LLT &EltTy = Query.Types[1].getElementType();
1447 return Query.Types[0] != EltTy;
1450 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1451 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1452 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1454 // FIXME: Doesn't handle extract of illegal sizes.
1455 getActionDefinitionsBuilder(Op)
1456 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1457 // FIXME: Multiples of 16 should not be legal.
1458 .legalIf([=](const LegalityQuery &Query) {
1459 const LLT BigTy = Query.Types[BigTyIdx];
1460 const LLT LitTy = Query.Types[LitTyIdx];
1461 return (BigTy.getSizeInBits() % 32 == 0) &&
1462 (LitTy.getSizeInBits() % 16 == 0);
1464 .widenScalarIf(
1465 [=](const LegalityQuery &Query) {
1466 const LLT BigTy = Query.Types[BigTyIdx];
1467 return (BigTy.getScalarSizeInBits() < 16);
1469 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1470 .widenScalarIf(
1471 [=](const LegalityQuery &Query) {
1472 const LLT LitTy = Query.Types[LitTyIdx];
1473 return (LitTy.getScalarSizeInBits() < 16);
1475 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1476 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1477 .widenScalarToNextPow2(BigTyIdx, 32);
1481 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1482 .legalForCartesianProduct(AllS32Vectors, {S32})
1483 .legalForCartesianProduct(AllS64Vectors, {S64})
1484 .clampNumElements(0, V16S32, V32S32)
1485 .clampNumElements(0, V2S64, V16S64)
1486 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1488 if (ST.hasScalarPackInsts()) {
1489 BuildVector
1490 // FIXME: Should probably widen s1 vectors straight to s32
1491 .minScalarOrElt(0, S16)
1492 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1493 .minScalar(1, S32);
1495 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1496 .legalFor({V2S16, S32})
1497 .lower();
1498 BuildVector.minScalarOrElt(0, S32);
1499 } else {
1500 BuildVector.customFor({V2S16, S16});
1501 BuildVector.minScalarOrElt(0, S32);
1503 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1504 .customFor({V2S16, S32})
1505 .lower();
1508 BuildVector.legalIf(isRegisterType(0));
1510 // FIXME: Clamp maximum size
1511 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1512 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1513 .clampMaxNumElements(0, S32, 32)
1514 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1515 .clampMaxNumElements(0, S16, 64);
1517 // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1518 // pre-legalize.
1519 if (ST.hasVOP3PInsts()) {
1520 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1521 .customFor({V2S16, V2S16})
1522 .lower();
1523 } else
1524 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1526 // Merge/Unmerge
1527 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1528 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1529 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1531 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1532 const LLT Ty = Query.Types[TypeIdx];
1533 if (Ty.isVector()) {
1534 const LLT &EltTy = Ty.getElementType();
1535 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1536 return true;
1537 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1538 return true;
1540 return false;
1543 auto &Builder = getActionDefinitionsBuilder(Op)
1544 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1545 .lowerFor({{S16, V2S16}})
1546 .lowerIf([=](const LegalityQuery &Query) {
1547 const LLT BigTy = Query.Types[BigTyIdx];
1548 return BigTy.getSizeInBits() == 32;
1550 // Try to widen to s16 first for small types.
1551 // TODO: Only do this on targets with legal s16 shifts
1552 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1553 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1554 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1555 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1556 elementTypeIs(1, S16)),
1557 changeTo(1, V2S16))
1558 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1559 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1560 // valid.
1561 .clampScalar(LitTyIdx, S32, S512)
1562 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1563 // Break up vectors with weird elements into scalars
1564 .fewerElementsIf(
1565 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1566 scalarize(0))
1567 .fewerElementsIf(
1568 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1569 scalarize(1))
1570 .clampScalar(BigTyIdx, S32, MaxScalar);
1572 if (Op == G_MERGE_VALUES) {
1573 Builder.widenScalarIf(
1574 // TODO: Use 16-bit shifts if legal for 8-bit values?
1575 [=](const LegalityQuery &Query) {
1576 const LLT Ty = Query.Types[LitTyIdx];
1577 return Ty.getSizeInBits() < 32;
1579 changeTo(LitTyIdx, S32));
1582 Builder.widenScalarIf(
1583 [=](const LegalityQuery &Query) {
1584 const LLT Ty = Query.Types[BigTyIdx];
1585 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1586 Ty.getSizeInBits() % 16 != 0;
1588 [=](const LegalityQuery &Query) {
1589 // Pick the next power of 2, or a multiple of 64 over 128.
1590 // Whichever is smaller.
1591 const LLT &Ty = Query.Types[BigTyIdx];
1592 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1593 if (NewSizeInBits >= 256) {
1594 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1595 if (RoundedTo < NewSizeInBits)
1596 NewSizeInBits = RoundedTo;
1598 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1600 // Any vectors left are the wrong size. Scalarize them.
1601 .scalarize(0)
1602 .scalarize(1);
1605 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1606 // RegBankSelect.
1607 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1608 .legalFor({{S32}, {S64}});
1610 if (ST.hasVOP3PInsts()) {
1611 SextInReg.lowerFor({{V2S16}})
1612 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1613 // get more vector shift opportunities, since we'll get those when
1614 // expanded.
1615 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1616 } else if (ST.has16BitInsts()) {
1617 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1618 } else {
1619 // Prefer to promote to s32 before lowering if we don't have 16-bit
1620 // shifts. This avoid a lot of intermediate truncate and extend operations.
1621 SextInReg.lowerFor({{S32}, {S64}});
1624 SextInReg
1625 .scalarize(0)
1626 .clampScalar(0, S32, S64)
1627 .lower();
1629 // TODO: Only Try to form v2s16 with legal packed instructions.
1630 getActionDefinitionsBuilder(G_FSHR)
1631 .legalFor({{S32, S32}})
1632 .lowerFor({{V2S16, V2S16}})
1633 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1634 .scalarize(0)
1635 .lower();
1637 if (ST.hasVOP3PInsts()) {
1638 getActionDefinitionsBuilder(G_FSHL)
1639 .lowerFor({{V2S16, V2S16}})
1640 .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641 .scalarize(0)
1642 .lower();
1643 } else {
1644 getActionDefinitionsBuilder(G_FSHL)
1645 .scalarize(0)
1646 .lower();
1649 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1650 .legalFor({S64});
1652 getActionDefinitionsBuilder(G_FENCE)
1653 .alwaysLegal();
1655 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1656 .scalarize(0)
1657 .minScalar(0, S32)
1658 .lower();
1660 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1661 .legalFor({{S32, S32}, {S64, S32}})
1662 .clampScalar(1, S32, S32)
1663 .clampScalar(0, S32, S64)
1664 .widenScalarToNextPow2(0)
1665 .scalarize(0);
1667 getActionDefinitionsBuilder({
1668 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1669 G_FCOPYSIGN,
1671 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1672 G_ATOMICRMW_NAND,
1673 G_ATOMICRMW_FSUB,
1674 G_READ_REGISTER,
1675 G_WRITE_REGISTER,
1677 G_SADDO, G_SSUBO,
1679 // TODO: Implement
1680 G_FMINIMUM, G_FMAXIMUM}).lower();
1682 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1683 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1684 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1685 .unsupported();
1687 getLegacyLegalizerInfo().computeTables();
1688 verify(*ST.getInstrInfo());
1691 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1692 MachineInstr &MI) const {
1693 MachineIRBuilder &B = Helper.MIRBuilder;
1694 MachineRegisterInfo &MRI = *B.getMRI();
1696 switch (MI.getOpcode()) {
1697 case TargetOpcode::G_ADDRSPACE_CAST:
1698 return legalizeAddrSpaceCast(MI, MRI, B);
1699 case TargetOpcode::G_FRINT:
1700 return legalizeFrint(MI, MRI, B);
1701 case TargetOpcode::G_FCEIL:
1702 return legalizeFceil(MI, MRI, B);
1703 case TargetOpcode::G_FREM:
1704 return legalizeFrem(MI, MRI, B);
1705 case TargetOpcode::G_INTRINSIC_TRUNC:
1706 return legalizeIntrinsicTrunc(MI, MRI, B);
1707 case TargetOpcode::G_SITOFP:
1708 return legalizeITOFP(MI, MRI, B, true);
1709 case TargetOpcode::G_UITOFP:
1710 return legalizeITOFP(MI, MRI, B, false);
1711 case TargetOpcode::G_FPTOSI:
1712 return legalizeFPTOI(MI, MRI, B, true);
1713 case TargetOpcode::G_FPTOUI:
1714 return legalizeFPTOI(MI, MRI, B, false);
1715 case TargetOpcode::G_FMINNUM:
1716 case TargetOpcode::G_FMAXNUM:
1717 case TargetOpcode::G_FMINNUM_IEEE:
1718 case TargetOpcode::G_FMAXNUM_IEEE:
1719 return legalizeMinNumMaxNum(Helper, MI);
1720 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1721 return legalizeExtractVectorElt(MI, MRI, B);
1722 case TargetOpcode::G_INSERT_VECTOR_ELT:
1723 return legalizeInsertVectorElt(MI, MRI, B);
1724 case TargetOpcode::G_SHUFFLE_VECTOR:
1725 return legalizeShuffleVector(MI, MRI, B);
1726 case TargetOpcode::G_FSIN:
1727 case TargetOpcode::G_FCOS:
1728 return legalizeSinCos(MI, MRI, B);
1729 case TargetOpcode::G_GLOBAL_VALUE:
1730 return legalizeGlobalValue(MI, MRI, B);
1731 case TargetOpcode::G_LOAD:
1732 case TargetOpcode::G_SEXTLOAD:
1733 case TargetOpcode::G_ZEXTLOAD:
1734 return legalizeLoad(Helper, MI);
1735 case TargetOpcode::G_FMAD:
1736 return legalizeFMad(MI, MRI, B);
1737 case TargetOpcode::G_FDIV:
1738 return legalizeFDIV(MI, MRI, B);
1739 case TargetOpcode::G_UDIV:
1740 case TargetOpcode::G_UREM:
1741 case TargetOpcode::G_UDIVREM:
1742 return legalizeUnsignedDIV_REM(MI, MRI, B);
1743 case TargetOpcode::G_SDIV:
1744 case TargetOpcode::G_SREM:
1745 case TargetOpcode::G_SDIVREM:
1746 return legalizeSignedDIV_REM(MI, MRI, B);
1747 case TargetOpcode::G_ATOMIC_CMPXCHG:
1748 return legalizeAtomicCmpXChg(MI, MRI, B);
1749 case TargetOpcode::G_FLOG:
1750 return legalizeFlog(MI, B, numbers::ln2f);
1751 case TargetOpcode::G_FLOG10:
1752 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1753 case TargetOpcode::G_FEXP:
1754 return legalizeFExp(MI, B);
1755 case TargetOpcode::G_FPOW:
1756 return legalizeFPow(MI, B);
1757 case TargetOpcode::G_FFLOOR:
1758 return legalizeFFloor(MI, MRI, B);
1759 case TargetOpcode::G_BUILD_VECTOR:
1760 return legalizeBuildVector(MI, MRI, B);
1761 case TargetOpcode::G_CTLZ:
1762 case TargetOpcode::G_CTTZ:
1763 return legalizeCTLZ_CTTZ(MI, MRI, B);
1764 default:
1765 return false;
1768 llvm_unreachable("expected switch to return");
1771 Register AMDGPULegalizerInfo::getSegmentAperture(
1772 unsigned AS,
1773 MachineRegisterInfo &MRI,
1774 MachineIRBuilder &B) const {
1775 MachineFunction &MF = B.getMF();
1776 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1777 const LLT S32 = LLT::scalar(32);
1779 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1781 if (ST.hasApertureRegs()) {
1782 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1783 // getreg.
1784 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1785 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1786 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1787 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1788 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1789 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1790 unsigned Encoding =
1791 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1792 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1793 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1795 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1797 B.buildInstr(AMDGPU::S_GETREG_B32)
1798 .addDef(GetReg)
1799 .addImm(Encoding);
1800 MRI.setType(GetReg, S32);
1802 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1803 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1806 Register QueuePtr = MRI.createGenericVirtualRegister(
1807 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1809 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1810 return Register();
1812 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1813 // private_segment_aperture_base_hi.
1814 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1816 // TODO: can we be smarter about machine pointer info?
1817 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1818 MachineMemOperand *MMO = MF.getMachineMemOperand(
1819 PtrInfo,
1820 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1821 MachineMemOperand::MOInvariant,
1822 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1824 Register LoadAddr;
1826 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1827 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1830 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1831 MachineInstr &MI, MachineRegisterInfo &MRI,
1832 MachineIRBuilder &B) const {
1833 MachineFunction &MF = B.getMF();
1835 const LLT S32 = LLT::scalar(32);
1836 Register Dst = MI.getOperand(0).getReg();
1837 Register Src = MI.getOperand(1).getReg();
1839 LLT DstTy = MRI.getType(Dst);
1840 LLT SrcTy = MRI.getType(Src);
1841 unsigned DestAS = DstTy.getAddressSpace();
1842 unsigned SrcAS = SrcTy.getAddressSpace();
1844 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1845 // vector element.
1846 assert(!DstTy.isVector());
1848 const AMDGPUTargetMachine &TM
1849 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1851 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1852 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1853 return true;
1856 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1857 // Truncate.
1858 B.buildExtract(Dst, Src, 0);
1859 MI.eraseFromParent();
1860 return true;
1863 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1864 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1865 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1867 // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1868 // another pointer type. Merge operands are required to be the same type,
1869 // but creating an extra ptrtoint would be kind of pointless.
1870 auto HighAddr = B.buildConstant(
1871 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1872 B.buildMerge(Dst, {Src, HighAddr});
1873 MI.eraseFromParent();
1874 return true;
1877 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1878 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1879 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1880 unsigned NullVal = TM.getNullPointerValue(DestAS);
1882 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1883 auto FlatNull = B.buildConstant(SrcTy, 0);
1885 // Extract low 32-bits of the pointer.
1886 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1888 auto CmpRes =
1889 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1890 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1892 MI.eraseFromParent();
1893 return true;
1896 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1897 return false;
1899 if (!ST.hasFlatAddressSpace())
1900 return false;
1902 auto SegmentNull =
1903 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1904 auto FlatNull =
1905 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1907 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1908 if (!ApertureReg.isValid())
1909 return false;
1911 auto CmpRes =
1912 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1914 // Coerce the type of the low half of the result so we can use merge_values.
1915 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1917 // TODO: Should we allow mismatched types but matching sizes in merges to
1918 // avoid the ptrtoint?
1919 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1920 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1922 MI.eraseFromParent();
1923 return true;
1926 bool AMDGPULegalizerInfo::legalizeFrint(
1927 MachineInstr &MI, MachineRegisterInfo &MRI,
1928 MachineIRBuilder &B) const {
1929 Register Src = MI.getOperand(1).getReg();
1930 LLT Ty = MRI.getType(Src);
1931 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
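// Round to the nearest integer using the classic 2^52 trick: adding and then
// subtracting 0x1.0p+52 (carrying the sign of the input) forces the FP unit to
// round away the fractional bits. Inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are already integral, so the original value is
// selected for those instead.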
1933 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1934 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1936 auto C1 = B.buildFConstant(Ty, C1Val);
1937 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1939 // TODO: Should this propagate fast-math-flags?
1940 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1941 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1943 auto C2 = B.buildFConstant(Ty, C2Val);
1944 auto Fabs = B.buildFAbs(Ty, Src);
1946 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1947 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1948 MI.eraseFromParent();
1949 return true;
1952 bool AMDGPULegalizerInfo::legalizeFceil(
1953 MachineInstr &MI, MachineRegisterInfo &MRI,
1954 MachineIRBuilder &B) const {
1956 const LLT S1 = LLT::scalar(1);
1957 const LLT S64 = LLT::scalar(64);
1959 Register Src = MI.getOperand(1).getReg();
1960 assert(MRI.getType(Src) == S64);
1962 // result = trunc(src)
1963 // if (src > 0.0 && src != result)
1964 // result += 1.0
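// The conditional add is implemented without control flow below: Add is 1.0
// when (src > 0.0 && src != trunc(src)) and 0.0 otherwise.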
1966 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1968 const auto Zero = B.buildFConstant(S64, 0.0);
1969 const auto One = B.buildFConstant(S64, 1.0);
1970 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1971 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1972 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1973 auto Add = B.buildSelect(S64, And, One, Zero);
1975 // TODO: Should this propagate fast-math-flags?
1976 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1977 return true;
1980 bool AMDGPULegalizerInfo::legalizeFrem(
1981 MachineInstr &MI, MachineRegisterInfo &MRI,
1982 MachineIRBuilder &B) const {
1983 Register DstReg = MI.getOperand(0).getReg();
1984 Register Src0Reg = MI.getOperand(1).getReg();
1985 Register Src1Reg = MI.getOperand(2).getReg();
1986 auto Flags = MI.getFlags();
1987 LLT Ty = MRI.getType(DstReg);
1989 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1990 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1991 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1992 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1993 MI.eraseFromParent();
1994 return true;
1997 static MachineInstrBuilder extractF64Exponent(Register Hi,
1998 MachineIRBuilder &B) {
1999 const unsigned FractBits = 52;
2000 const unsigned ExpBits = 11;
2001 LLT S32 = LLT::scalar(32);
2003 auto Const0 = B.buildConstant(S32, FractBits - 32);
2004 auto Const1 = B.buildConstant(S32, ExpBits);
2006 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2007 .addUse(Hi)
2008 .addUse(Const0.getReg(0))
2009 .addUse(Const1.getReg(0));
2011 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2014 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2015 MachineInstr &MI, MachineRegisterInfo &MRI,
2016 MachineIRBuilder &B) const {
2017 const LLT S1 = LLT::scalar(1);
2018 const LLT S32 = LLT::scalar(32);
2019 const LLT S64 = LLT::scalar(64);
2021 Register Src = MI.getOperand(1).getReg();
2022 assert(MRI.getType(Src) == S64);
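// Truncation is done by clearing the fraction bits below the exponent: for
// 0 <= exp <= 51 they are masked off directly, for exp < 0 the result is
// +/-0 (just the sign bit), and for exp > 51 the input is already an integer
// and is returned unchanged.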
2024 // TODO: Should this use extract since the low half is unused?
2025 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2026 Register Hi = Unmerge.getReg(1);
2028 // Extract the upper half, since this is where we will find the sign and
2029 // exponent.
2030 auto Exp = extractF64Exponent(Hi, B);
2032 const unsigned FractBits = 52;
2034 // Extract the sign bit.
2035 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2036 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2038 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2040 const auto Zero32 = B.buildConstant(S32, 0);
2042 // Extend back to 64-bits.
2043 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2045 auto Shr = B.buildAShr(S64, FractMask, Exp);
2046 auto Not = B.buildNot(S64, Shr);
2047 auto Tmp0 = B.buildAnd(S64, Src, Not);
2048 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2050 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2051 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2053 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2054 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2055 MI.eraseFromParent();
2056 return true;
2059 bool AMDGPULegalizerInfo::legalizeITOFP(
2060 MachineInstr &MI, MachineRegisterInfo &MRI,
2061 MachineIRBuilder &B, bool Signed) const {
2063 Register Dst = MI.getOperand(0).getReg();
2064 Register Src = MI.getOperand(1).getReg();
2066 const LLT S64 = LLT::scalar(64);
2067 const LLT S32 = LLT::scalar(32);
2069 assert(MRI.getType(Src) == S64);
2071 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2072 auto ThirtyTwo = B.buildConstant(S32, 32);
2074 if (MRI.getType(Dst) == S64) {
2075 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2076 : B.buildUITOFP(S64, Unmerge.getReg(1));
2078 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2080 .addUse(CvtHi.getReg(0))
2081 .addUse(ThirtyTwo.getReg(0));
2083 // TODO: Should this propagate fast-math-flags?
2084 B.buildFAdd(Dst, LdExp, CvtLo);
2085 MI.eraseFromParent();
2086 return true;
2089 assert(MRI.getType(Dst) == S32);
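// For the s64 -> s32 conversion, normalize the input by shifting the
// significant bits to the top, convert the high 32 bits of the shifted value
// (with a sticky bit OR'd in from the low half), and rescale the result with
// ldexp by (32 - shift).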
2091 auto One = B.buildConstant(S32, 1);
2093 MachineInstrBuilder ShAmt;
2094 if (Signed) {
2095 auto ThirtyOne = B.buildConstant(S32, 31);
2096 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2097 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2098 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2099 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2100 /*HasSideEffects=*/false)
2101 .addUse(Unmerge.getReg(1));
2102 auto LS2 = B.buildSub(S32, LS, One);
2103 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2104 } else
2105 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2106 auto Norm = B.buildShl(S64, Src, ShAmt);
2107 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2108 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2109 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2110 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2111 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2112 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2113 /*HasSideEffects=*/false)
2114 .addUse(FVal.getReg(0))
2115 .addUse(Scale.getReg(0));
2116 MI.eraseFromParent();
2117 return true;
2120 // TODO: Copied from DAG implementation. Verify logic and document how this
2121 // actually works.
2122 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2123 MachineRegisterInfo &MRI,
2124 MachineIRBuilder &B,
2125 bool Signed) const {
2127 Register Dst = MI.getOperand(0).getReg();
2128 Register Src = MI.getOperand(1).getReg();
2130 const LLT S64 = LLT::scalar(64);
2131 const LLT S32 = LLT::scalar(32);
2133 const LLT SrcLT = MRI.getType(Src);
2134 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2136 unsigned Flags = MI.getFlags();
2138 // The basic idea of converting a floating point number into a pair of 32-bit
2139 // integers is illustrated as follows:
2141 // tf := trunc(val);
2142 // hif := floor(tf * 2^-32);
2143 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2144 // hi := fptoi(hif);
2145 // lo := fptoi(lof);
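// For example, val = 2^32 + 5 gives tf = 4294967301.0, hif = 1.0 and
// lof = 5.0, so hi = 1 and lo = 5.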
2147 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2148 MachineInstrBuilder Sign;
2149 if (Signed && SrcLT == S32) {
2150 // However, a 32-bit floating point number has only a 23-bit mantissa, which
2151 // is not enough to hold all the significant bits of `lof` if val is
2152 // negative. To avoid the loss of precision, we need to take the absolute
2153 // value after truncating and flip the result back based on the original
2154 // signedness.
2155 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2156 Trunc = B.buildFAbs(S32, Trunc, Flags);
2158 MachineInstrBuilder K0, K1;
2159 if (SrcLT == S64) {
2160 K0 = B.buildFConstant(S64,
2161 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2162 K1 = B.buildFConstant(S64,
2163 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2164 } else {
2165 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2166 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2169 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2170 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2171 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2173 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2174 : B.buildFPTOUI(S32, FloorMul);
2175 auto Lo = B.buildFPTOUI(S32, Fma);
2177 if (Signed && SrcLT == S32) {
2178 // Flip the result based on the signedness, which is either all 0s or 1s.
2179 Sign = B.buildMerge(S64, {Sign, Sign});
2180 // r := xor({lo, hi}, sign) - sign;
2181 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2182 } else
2183 B.buildMerge(Dst, {Lo, Hi});
2184 MI.eraseFromParent();
2186 return true;
2189 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2190 MachineInstr &MI) const {
2191 MachineFunction &MF = Helper.MIRBuilder.getMF();
2192 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2194 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2195 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2197 // With ieee_mode disabled, the instructions already have the correct
2198 // behavior for G_FMINNUM/G_FMAXNUM.
2199 if (!MFI->getMode().IEEE)
2200 return !IsIEEEOp;
2202 if (IsIEEEOp)
2203 return true;
2205 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2208 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2209 MachineInstr &MI, MachineRegisterInfo &MRI,
2210 MachineIRBuilder &B) const {
2211 // TODO: Should move some of this into LegalizerHelper.
2213 // TODO: Promote dynamic indexing of s16 to s32
2215 // FIXME: Artifact combiner probably should have replaced the truncated
2216 // constant before this, so we shouldn't need
2217 // getConstantVRegValWithLookThrough.
2218 Optional<ValueAndVReg> MaybeIdxVal =
2219 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2220 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2221 return true;
2222 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2224 Register Dst = MI.getOperand(0).getReg();
2225 Register Vec = MI.getOperand(1).getReg();
2227 LLT VecTy = MRI.getType(Vec);
2228 LLT EltTy = VecTy.getElementType();
2229 assert(EltTy == MRI.getType(Dst));
2231 if (IdxVal < VecTy.getNumElements())
2232 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2233 else
2234 B.buildUndef(Dst);
2236 MI.eraseFromParent();
2237 return true;
2240 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2241 MachineInstr &MI, MachineRegisterInfo &MRI,
2242 MachineIRBuilder &B) const {
2243 // TODO: Should move some of this into LegalizerHelper.
2245 // TODO: Promote dynamic indexing of s16 to s32
2247 // FIXME: Artifact combiner probably should have replaced the truncated
2248 // constant before this, so we shouldn't need
2249 // getConstantVRegValWithLookThrough.
2250 Optional<ValueAndVReg> MaybeIdxVal =
2251 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2252 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2253 return true;
2255 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256 Register Dst = MI.getOperand(0).getReg();
2257 Register Vec = MI.getOperand(1).getReg();
2258 Register Ins = MI.getOperand(2).getReg();
2260 LLT VecTy = MRI.getType(Vec);
2261 LLT EltTy = VecTy.getElementType();
2262 assert(EltTy == MRI.getType(Ins));
2264 if (IdxVal < VecTy.getNumElements())
2265 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2266 else
2267 B.buildUndef(Dst);
2269 MI.eraseFromParent();
2270 return true;
2273 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2274 MachineInstr &MI, MachineRegisterInfo &MRI,
2275 MachineIRBuilder &B) const {
2276 const LLT V2S16 = LLT::fixed_vector(2, 16);
2278 Register Dst = MI.getOperand(0).getReg();
2279 Register Src0 = MI.getOperand(1).getReg();
2280 LLT DstTy = MRI.getType(Dst);
2281 LLT SrcTy = MRI.getType(Src0);
2283 if (SrcTy == V2S16 && DstTy == V2S16 &&
2284 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2285 return true;
2287 MachineIRBuilder HelperBuilder(MI);
2288 GISelObserverWrapper DummyObserver;
2289 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2290 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2293 bool AMDGPULegalizerInfo::legalizeSinCos(
2294 MachineInstr &MI, MachineRegisterInfo &MRI,
2295 MachineIRBuilder &B) const {
2297 Register DstReg = MI.getOperand(0).getReg();
2298 Register SrcReg = MI.getOperand(1).getReg();
2299 LLT Ty = MRI.getType(DstReg);
2300 unsigned Flags = MI.getFlags();
2302 Register TrigVal;
2303 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2304 if (ST.hasTrigReducedRange()) {
2305 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2306 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2307 .addUse(MulVal.getReg(0))
2308 .setMIFlags(Flags).getReg(0);
2309 } else
2310 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2312 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2313 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2314 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2315 .addUse(TrigVal)
2316 .setMIFlags(Flags);
2317 MI.eraseFromParent();
2318 return true;
2321 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2322 MachineIRBuilder &B,
2323 const GlobalValue *GV,
2324 int64_t Offset,
2325 unsigned GAFlags) const {
2326 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2327 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2328 // to the following code sequence:
2330 // For constant address space:
2331 // s_getpc_b64 s[0:1]
2332 // s_add_u32 s0, s0, $symbol
2333 // s_addc_u32 s1, s1, 0
2335 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2336 // a fixup or relocation is emitted to replace $symbol with a literal
2337 // constant, which is a pc-relative offset from the encoding of the $symbol
2338 // operand to the global variable.
2340 // For global address space:
2341 // s_getpc_b64 s[0:1]
2342 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2343 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2345 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346 // fixups or relocations are emitted to replace $symbol@*@lo and
2347 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2348 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2349 // operand to the global variable.
2351 // What we want here is an offset from the value returned by s_getpc
2352 // (which is the address of the s_add_u32 instruction) to the global
2353 // variable, but since the encoding of $symbol starts 4 bytes after the start
2354 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2355 // small. This requires us to add 4 to the global variable offset in order to
2356 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2357 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2358 // instruction.
2360 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2362 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2363 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2365 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2366 .addDef(PCReg);
2368 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2369 if (GAFlags == SIInstrInfo::MO_NONE)
2370 MIB.addImm(0);
2371 else
2372 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2374 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2376 if (PtrTy.getSizeInBits() == 32)
2377 B.buildExtract(DstReg, PCReg, 0);
2378 return true;
2381 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2382 MachineInstr &MI, MachineRegisterInfo &MRI,
2383 MachineIRBuilder &B) const {
2384 Register DstReg = MI.getOperand(0).getReg();
2385 LLT Ty = MRI.getType(DstReg);
2386 unsigned AS = Ty.getAddressSpace();
2388 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2389 MachineFunction &MF = B.getMF();
2390 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2392 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2393 if (!MFI->isModuleEntryFunction() &&
2394 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2395 const Function &Fn = MF.getFunction();
2396 DiagnosticInfoUnsupported BadLDSDecl(
2397 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2398 DS_Warning);
2399 Fn.getContext().diagnose(BadLDSDecl);
2401 // We currently don't have a way to correctly allocate LDS objects that
2402 // aren't directly associated with a kernel. We do force inlining of
2403 // functions that use local objects. However, if these dead functions are
2404 // not eliminated, we don't want a compile time error. Just emit a warning
2405 // and a trap, since there should be no callable path here.
2406 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2407 B.buildUndef(DstReg);
2408 MI.eraseFromParent();
2409 return true;
2412 // TODO: We could emit code to handle the initialization somewhere.
2413 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2414 const SITargetLowering *TLI = ST.getTargetLowering();
2415 if (!TLI->shouldUseLDSConstAddress(GV)) {
2416 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2417 return true; // Leave in place.
2420 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2421 Type *Ty = GV->getValueType();
2422 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2423 // zero-sized type in other languages to declare dynamic shared
2424 // memory whose size is not known at compile time. Such arrays are
2425 // allocated by the runtime and placed directly after the statically
2426 // allocated ones. They all share the same offset.
2427 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2428 // Adjust alignment for that dynamic shared memory array.
2429 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2430 LLT S32 = LLT::scalar(32);
2431 auto Sz =
2432 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2433 B.buildIntToPtr(DstReg, Sz);
2434 MI.eraseFromParent();
2435 return true;
2439 B.buildConstant(
2440 DstReg,
2441 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2442 MI.eraseFromParent();
2443 return true;
2446 const Function &Fn = MF.getFunction();
2447 DiagnosticInfoUnsupported BadInit(
2448 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2449 Fn.getContext().diagnose(BadInit);
2450 return true;
2453 const SITargetLowering *TLI = ST.getTargetLowering();
2455 if (TLI->shouldEmitFixup(GV)) {
2456 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2457 MI.eraseFromParent();
2458 return true;
2461 if (TLI->shouldEmitPCReloc(GV)) {
2462 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2463 MI.eraseFromParent();
2464 return true;
2467 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2468 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2470 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2471 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2472 MachinePointerInfo::getGOT(MF),
2473 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2474 MachineMemOperand::MOInvariant,
2475 LoadTy, Align(8));
2477 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2479 if (Ty.getSizeInBits() == 32) {
2480 // Truncate if this is a 32-bit constant address.
2481 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2482 B.buildExtract(DstReg, Load, 0);
2483 } else
2484 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2486 MI.eraseFromParent();
2487 return true;
2490 static LLT widenToNextPowerOf2(LLT Ty) {
2491 if (Ty.isVector())
2492 return Ty.changeElementCount(
2493 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2494 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2497 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2498 MachineInstr &MI) const {
2499 MachineIRBuilder &B = Helper.MIRBuilder;
2500 MachineRegisterInfo &MRI = *B.getMRI();
2501 GISelChangeObserver &Observer = Helper.Observer;
2503 Register PtrReg = MI.getOperand(1).getReg();
2504 LLT PtrTy = MRI.getType(PtrReg);
2505 unsigned AddrSpace = PtrTy.getAddressSpace();
2507 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2508 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2509 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2510 Observer.changingInstr(MI);
2511 MI.getOperand(1).setReg(Cast.getReg(0));
2512 Observer.changedInstr(MI);
2513 return true;
2516 if (MI.getOpcode() != AMDGPU::G_LOAD)
2517 return false;
2519 Register ValReg = MI.getOperand(0).getReg();
2520 LLT ValTy = MRI.getType(ValReg);
2522 MachineMemOperand *MMO = *MI.memoperands_begin();
2523 const unsigned ValSize = ValTy.getSizeInBits();
2524 const LLT MemTy = MMO->getMemoryType();
2525 const Align MemAlign = MMO->getAlign();
2526 const unsigned MemSize = MemTy.getSizeInBits();
2527 const unsigned AlignInBits = 8 * MemAlign.value();
2529 // Widen non-power-of-2 loads to the alignment if needed
2530 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2531 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2533 // This was already the correct extending load result type, so just adjust
2534 // the memory type.
2535 if (WideMemSize == ValSize) {
2536 MachineFunction &MF = B.getMF();
2538 MachineMemOperand *WideMMO =
2539 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2540 Observer.changingInstr(MI);
2541 MI.setMemRefs(MF, {WideMMO});
2542 Observer.changedInstr(MI);
2543 return true;
2546 // Don't bother handling an edge case that should probably never be produced.
2547 if (ValSize > WideMemSize)
2548 return false;
2550 LLT WideTy = widenToNextPowerOf2(ValTy);
2552 Register WideLoad;
2553 if (!WideTy.isVector()) {
2554 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2555 B.buildTrunc(ValReg, WideLoad).getReg(0);
2556 } else {
2557 // Extract the subvector.
2559 if (isRegisterType(ValTy)) {
2560 // If this a case where G_EXTRACT is legal, use it.
2561 // (e.g. <3 x s32> -> <4 x s32>)
2562 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2563 B.buildExtract(ValReg, WideLoad, 0);
2564 } else {
2565 // For cases where the widened type isn't a nice register value, unmerge
2566 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2567 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2568 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2569 B.setInsertPt(B.getMBB(), MI.getIterator());
2570 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2574 MI.eraseFromParent();
2575 return true;
2578 return false;
2581 bool AMDGPULegalizerInfo::legalizeFMad(
2582 MachineInstr &MI, MachineRegisterInfo &MRI,
2583 MachineIRBuilder &B) const {
2584 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2585 assert(Ty.isScalar());
2587 MachineFunction &MF = B.getMF();
2588 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2590 // TODO: Always legal with future ftz flag.
2591 // FIXME: Do we need just output?
2592 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2593 return true;
2594 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2595 return true;
2597 MachineIRBuilder HelperBuilder(MI);
2598 GISelObserverWrapper DummyObserver;
2599 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2600 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2603 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2604 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2605 Register DstReg = MI.getOperand(0).getReg();
2606 Register PtrReg = MI.getOperand(1).getReg();
2607 Register CmpVal = MI.getOperand(2).getReg();
2608 Register NewVal = MI.getOperand(3).getReg();
2610 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2611 "this should not have been custom lowered");
2613 LLT ValTy = MRI.getType(CmpVal);
2614 LLT VecTy = LLT::fixed_vector(2, ValTy);
2616 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2618 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2619 .addDef(DstReg)
2620 .addUse(PtrReg)
2621 .addUse(PackedVal)
2622 .setMemRefs(MI.memoperands());
2624 MI.eraseFromParent();
2625 return true;
2628 bool AMDGPULegalizerInfo::legalizeFlog(
2629 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2630 Register Dst = MI.getOperand(0).getReg();
2631 Register Src = MI.getOperand(1).getReg();
2632 LLT Ty = B.getMRI()->getType(Dst);
2633 unsigned Flags = MI.getFlags();
2635 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2636 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2638 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2639 MI.eraseFromParent();
2640 return true;
2643 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2644 MachineIRBuilder &B) const {
2645 Register Dst = MI.getOperand(0).getReg();
2646 Register Src = MI.getOperand(1).getReg();
2647 unsigned Flags = MI.getFlags();
2648 LLT Ty = B.getMRI()->getType(Dst);
2650 auto K = B.buildFConstant(Ty, numbers::log2e);
2651 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2652 B.buildFExp2(Dst, Mul, Flags);
2653 MI.eraseFromParent();
2654 return true;
2657 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2658 MachineIRBuilder &B) const {
2659 Register Dst = MI.getOperand(0).getReg();
2660 Register Src0 = MI.getOperand(1).getReg();
2661 Register Src1 = MI.getOperand(2).getReg();
2662 unsigned Flags = MI.getFlags();
2663 LLT Ty = B.getMRI()->getType(Dst);
2664 const LLT S16 = LLT::scalar(16);
2665 const LLT S32 = LLT::scalar(32);
2667 if (Ty == S32) {
2668 auto Log = B.buildFLog2(S32, Src0, Flags);
2669 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2670 .addUse(Log.getReg(0))
2671 .addUse(Src1)
2672 .setMIFlags(Flags);
2673 B.buildFExp2(Dst, Mul, Flags);
2674 } else if (Ty == S16) {
2675 // There's no f16 fmul_legacy, so we need to convert for it.
2676 auto Log = B.buildFLog2(S16, Src0, Flags);
2677 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2678 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2679 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680 .addUse(Ext0.getReg(0))
2681 .addUse(Ext1.getReg(0))
2682 .setMIFlags(Flags);
2684 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2685 } else
2686 return false;
2688 MI.eraseFromParent();
2689 return true;
2692 // Find a source register, ignoring any possible source modifiers.
2693 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2694 Register ModSrc = OrigSrc;
2695 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2696 ModSrc = SrcFNeg->getOperand(1).getReg();
2697 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2698 ModSrc = SrcFAbs->getOperand(1).getReg();
2699 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2700 ModSrc = SrcFAbs->getOperand(1).getReg();
2701 return ModSrc;
2704 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2705 MachineRegisterInfo &MRI,
2706 MachineIRBuilder &B) const {
2708 const LLT S1 = LLT::scalar(1);
2709 const LLT S64 = LLT::scalar(64);
2710 Register Dst = MI.getOperand(0).getReg();
2711 Register OrigSrc = MI.getOperand(1).getReg();
2712 unsigned Flags = MI.getFlags();
2713 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2714 "this should not have been custom lowered");
2716 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2717 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2718 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2719 // V_FRACT bug is:
2720 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2722 // Convert floor(x) to (x - fract(x))
2724 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2725 .addUse(OrigSrc)
2726 .setMIFlags(Flags);
2728 // Give source modifier matching some assistance before obscuring a foldable
2729 // pattern.
2731 // TODO: We can avoid the neg on the fract? The input sign to fract
2732 // shouldn't matter?
2733 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2735 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
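// 0x3fefffffffffffff is the largest double strictly less than 1.0, matching
// the 0.99999999999999999 clamp in the workaround formula above.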
2737 Register Min = MRI.createGenericVirtualRegister(S64);
2739 // We don't need to concern ourselves with the snan handling difference, so
2740 // use the one which will directly select.
2741 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2742 if (MFI->getMode().IEEE)
2743 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2744 else
2745 B.buildFMinNum(Min, Fract, Const, Flags);
2747 Register CorrectedFract = Min;
2748 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2749 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2750 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2753 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2754 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2756 MI.eraseFromParent();
2757 return true;
2760 // Turn an illegal packed v2s16 build vector into bit operations.
2761 // TODO: This should probably be a bitcast action in LegalizerHelper.
2762 bool AMDGPULegalizerInfo::legalizeBuildVector(
2763 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2764 Register Dst = MI.getOperand(0).getReg();
2765 const LLT S32 = LLT::scalar(32);
2766 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2768 Register Src0 = MI.getOperand(1).getReg();
2769 Register Src1 = MI.getOperand(2).getReg();
2770 assert(MRI.getType(Src0) == LLT::scalar(16));
2772 auto Merge = B.buildMerge(S32, {Src0, Src1});
2773 B.buildBitcast(Dst, Merge);
2775 MI.eraseFromParent();
2776 return true;
2779 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2780 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2781 // case with a single min instruction instead of a compare+select.
2782 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2783 MachineRegisterInfo &MRI,
2784 MachineIRBuilder &B) const {
2785 Register Dst = MI.getOperand(0).getReg();
2786 Register Src = MI.getOperand(1).getReg();
2787 LLT DstTy = MRI.getType(Dst);
2788 LLT SrcTy = MRI.getType(Src);
2790 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2791 ? AMDGPU::G_AMDGPU_FFBH_U32
2792 : AMDGPU::G_AMDGPU_FFBL_B32;
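// FFBH/FFBL return -1 (all ones) when no bit is found, so clamping with umin
// against the source bit width below produces the value G_CTLZ/G_CTTZ require
// for a zero input.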
2793 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2794 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2796 MI.eraseFromParent();
2797 return true;
2800 // Check that this is a G_XOR x, -1
2801 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2802 if (MI.getOpcode() != TargetOpcode::G_XOR)
2803 return false;
2804 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2805 return ConstVal && *ConstVal == -1;
2808 // Return the use branch instruction, otherwise null if the usage is invalid.
2809 static MachineInstr *
2810 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2811 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2812 Register CondDef = MI.getOperand(0).getReg();
2813 if (!MRI.hasOneNonDBGUse(CondDef))
2814 return nullptr;
2816 MachineBasicBlock *Parent = MI.getParent();
2817 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2819 if (isNot(MRI, *UseMI)) {
2820 Register NegatedCond = UseMI->getOperand(0).getReg();
2821 if (!MRI.hasOneNonDBGUse(NegatedCond))
2822 return nullptr;
2824 // We're deleting the def of this value, so we need to remove it.
2825 UseMI->eraseFromParent();
2827 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2828 Negated = true;
2831 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2832 return nullptr;
2834 // Make sure the cond br is followed by a G_BR, or is the last instruction.
2835 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2836 if (Next == Parent->end()) {
2837 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2838 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2839 return nullptr;
2840 UncondBrTarget = &*NextMBB;
2841 } else {
2842 if (Next->getOpcode() != AMDGPU::G_BR)
2843 return nullptr;
2844 Br = &*Next;
2845 UncondBrTarget = Br->getOperand(0).getMBB();
2848 return UseMI;
2851 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2852 const ArgDescriptor *Arg,
2853 const TargetRegisterClass *ArgRC,
2854 LLT ArgTy) const {
2855 MCRegister SrcReg = Arg->getRegister();
2856 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2857 assert(DstReg.isVirtual() && "Virtual register expected");
2859 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2860 ArgTy);
2861 if (Arg->isMasked()) {
2862 // TODO: Should we try to emit this once in the entry block?
2863 const LLT S32 = LLT::scalar(32);
2864 const unsigned Mask = Arg->getMask();
2865 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
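// Masked arguments are bitfields packed into one physical register: shift the
// field down to bit 0, then mask off the bits above it.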
2867 Register AndMaskSrc = LiveIn;
2869 if (Shift != 0) {
2870 auto ShiftAmt = B.buildConstant(S32, Shift);
2871 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2874 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2875 } else {
2876 B.buildCopy(DstReg, LiveIn);
2879 return true;
2882 bool AMDGPULegalizerInfo::loadInputValue(
2883 Register DstReg, MachineIRBuilder &B,
2884 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2885 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2886 const ArgDescriptor *Arg;
2887 const TargetRegisterClass *ArgRC;
2888 LLT ArgTy;
2889 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2891 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2892 return false; // TODO: Handle these
2893 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2896 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2897 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2898 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2899 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2900 return false;
2902 MI.eraseFromParent();
2903 return true;
2906 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2907 MachineRegisterInfo &MRI,
2908 MachineIRBuilder &B) const {
2909 Register Dst = MI.getOperand(0).getReg();
2910 LLT DstTy = MRI.getType(Dst);
2911 LLT S16 = LLT::scalar(16);
2912 LLT S32 = LLT::scalar(32);
2913 LLT S64 = LLT::scalar(64);
2915 if (DstTy == S16)
2916 return legalizeFDIV16(MI, MRI, B);
2917 if (DstTy == S32)
2918 return legalizeFDIV32(MI, MRI, B);
2919 if (DstTy == S64)
2920 return legalizeFDIV64(MI, MRI, B);
2922 return false;
2925 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2926 Register DstDivReg,
2927 Register DstRemReg,
2928 Register X,
2929 Register Y) const {
2930 const LLT S1 = LLT::scalar(1);
2931 const LLT S32 = LLT::scalar(32);
2933 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2934 // algorithm used here.
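// In short: estimate z ~= 2^32 / y from a floating-point reciprocal, refine it
// with one Newton-Raphson step, take q = umulh(x, z) as the quotient estimate,
// and then correct the quotient and remainder with at most two compare+select
// adjustments.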
2936 // Initial estimate of inv(y).
2937 auto FloatY = B.buildUITOFP(S32, Y);
2938 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2939 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2940 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2941 auto Z = B.buildFPTOUI(S32, ScaledY);
2943 // One round of UNR (unsigned integer Newton-Raphson) refinement.
2944 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2945 auto NegYZ = B.buildMul(S32, NegY, Z);
2946 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2948 // Quotient/remainder estimate.
2949 auto Q = B.buildUMulH(S32, X, Z);
2950 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2952 // First quotient/remainder refinement.
2953 auto One = B.buildConstant(S32, 1);
2954 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2955 if (DstDivReg)
2956 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2957 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2959 // Second quotient/remainder refinement.
2960 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2961 if (DstDivReg)
2962 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2964 if (DstRemReg)
2965 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2968 // Build integer reciprocal sequence around V_RCP_IFLAG_F32.
2970 // Return lo, hi of result
2972 // %cvt.lo = G_UITOFP Val.lo
2973 // %cvt.hi = G_UITOFP Val.hi
2974 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2975 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2976 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2977 // %mul2 = G_FMUL %mul1, 2**(-32)
2978 // %trunc = G_INTRINSIC_TRUNC %mul2
2979 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2980 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
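// The returned pair is a fixed-point approximation of 2**64 / Val, split into
// 32-bit halves; it seeds the Newton-Raphson refinement in the 64-bit division
// expansion below.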
2981 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2982 Register Val) {
2983 const LLT S32 = LLT::scalar(32);
2984 auto Unmerge = B.buildUnmerge(S32, Val);
2986 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2987 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2989 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2990 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2992 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2993 auto Mul1 =
2994 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2996 // 2**(-32)
2997 auto Mul2 =
2998 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2999 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3001 // -(2**32)
3002 auto Mad2 = B.buildFMAD(S32, Trunc,
3003 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3005 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3006 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3008 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3011 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3012 Register DstDivReg,
3013 Register DstRemReg,
3014 Register Numer,
3015 Register Denom) const {
3016 const LLT S32 = LLT::scalar(32);
3017 const LLT S64 = LLT::scalar(64);
3018 const LLT S1 = LLT::scalar(1);
3019 Register RcpLo, RcpHi;
3021 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3023 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3025 auto Zero64 = B.buildConstant(S64, 0);
3026 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3028 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3029 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3031 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3032 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3033 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3035 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3036 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3037 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3038 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3040 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3041 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3042 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3043 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3044 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3046 auto Zero32 = B.buildConstant(S32, 0);
3047 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3048 auto Add2_HiC =
3049 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3050 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3051 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3053 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3054 Register NumerLo = UnmergeNumer.getReg(0);
3055 Register NumerHi = UnmergeNumer.getReg(1);
3057 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3058 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3059 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3060 Register Mul3_Lo = UnmergeMul3.getReg(0);
3061 Register Mul3_Hi = UnmergeMul3.getReg(1);
3062 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3063 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3064 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3065 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3067 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3068 Register DenomLo = UnmergeDenom.getReg(0);
3069 Register DenomHi = UnmergeDenom.getReg(1);
3071 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3072 auto C1 = B.buildSExt(S32, CmpHi);
3074 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3075 auto C2 = B.buildSExt(S32, CmpLo);
3077 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3078 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3080 // TODO: Here and below, portions of the code can be enclosed in if/endif.
3081 // Currently the control flow is unconditional and we have 4 selects after
3082 // the potential endif to substitute PHIs.
3084 // if C3 != 0 ...
3085 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3086 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3087 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3088 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3090 auto One64 = B.buildConstant(S64, 1);
3091 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3093 auto C4 =
3094 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3095 auto C5 =
3096 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3097 auto C6 = B.buildSelect(
3098 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3100 // if (C6 != 0)
3101 auto Add4 = B.buildAdd(S64, Add3, One64);
3102 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3104 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3105 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3106 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3108 // endif C6
3109 // endif C3
3111 if (DstDivReg) {
3112 auto Sel1 = B.buildSelect(
3113 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3114 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3115 Sel1, MulHi3);
3118 if (DstRemReg) {
3119 auto Sel2 = B.buildSelect(
3120 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3121 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3122 Sel2, Sub1);
3126 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3127 MachineRegisterInfo &MRI,
3128 MachineIRBuilder &B) const {
3129 Register DstDivReg, DstRemReg;
3130 switch (MI.getOpcode()) {
3131 default:
3132 llvm_unreachable("Unexpected opcode!");
3133 case AMDGPU::G_UDIV: {
3134 DstDivReg = MI.getOperand(0).getReg();
3135 break;
3137 case AMDGPU::G_UREM: {
3138 DstRemReg = MI.getOperand(0).getReg();
3139 break;
3141 case AMDGPU::G_UDIVREM: {
3142 DstDivReg = MI.getOperand(0).getReg();
3143 DstRemReg = MI.getOperand(1).getReg();
3144 break;
3148 const LLT S64 = LLT::scalar(64);
3149 const LLT S32 = LLT::scalar(32);
3150 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3151 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3152 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3153 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3155 if (Ty == S32)
3156 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3157 else if (Ty == S64)
3158 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3159 else
3160 return false;
3162 MI.eraseFromParent();
3163 return true;
3166 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3167 MachineRegisterInfo &MRI,
3168 MachineIRBuilder &B) const {
3169 const LLT S64 = LLT::scalar(64);
3170 const LLT S32 = LLT::scalar(32);
3172 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3173 if (Ty != S32 && Ty != S64)
3174 return false;
3176 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3177 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3178 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3180 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3181 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3182 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
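// Branchless absolute value: with sign = x >> (bits - 1) (arithmetic shift),
// (x + sign) ^ sign == |x|. The proper signs are re-applied to the
// quotient/remainder results below.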
3184 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3185 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3187 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3188 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3190 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3191 switch (MI.getOpcode()) {
3192 default:
3193 llvm_unreachable("Unexpected opcode!");
3194 case AMDGPU::G_SDIV: {
3195 DstDivReg = MI.getOperand(0).getReg();
3196 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3197 break;
3199 case AMDGPU::G_SREM: {
3200 DstRemReg = MI.getOperand(0).getReg();
3201 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3202 break;
3204 case AMDGPU::G_SDIVREM: {
3205 DstDivReg = MI.getOperand(0).getReg();
3206 DstRemReg = MI.getOperand(1).getReg();
3207 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3208 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3209 break;
3213 if (Ty == S32)
3214 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3215 else
3216 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3218 if (DstDivReg) {
3219 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3220 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3221 B.buildSub(DstDivReg, SignXor, Sign);
3224 if (DstRemReg) {
3225 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3226 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3227 B.buildSub(DstRemReg, SignXor, Sign);
3230 MI.eraseFromParent();
3231 return true;
3234 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3235 MachineRegisterInfo &MRI,
3236 MachineIRBuilder &B) const {
3237 Register Res = MI.getOperand(0).getReg();
3238 Register LHS = MI.getOperand(1).getReg();
3239 Register RHS = MI.getOperand(2).getReg();
3240 uint16_t Flags = MI.getFlags();
3241 LLT ResTy = MRI.getType(Res);
3243 const MachineFunction &MF = B.getMF();
3244 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3245 MI.getFlag(MachineInstr::FmAfn);
3247 if (!AllowInaccurateRcp)
3248 return false;
3250 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3251 // 1 / x -> RCP(x)
3252 if (CLHS->isExactlyValue(1.0)) {
3253 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3254 .addUse(RHS)
3255 .setMIFlags(Flags);
3257 MI.eraseFromParent();
3258 return true;
3261 // -1 / x -> RCP( FNEG(x) )
3262 if (CLHS->isExactlyValue(-1.0)) {
3263 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3264 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3265 .addUse(FNeg.getReg(0))
3266 .setMIFlags(Flags);
3268 MI.eraseFromParent();
3269 return true;
3273 // x / y -> x * (1.0 / y)
3274 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3275 .addUse(RHS)
3276 .setMIFlags(Flags);
3277 B.buildFMul(Res, LHS, RCP, Flags);
3279 MI.eraseFromParent();
3280 return true;
3283 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3284 MachineRegisterInfo &MRI,
3285 MachineIRBuilder &B) const {
3286 Register Res = MI.getOperand(0).getReg();
3287 Register X = MI.getOperand(1).getReg();
3288 Register Y = MI.getOperand(2).getReg();
3289 uint16_t Flags = MI.getFlags();
3290 LLT ResTy = MRI.getType(Res);
3292 const MachineFunction &MF = B.getMF();
3293 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3294 MI.getFlag(MachineInstr::FmAfn);
3296 if (!AllowInaccurateRcp)
3297 return false;
3299 auto NegY = B.buildFNeg(ResTy, Y);
3300 auto One = B.buildFConstant(ResTy, 1.0);
3302 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3303 .addUse(Y)
3304 .setMIFlags(Flags);
3306 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3307 R = B.buildFMA(ResTy, Tmp0, R, R);
3309 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3310 R = B.buildFMA(ResTy, Tmp1, R, R);
3312 auto Ret = B.buildFMul(ResTy, X, R);
3313 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3315 B.buildFMA(Res, Tmp2, R, Ret);
3316 MI.eraseFromParent();
3317 return true;
3320 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3321 MachineRegisterInfo &MRI,
3322 MachineIRBuilder &B) const {
3323 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3324 return true;
3326 Register Res = MI.getOperand(0).getReg();
3327 Register LHS = MI.getOperand(1).getReg();
3328 Register RHS = MI.getOperand(2).getReg();
3330 uint16_t Flags = MI.getFlags();
3332 LLT S16 = LLT::scalar(16);
3333 LLT S32 = LLT::scalar(32);
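  // The f16 quotient below is computed in f32: extend both operands, multiply
  // the numerator by rcp of the denominator, truncate back to f16, and let
  // amdgcn.div.fixup handle the special cases.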
3335 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3336 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3338 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3339 .addUse(RHSExt.getReg(0))
3340 .setMIFlags(Flags);
3342 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3343 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3345 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3346 .addUse(RDst.getReg(0))
3347 .addUse(RHS)
3348 .addUse(LHS)
3349 .setMIFlags(Flags);
3351 MI.eraseFromParent();
3352 return true;
3355 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3356 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3357 static void toggleSPDenormMode(bool Enable,
3358 MachineIRBuilder &B,
3359 const GCNSubtarget &ST,
3360 AMDGPU::SIModeRegisterDefaults Mode) {
3361 // Set SP denorm mode to this value.
3362 unsigned SPDenormMode =
3363 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3365 if (ST.hasDenormModeInst()) {
3366 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3367 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3369 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3370 B.buildInstr(AMDGPU::S_DENORM_MODE)
3371 .addImm(NewDenormModeValue);
3373 } else {
3374 // Select FP32 bit field in mode register.
3375 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3376 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3377 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
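    // The operand built above selects a 2-bit field of the MODE register at
    // offset 4 (WIDTH_M1 = 1 means width 2), which should be the FP32 denorm
    // control bits.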
3379 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3380 .addImm(SPDenormMode)
3381 .addImm(SPDenormModeBitField);
3385 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3386 MachineRegisterInfo &MRI,
3387 MachineIRBuilder &B) const {
3388 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3389 return true;
3391 Register Res = MI.getOperand(0).getReg();
3392 Register LHS = MI.getOperand(1).getReg();
3393 Register RHS = MI.getOperand(2).getReg();
3394 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3395 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3397 uint16_t Flags = MI.getFlags();
3399 LLT S32 = LLT::scalar(32);
3400 LLT S1 = LLT::scalar(1);
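  // The expansion below follows the usual AMDGPU f32 division pattern:
  // div_scale both operands, refine rcp of the scaled denominator with a chain
  // of fmas, combine the pieces with div_fmas (keyed off the scale's condition
  // output), and post-process with div_fixup.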
3402 auto One = B.buildFConstant(S32, 1.0f);
3404 auto DenominatorScaled =
3405 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3406 .addUse(LHS)
3407 .addUse(RHS)
3408 .addImm(0)
3409 .setMIFlags(Flags);
3410 auto NumeratorScaled =
3411 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3412 .addUse(LHS)
3413 .addUse(RHS)
3414 .addImm(1)
3415 .setMIFlags(Flags);
3417 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3418 .addUse(DenominatorScaled.getReg(0))
3419 .setMIFlags(Flags);
3420 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3422 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3423 // aren't modeled as reading it.
3424 if (!Mode.allFP32Denormals())
3425 toggleSPDenormMode(true, B, ST, Mode);
3427 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3428 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3429 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3430 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3431 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3432 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3434 if (!Mode.allFP32Denormals())
3435 toggleSPDenormMode(false, B, ST, Mode);
3437 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3438 .addUse(Fma4.getReg(0))
3439 .addUse(Fma1.getReg(0))
3440 .addUse(Fma3.getReg(0))
3441 .addUse(NumeratorScaled.getReg(1))
3442 .setMIFlags(Flags);
3444 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3445 .addUse(Fmas.getReg(0))
3446 .addUse(RHS)
3447 .addUse(LHS)
3448 .setMIFlags(Flags);
3450 MI.eraseFromParent();
3451 return true;
3454 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3455 MachineRegisterInfo &MRI,
3456 MachineIRBuilder &B) const {
3457 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3458 return true;
3460 Register Res = MI.getOperand(0).getReg();
3461 Register LHS = MI.getOperand(1).getReg();
3462 Register RHS = MI.getOperand(2).getReg();
3464 uint16_t Flags = MI.getFlags();
3466 LLT S64 = LLT::scalar(64);
3467 LLT S1 = LLT::scalar(1);
3469 auto One = B.buildFConstant(S64, 1.0);
3471 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3472 .addUse(LHS)
3473 .addUse(RHS)
3474 .addImm(0)
3475 .setMIFlags(Flags);
3477 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3479 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3480 .addUse(DivScale0.getReg(0))
3481 .setMIFlags(Flags);
3483 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3484 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3485 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3487 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3488 .addUse(LHS)
3489 .addUse(RHS)
3490 .addImm(1)
3491 .setMIFlags(Flags);
3493 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3494 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3495 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3497 Register Scale;
3498 if (!ST.hasUsableDivScaleConditionOutput()) {
3499 // Workaround a hardware bug on SI where the condition output from div_scale
3500 // is not usable.
3502 LLT S32 = LLT::scalar(32);
3504 auto NumUnmerge = B.buildUnmerge(S32, LHS);
3505 auto DenUnmerge = B.buildUnmerge(S32, RHS);
3506 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3507 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3509 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3510 Scale1Unmerge.getReg(1));
3511 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3512 Scale0Unmerge.getReg(1));
3513 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3514 } else {
3515 Scale = DivScale1.getReg(1);
3518 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3519 .addUse(Fma4.getReg(0))
3520 .addUse(Fma3.getReg(0))
3521 .addUse(Mul.getReg(0))
3522 .addUse(Scale)
3523 .setMIFlags(Flags);
3525 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3526 .addUse(Fmas.getReg(0))
3527 .addUse(RHS)
3528 .addUse(LHS)
3529 .setMIFlags(Flags);
3531 MI.eraseFromParent();
3532 return true;
3535 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3536 MachineRegisterInfo &MRI,
3537 MachineIRBuilder &B) const {
3538 Register Res = MI.getOperand(0).getReg();
3539 Register LHS = MI.getOperand(2).getReg();
3540 Register RHS = MI.getOperand(3).getReg();
3541 uint16_t Flags = MI.getFlags();
3543 LLT S32 = LLT::scalar(32);
3544 LLT S1 = LLT::scalar(1);
3546 auto Abs = B.buildFAbs(S32, RHS, Flags);
3547 const APFloat C0Val(1.0f);
3549 auto C0 = B.buildConstant(S32, 0x6f800000);
3550 auto C1 = B.buildConstant(S32, 0x2f800000);
3551 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3553 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3554 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3556 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3558 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3559 .addUse(Mul0.getReg(0))
3560 .setMIFlags(Flags);
3562 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3564 B.buildFMul(Res, Sel, Mul1, Flags);
3566 MI.eraseFromParent();
3567 return true;
3570 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3571 // FIXME: Why do we handle this one but not other removed instructions?
3573 // Reciprocal square root. The clamp prevents infinite results, clamping
3574 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3575 // +-max_float.
3576 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3577 MachineRegisterInfo &MRI,
3578 MachineIRBuilder &B) const {
3579 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3580 return true;
3582 Register Dst = MI.getOperand(0).getReg();
3583 Register Src = MI.getOperand(2).getReg();
3584 auto Flags = MI.getFlags();
3586 LLT Ty = MRI.getType(Dst);
3588 const fltSemantics *FltSemantics;
3589 if (Ty == LLT::scalar(32))
3590 FltSemantics = &APFloat::IEEEsingle();
3591 else if (Ty == LLT::scalar(64))
3592 FltSemantics = &APFloat::IEEEdouble();
3593 else
3594 return false;
3596 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3597 .addUse(Src)
3598 .setMIFlags(Flags);
3600 // We don't need to concern ourselves with the snan handling difference,
3601 // whether or not the rsq quiets it, so use the variant that will directly select.
3602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3603 const bool UseIEEE = MFI->getMode().IEEE;
3605 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3606 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3607 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3609 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3611 if (UseIEEE)
3612 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3613 else
3614 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3615 MI.eraseFromParent();
3616 return true;
3619 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3620 switch (IID) {
3621 case Intrinsic::amdgcn_ds_fadd:
3622 return AMDGPU::G_ATOMICRMW_FADD;
3623 case Intrinsic::amdgcn_ds_fmin:
3624 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3625 case Intrinsic::amdgcn_ds_fmax:
3626 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3627 default:
3628 llvm_unreachable("not a DS FP intrinsic");
3632 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3633 MachineInstr &MI,
3634 Intrinsic::ID IID) const {
3635 GISelChangeObserver &Observer = Helper.Observer;
3636 Observer.changingInstr(MI);
3638 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3640 // The remaining operands were used to set fields in the MemOperand on
3641 // construction.
3642 for (int I = 6; I > 3; --I)
3643 MI.RemoveOperand(I);
3645 MI.RemoveOperand(1); // Remove the intrinsic ID.
3646 Observer.changedInstr(MI);
3647 return true;
3650 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3651 MachineRegisterInfo &MRI,
3652 MachineIRBuilder &B) const {
3653 uint64_t Offset =
3654 ST.getTargetLowering()->getImplicitParameterOffset(
3655 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3656 LLT DstTy = MRI.getType(DstReg);
3657 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3659 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3660 if (!loadInputValue(KernargPtrReg, B,
3661 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3662 return false;
3664 // FIXME: This should be nuw
3665 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3666 return true;
3669 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3670 MachineRegisterInfo &MRI,
3671 MachineIRBuilder &B) const {
3672 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3673 if (!MFI->isEntryFunction()) {
3674 return legalizePreloadedArgIntrin(MI, MRI, B,
3675 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3678 Register DstReg = MI.getOperand(0).getReg();
3679 if (!getImplicitArgPtr(DstReg, MRI, B))
3680 return false;
3682 MI.eraseFromParent();
3683 return true;
3686 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3687 MachineRegisterInfo &MRI,
3688 MachineIRBuilder &B,
3689 unsigned AddrSpace) const {
3690 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3691 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3692 Register Hi32 = Unmerge.getReg(1);
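  // A flat pointer belongs to the queried segment exactly when its high 32
  // bits match that segment's aperture base, which is what the compare below
  // checks.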
3694 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3695 MI.eraseFromParent();
3696 return true;
3699 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3700 // offset (the offset that is included in bounds checking and swizzling, to be
3701 // split between the instruction's voffset and immoffset fields) and soffset
3702 // (the offset that is excluded from bounds checking and swizzling, to go in
3703 // the instruction's soffset field). This function takes the first kind of
3704 // offset and figures out how to split it between voffset and immoffset.
3705 std::pair<Register, unsigned>
3706 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3707 Register OrigOffset) const {
3708 const unsigned MaxImm = 4095;
3709 Register BaseReg;
3710 unsigned ImmOffset;
3711 const LLT S32 = LLT::scalar(32);
3712 MachineRegisterInfo &MRI = *B.getMRI();
3714 std::tie(BaseReg, ImmOffset) =
3715 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3717 // If BaseReg is a pointer, convert it to int.
3718 if (MRI.getType(BaseReg).isPointer())
3719 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3721 // If the immediate value is too big for the immoffset field, put the value
3722 // and -4096 into the immoffset field so that the value that is copied/added
3723 // for the voffset field is a multiple of 4096, and it stands more chance
3724 // of being CSEd with the copy/add for another similar load/store.
3725 // However, do not do that rounding down to a multiple of 4096 if that is a
3726 // negative number, as it appears to be illegal to have a negative offset
3727 // in the vgpr, even if adding the immediate offset makes it positive.
3728 unsigned Overflow = ImmOffset & ~MaxImm;
3729 ImmOffset -= Overflow;
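  // For example, a combined constant offset of 8200 splits into
  // Overflow = 8192 (folded into the voffset register below) and
  // ImmOffset = 8, which fits the 12-bit immediate field.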
3730 if ((int32_t)Overflow < 0) {
3731 Overflow += ImmOffset;
3732 ImmOffset = 0;
3735 if (Overflow != 0) {
3736 if (!BaseReg) {
3737 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3738 } else {
3739 auto OverflowVal = B.buildConstant(S32, Overflow);
3740 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3744 if (!BaseReg)
3745 BaseReg = B.buildConstant(S32, 0).getReg(0);
3747 return std::make_pair(BaseReg, ImmOffset);
3750 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3751 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3752 Register VOffset, Register SOffset,
3753 unsigned ImmOffset, Register VIndex,
3754 MachineRegisterInfo &MRI) const {
3755 Optional<ValueAndVReg> MaybeVOffsetVal =
3756 getConstantVRegValWithLookThrough(VOffset, MRI);
3757 Optional<ValueAndVReg> MaybeSOffsetVal =
3758 getConstantVRegValWithLookThrough(SOffset, MRI);
3759 Optional<ValueAndVReg> MaybeVIndexVal =
3760 getConstantVRegValWithLookThrough(VIndex, MRI);
3761 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3762 // update the MMO with that offset. The stride is unknown so we can only do
3763 // this if VIndex is constant 0.
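  // (E.g. constant voffset 16, soffset 4, immoffset 12 and vindex 0 give a
  // combined MMO offset of 32.)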
3764 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3765 MaybeVIndexVal->Value == 0) {
3766 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3767 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3768 MMO->setOffset(TotalOffset);
3769 } else {
3770 // We don't have a constant combined offset to use in the MMO. Give up.
3771 MMO->setValue((Value *)nullptr);
3775 /// Handle register layout difference for f16 images for some subtargets.
3776 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3777 MachineRegisterInfo &MRI,
3778 Register Reg,
3779 bool ImageStore) const {
3780 const LLT S16 = LLT::scalar(16);
3781 const LLT S32 = LLT::scalar(32);
3782 LLT StoreVT = MRI.getType(Reg);
3783 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
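  // On subtargets with the unpacked d16 layout, each 16-bit element is assumed
  // to occupy the low half of its own 32-bit register, e.g. <4 x s16> data is
  // widened to <4 x s32> with the payload in bits [15:0] of each lane.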
3785 if (ST.hasUnpackedD16VMem()) {
3786 auto Unmerge = B.buildUnmerge(S16, Reg);
3788 SmallVector<Register, 4> WideRegs;
3789 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3790 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3792 int NumElts = StoreVT.getNumElements();
3794 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3795 .getReg(0);
3798 if (ImageStore && ST.hasImageStoreD16Bug()) {
3799 if (StoreVT.getNumElements() == 2) {
3800 SmallVector<Register, 4> PackedRegs;
3801 Reg = B.buildBitcast(S32, Reg).getReg(0);
3802 PackedRegs.push_back(Reg);
3803 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3804 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3805 .getReg(0);
3808 if (StoreVT.getNumElements() == 3) {
3809 SmallVector<Register, 4> PackedRegs;
3810 auto Unmerge = B.buildUnmerge(S16, Reg);
3811 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3812 PackedRegs.push_back(Unmerge.getReg(I));
3813 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3814 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3815 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3818 if (StoreVT.getNumElements() == 4) {
3819 SmallVector<Register, 4> PackedRegs;
3820 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3821 auto Unmerge = B.buildUnmerge(S32, Reg);
3822 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3823 PackedRegs.push_back(Unmerge.getReg(I));
3824 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3825 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3826 .getReg(0);
3829 llvm_unreachable("invalid data type");
3832 return Reg;
3835 Register AMDGPULegalizerInfo::fixStoreSourceType(
3836 MachineIRBuilder &B, Register VData, bool IsFormat) const {
3837 MachineRegisterInfo *MRI = B.getMRI();
3838 LLT Ty = MRI->getType(VData);
3840 const LLT S16 = LLT::scalar(16);
3842 // Fixup illegal register types for i8 stores.
3843 if (Ty == LLT::scalar(8) || Ty == S16) {
3844 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3845 return AnyExt;
3848 if (Ty.isVector()) {
3849 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3850 if (IsFormat)
3851 return handleD16VData(B, *MRI, VData);
3855 return VData;
3858 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3859 MachineRegisterInfo &MRI,
3860 MachineIRBuilder &B,
3861 bool IsTyped,
3862 bool IsFormat) const {
3863 Register VData = MI.getOperand(1).getReg();
3864 LLT Ty = MRI.getType(VData);
3865 LLT EltTy = Ty.getScalarType();
3866 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3867 const LLT S32 = LLT::scalar(32);
3869 VData = fixStoreSourceType(B, VData, IsFormat);
3870 Register RSrc = MI.getOperand(2).getReg();
3872 MachineMemOperand *MMO = *MI.memoperands_begin();
3873 const int MemSize = MMO->getSize();
3875 unsigned ImmOffset;
3877 // The typed intrinsics add an immediate after the registers.
3878 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3880 // The struct intrinsic variants add one additional operand over raw.
3881 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3882 Register VIndex;
3883 int OpOffset = 0;
3884 if (HasVIndex) {
3885 VIndex = MI.getOperand(3).getReg();
3886 OpOffset = 1;
3887 } else {
3888 VIndex = B.buildConstant(S32, 0).getReg(0);
3891 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3892 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3894 unsigned Format = 0;
3895 if (IsTyped) {
3896 Format = MI.getOperand(5 + OpOffset).getImm();
3897 ++OpOffset;
3900 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3902 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3903 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3905 unsigned Opc;
3906 if (IsTyped) {
3907 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3908 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3909 } else if (IsFormat) {
3910 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3911 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3912 } else {
3913 switch (MemSize) {
3914 case 1:
3915 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3916 break;
3917 case 2:
3918 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3919 break;
3920 default:
3921 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3922 break;
3926 auto MIB = B.buildInstr(Opc)
3927 .addUse(VData) // vdata
3928 .addUse(RSrc) // rsrc
3929 .addUse(VIndex) // vindex
3930 .addUse(VOffset) // voffset
3931 .addUse(SOffset) // soffset
3932 .addImm(ImmOffset); // offset(imm)
3934 if (IsTyped)
3935 MIB.addImm(Format);
3937 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3938 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3939 .addMemOperand(MMO);
3941 MI.eraseFromParent();
3942 return true;
3945 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3946 MachineRegisterInfo &MRI,
3947 MachineIRBuilder &B,
3948 bool IsFormat,
3949 bool IsTyped) const {
3950 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3951 MachineMemOperand *MMO = *MI.memoperands_begin();
3952 const LLT MemTy = MMO->getMemoryType();
3953 const LLT S32 = LLT::scalar(32);
3955 Register Dst = MI.getOperand(0).getReg();
3956 Register RSrc = MI.getOperand(2).getReg();
3958 // The typed intrinsics add an immediate after the registers.
3959 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3961 // The struct intrinsic variants add one additional operand over raw.
3962 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3963 Register VIndex;
3964 int OpOffset = 0;
3965 if (HasVIndex) {
3966 VIndex = MI.getOperand(3).getReg();
3967 OpOffset = 1;
3968 } else {
3969 VIndex = B.buildConstant(S32, 0).getReg(0);
3972 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3973 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3975 unsigned Format = 0;
3976 if (IsTyped) {
3977 Format = MI.getOperand(5 + OpOffset).getImm();
3978 ++OpOffset;
3981 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3982 unsigned ImmOffset;
3984 LLT Ty = MRI.getType(Dst);
3985 LLT EltTy = Ty.getScalarType();
3986 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3987 const bool Unpacked = ST.hasUnpackedD16VMem();
3989 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3990 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3992 unsigned Opc;
3994 if (IsTyped) {
3995 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3996 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3997 } else if (IsFormat) {
3998 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3999 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4000 } else {
4001 switch (MemTy.getSizeInBits()) {
4002 case 8:
4003 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4004 break;
4005 case 16:
4006 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4007 break;
4008 default:
4009 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4010 break;
4014 Register LoadDstReg;
4016 bool IsExtLoad =
4017 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4018 LLT UnpackedTy = Ty.changeElementSize(32);
4020 if (IsExtLoad)
4021 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4022 else if (Unpacked && IsD16 && Ty.isVector())
4023 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4024 else
4025 LoadDstReg = Dst;
4027 auto MIB = B.buildInstr(Opc)
4028 .addDef(LoadDstReg) // vdata
4029 .addUse(RSrc) // rsrc
4030 .addUse(VIndex) // vindex
4031 .addUse(VOffset) // voffset
4032 .addUse(SOffset) // soffset
4033 .addImm(ImmOffset); // offset(imm)
4035 if (IsTyped)
4036 MIB.addImm(Format);
4038 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4039 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4040 .addMemOperand(MMO);
4042 if (LoadDstReg != Dst) {
4043 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4045 // The result was widened for the extending load; truncate back to the original type.
4046 if (IsExtLoad)
4047 B.buildTrunc(Dst, LoadDstReg);
4048 else {
4049 // Repack to original 16-bit vector result
4050 // FIXME: G_TRUNC should work, but legalization currently fails
4051 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4052 SmallVector<Register, 4> Repack;
4053 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4054 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4055 B.buildMerge(Dst, Repack);
4059 MI.eraseFromParent();
4060 return true;
4063 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4064 MachineIRBuilder &B,
4065 bool IsInc) const {
4066 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4067 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4068 B.buildInstr(Opc)
4069 .addDef(MI.getOperand(0).getReg())
4070 .addUse(MI.getOperand(2).getReg())
4071 .addUse(MI.getOperand(3).getReg())
4072 .cloneMemRefs(MI);
4073 MI.eraseFromParent();
4074 return true;
4077 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4078 switch (IntrID) {
4079 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4080 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4081 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4082 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4083 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4085 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4086 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4087 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4088 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4089 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4090 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4091 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4092 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4093 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4094 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4095 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4096 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4097 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4098 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4100 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4101 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4102 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4103 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4104 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4105 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4106 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4107 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4108 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4109 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4110 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4111 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4112 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4113 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4115 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4116 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4117 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4118 case Intrinsic::amdgcn_buffer_atomic_fadd:
4119 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4120 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4121 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4122 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4123 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4125 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4126 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4127 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4128 default:
4129 llvm_unreachable("unhandled atomic opcode");
4133 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4134 MachineIRBuilder &B,
4135 Intrinsic::ID IID) const {
4136 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4137 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4138 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4140 Register Dst;
4142 int OpOffset = 0;
4143 if (HasReturn) {
4144 // A few FP atomics do not support return values.
4145 Dst = MI.getOperand(0).getReg();
4146 } else {
4147 OpOffset = -1;
4150 Register VData = MI.getOperand(2 + OpOffset).getReg();
4151 Register CmpVal;
4153 if (IsCmpSwap) {
4154 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4155 ++OpOffset;
4158 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4159 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4161 // The struct intrinsic variants add one additional operand over raw.
4162 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4163 Register VIndex;
4164 if (HasVIndex) {
4165 VIndex = MI.getOperand(4 + OpOffset).getReg();
4166 ++OpOffset;
4167 } else {
4168 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4171 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4172 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4173 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4175 MachineMemOperand *MMO = *MI.memoperands_begin();
4177 unsigned ImmOffset;
4178 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4179 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4181 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4183 if (HasReturn)
4184 MIB.addDef(Dst);
4186 MIB.addUse(VData); // vdata
4188 if (IsCmpSwap)
4189 MIB.addReg(CmpVal);
4191 MIB.addUse(RSrc) // rsrc
4192 .addUse(VIndex) // vindex
4193 .addUse(VOffset) // voffset
4194 .addUse(SOffset) // soffset
4195 .addImm(ImmOffset) // offset(imm)
4196 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4197 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4198 .addMemOperand(MMO);
4200 MI.eraseFromParent();
4201 return true;
4204 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4205 /// vector with s16 typed elements.
4206 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4207 SmallVectorImpl<Register> &PackedAddrs,
4208 unsigned ArgOffset,
4209 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4210 bool IsA16, bool IsG16) {
4211 const LLT S16 = LLT::scalar(16);
4212 const LLT V2S16 = LLT::fixed_vector(2, 16);
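  // For example, three s16 coordinates (u, v, w) end up as the pair
  // <2 x s16>(u, v) followed by <2 x s16>(w, undef).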
4213 auto EndIdx = Intr->VAddrEnd;
4215 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4216 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4217 if (!SrcOp.isReg())
4218 continue; // _L to _LZ may have eliminated this.
4220 Register AddrReg = SrcOp.getReg();
4222 if ((I < Intr->GradientStart) ||
4223 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4224 (I >= Intr->CoordStart && !IsA16)) {
4225 // Handle any gradient or coordinate operands that should not be packed
4226 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4227 PackedAddrs.push_back(AddrReg);
4228 } else {
4229 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4230 // derivatives dx/dh and dx/dv are packed with undef.
4231 if (((I + 1) >= EndIdx) ||
4232 ((Intr->NumGradients / 2) % 2 == 1 &&
4233 (I == static_cast<unsigned>(Intr->GradientStart +
4234 (Intr->NumGradients / 2) - 1) ||
4235 I == static_cast<unsigned>(Intr->GradientStart +
4236 Intr->NumGradients - 1))) ||
4237 // Check for _L to _LZ optimization
4238 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4239 PackedAddrs.push_back(
4240 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4241 .getReg(0));
4242 } else {
4243 PackedAddrs.push_back(
4244 B.buildBuildVector(
4245 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4246 .getReg(0));
4247 ++I;
4253 /// Convert from separate vaddr components to a single vector address register,
4254 /// and replace the remaining operands with $noreg.
4255 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4256 int DimIdx, int NumVAddrs) {
4257 const LLT S32 = LLT::scalar(32);
4259 SmallVector<Register, 8> AddrRegs;
4260 for (int I = 0; I != NumVAddrs; ++I) {
4261 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4262 if (SrcOp.isReg()) {
4263 AddrRegs.push_back(SrcOp.getReg());
4264 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4268 int NumAddrRegs = AddrRegs.size();
4269 if (NumAddrRegs != 1) {
4270 // Above 8 elements round up to next power of 2 (i.e. 16).
4271 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4272 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4273 auto Undef = B.buildUndef(S32);
4274 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4275 NumAddrRegs = RoundedNumRegs;
4278 auto VAddr =
4279 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4280 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4283 for (int I = 1; I != NumVAddrs; ++I) {
4284 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4285 if (SrcOp.isReg())
4286 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4290 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4292 /// Depending on the subtarget, load/store with 16-bit element data need to be
4293 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4294 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4295 /// registers.
4297 /// We don't want to directly select image instructions just yet, but also want
4298 /// to expose all register repacking to the legalizer/combiners. We also don't
4299 /// want a selected instruction entering RegBankSelect. In order to avoid
4300 /// defining a multitude of intermediate image instructions, directly hack on
4301 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4302 /// now unnecessary arguments with $noreg.
4303 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4304 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4305 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4307 const unsigned NumDefs = MI.getNumExplicitDefs();
4308 const unsigned ArgOffset = NumDefs + 1;
4309 bool IsTFE = NumDefs == 2;
4310 // We are only processing the operands of d16 image operations on subtargets
4311 // that use the unpacked register layout, or need to repack the TFE result.
4313 // TODO: Do we need to guard against already legalized intrinsics?
4314 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4315 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4317 MachineRegisterInfo *MRI = B.getMRI();
4318 const LLT S32 = LLT::scalar(32);
4319 const LLT S16 = LLT::scalar(16);
4320 const LLT V2S16 = LLT::fixed_vector(2, 16);
4322 unsigned DMask = 0;
4324 // Check for 16 bit addresses and pack if true.
4325 LLT GradTy =
4326 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4327 LLT AddrTy =
4328 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4329 const bool IsG16 = GradTy == S16;
4330 const bool IsA16 = AddrTy == S16;
4332 int DMaskLanes = 0;
4333 if (!BaseOpcode->Atomic) {
4334 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4335 if (BaseOpcode->Gather4) {
4336 DMaskLanes = 4;
4337 } else if (DMask != 0) {
4338 DMaskLanes = countPopulation(DMask);
4339 } else if (!IsTFE && !BaseOpcode->Store) {
4340 // If dmask is 0, this is a no-op load. This can be eliminated.
4341 B.buildUndef(MI.getOperand(0));
4342 MI.eraseFromParent();
4343 return true;
4347 Observer.changingInstr(MI);
4348 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4350 unsigned NewOpcode = NumDefs == 0 ?
4351 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4353 // Track that we legalized this
4354 MI.setDesc(B.getTII().get(NewOpcode));
4356 // We expect to get an error flag since TFE is on and dmask is 0. Force
4357 // dmask to be at least 1, otherwise the instruction will fail.
4358 if (IsTFE && DMask == 0) {
4359 DMask = 0x1;
4360 DMaskLanes = 1;
4361 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4364 if (BaseOpcode->Atomic) {
4365 Register VData0 = MI.getOperand(2).getReg();
4366 LLT Ty = MRI->getType(VData0);
4368 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4369 if (Ty.isVector())
4370 return false;
4372 if (BaseOpcode->AtomicX2) {
4373 Register VData1 = MI.getOperand(3).getReg();
4374 // The two values are packed in one register.
4375 LLT PackedTy = LLT::fixed_vector(2, Ty);
4376 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4377 MI.getOperand(2).setReg(Concat.getReg(0));
4378 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4382 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4384 // Optimize _L to _LZ when _L is zero
4385 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4386 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4387 const ConstantFP *ConstantLod;
4389 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4390 m_GFCst(ConstantLod))) {
4391 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4392 // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4393 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4394 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4395 Intr->Dim);
4397 // The starting indexes should remain in the same place.
4398 --CorrectedNumVAddrs;
4400 MI.getOperand(MI.getNumExplicitDefs())
4401 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4402 MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4403 Intr = NewImageDimIntr;
4408 // Optimize _mip away when 'lod' is zero.
4409 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4410 int64_t ConstantLod;
4411 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4412 m_ICst(ConstantLod))) {
4413 if (ConstantLod == 0) {
4414 // TODO: Change the intrinsic opcode and remove the operand instead of
4415 // replacing it with 0, as is done for the _L to _LZ handling above.
4416 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4417 --CorrectedNumVAddrs;
4422 // Rewrite the addressing register layout before doing anything else.
4423 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4424 // 16 bit gradients are supported, but are tied to the A16 control
4425 // so both gradients and addresses must be 16 bit
4426 return false;
4429 if (IsA16 && !ST.hasA16()) {
4430 // A16 not supported
4431 return false;
4434 if (IsA16 || IsG16) {
4435 if (Intr->NumVAddrs > 1) {
4436 SmallVector<Register, 4> PackedRegs;
4438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4439 IsG16);
4441 // See also below in the non-a16 branch
4442 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4443 PackedRegs.size() <= ST.getNSAMaxSize();
4445 if (!UseNSA && PackedRegs.size() > 1) {
4446 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4447 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4448 PackedRegs[0] = Concat.getReg(0);
4449 PackedRegs.resize(1);
4452 const unsigned NumPacked = PackedRegs.size();
4453 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4454 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4455 if (!SrcOp.isReg()) {
4456 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4457 continue;
4460 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4462 if (I - Intr->VAddrStart < NumPacked)
4463 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4464 else
4465 SrcOp.setReg(AMDGPU::NoRegister);
4468 } else {
4469 // If the register allocator cannot place the address registers contiguously
4470 // without introducing moves, then using the non-sequential address encoding
4471 // is always preferable, since it saves VALU instructions and is usually a
4472 // wash in terms of code size or even better.
4474 // However, we currently have no way of hinting to the register allocator
4475 // that MIMG addresses should be placed contiguously when it is possible to
4476 // do so, so force non-NSA for the common 2-address case as a heuristic.
4478 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4479 // allocation when possible.
4480 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4481 CorrectedNumVAddrs <= ST.getNSAMaxSize();
4483 if (!UseNSA && Intr->NumVAddrs > 1)
4484 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4485 Intr->NumVAddrs);
4488 int Flags = 0;
4489 if (IsA16)
4490 Flags |= 1;
4491 if (IsG16)
4492 Flags |= 2;
4493 MI.addOperand(MachineOperand::CreateImm(Flags));
4495 if (BaseOpcode->Store) { // No TFE for stores?
4496 // TODO: Handle dmask trim
4497 Register VData = MI.getOperand(1).getReg();
4498 LLT Ty = MRI->getType(VData);
4499 if (!Ty.isVector() || Ty.getElementType() != S16)
4500 return true;
4502 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4503 if (RepackedReg != VData) {
4504 MI.getOperand(1).setReg(RepackedReg);
4507 return true;
4510 Register DstReg = MI.getOperand(0).getReg();
4511 LLT Ty = MRI->getType(DstReg);
4512 const LLT EltTy = Ty.getScalarType();
4513 const bool IsD16 = Ty.getScalarType() == S16;
4514 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4516 // Confirm that the return type is large enough for the dmask specified
4517 if (NumElts < DMaskLanes)
4518 return false;
4520 if (NumElts > 4 || DMaskLanes > 4)
4521 return false;
4523 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4524 const LLT AdjustedTy =
4525 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4527 // The raw dword aligned data component of the load. The only legal cases
4528 // where this matters should be when using the packed D16 format, for
4529 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4530 LLT RoundedTy;
4532 // S32 vector to cover all data, plus the TFE result element.
4533 LLT TFETy;
4535 // Register type to use for each loaded component. Will be S32 or V2S16.
4536 LLT RegTy;
4538 if (IsD16 && ST.hasUnpackedD16VMem()) {
4539 RoundedTy =
4540 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4541 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4542 RegTy = S32;
4543 } else {
4544 unsigned EltSize = EltTy.getSizeInBits();
4545 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4546 unsigned RoundedSize = 32 * RoundedElts;
4547 RoundedTy = LLT::scalarOrVector(
4548 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4549 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4550 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
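    // E.g. a non-D16 <4 x s32> result with TFE uses TFETy = <5 x s32>: four
    // data dwords followed by the TFE status dword.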
4553 // The return type does not need adjustment.
4554 // TODO: Should we change s16 case to s32 or <2 x s16>?
4555 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4556 return true;
4558 Register Dst1Reg;
4560 // Insert after the instruction.
4561 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4563 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4564 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4565 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4566 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4568 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4570 MI.getOperand(0).setReg(NewResultReg);
4572 // In the IR, TFE is supposed to be used with a 2 element struct return
4573 // type. The instruction really returns these two values in one contiguous
4574 // register, with one additional dword beyond the loaded data. Rewrite the
4575 // return type to use a single register result.
4577 if (IsTFE) {
4578 Dst1Reg = MI.getOperand(1).getReg();
4579 if (MRI->getType(Dst1Reg) != S32)
4580 return false;
4582 // TODO: Make sure the TFE operand bit is set.
4583 MI.RemoveOperand(1);
4585 // Handle the easy case that requires no repack instructions.
4586 if (Ty == S32) {
4587 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4588 return true;
4592 // Now figure out how to copy the new result register back into the old
4593 // result.
4594 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4596 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4598 if (ResultNumRegs == 1) {
4599 assert(!IsTFE);
4600 ResultRegs[0] = NewResultReg;
4601 } else {
4602 // We have to repack into a new vector of some kind.
4603 for (int I = 0; I != NumDataRegs; ++I)
4604 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4605 B.buildUnmerge(ResultRegs, NewResultReg);
4607 // Drop the final TFE element to get the data part. The TFE result is
4608 // directly written to the right place already.
4609 if (IsTFE)
4610 ResultRegs.resize(NumDataRegs);
4613 // For an s16 scalar result, we form an s32 result with a truncate regardless
4614 // of packed vs. unpacked.
4615 if (IsD16 && !Ty.isVector()) {
4616 B.buildTrunc(DstReg, ResultRegs[0]);
4617 return true;
4620 // Avoid a build/concat_vector of 1 entry.
4621 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4622 B.buildBitcast(DstReg, ResultRegs[0]);
4623 return true;
4626 assert(Ty.isVector());
4628 if (IsD16) {
4629 // For packed D16 results with TFE enabled, all the data components are
4630 // S32. Cast back to the expected type.
4632 // TODO: We don't really need to load s32 elements. We would only need one
4633 // cast for the TFE result if a multiple of v2s16 was used.
4634 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4635 for (Register &Reg : ResultRegs)
4636 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4637 } else if (ST.hasUnpackedD16VMem()) {
4638 for (Register &Reg : ResultRegs)
4639 Reg = B.buildTrunc(S16, Reg).getReg(0);
4643 auto padWithUndef = [&](LLT Ty, int NumElts) {
4644 if (NumElts == 0)
4645 return;
4646 Register Undef = B.buildUndef(Ty).getReg(0);
4647 for (int I = 0; I != NumElts; ++I)
4648 ResultRegs.push_back(Undef);
4651 // Pad out any elements eliminated due to the dmask.
4652 LLT ResTy = MRI->getType(ResultRegs[0]);
4653 if (!ResTy.isVector()) {
4654 padWithUndef(ResTy, NumElts - ResultRegs.size());
4655 B.buildBuildVector(DstReg, ResultRegs);
4656 return true;
4659 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4660 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4662 // Deal with the one annoying legal case.
4663 const LLT V3S16 = LLT::fixed_vector(3, 16);
4664 if (Ty == V3S16) {
4665 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4666 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4667 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4668 return true;
4671 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4672 B.buildConcatVectors(DstReg, ResultRegs);
4673 return true;
4676 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4677 LegalizerHelper &Helper, MachineInstr &MI) const {
4678 MachineIRBuilder &B = Helper.MIRBuilder;
4679 GISelChangeObserver &Observer = Helper.Observer;
4681 Register Dst = MI.getOperand(0).getReg();
4682 LLT Ty = B.getMRI()->getType(Dst);
4683 unsigned Size = Ty.getSizeInBits();
4684 MachineFunction &MF = B.getMF();
4686 Observer.changingInstr(MI);
4688 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4689 Ty = getBitcastRegisterType(Ty);
4690 Helper.bitcastDst(MI, Ty, 0);
4691 Dst = MI.getOperand(0).getReg();
4692 B.setInsertPt(B.getMBB(), MI);
4695 // FIXME: We don't really need this intermediate instruction. The intrinsic
4696 // should be fixed to have a memory operand. Since it's readnone, we're not
4697 // allowed to add one.
4698 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4699 MI.RemoveOperand(1); // Remove intrinsic ID
4701 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4702 // TODO: Should this use datalayout alignment?
4703 const unsigned MemSize = (Size + 7) / 8;
4704 const Align MemAlign(4);
4705 MachineMemOperand *MMO = MF.getMachineMemOperand(
4706 MachinePointerInfo(),
4707 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4708 MachineMemOperand::MOInvariant,
4709 MemSize, MemAlign);
4710 MI.addMemOperand(MF, MMO);
4712 // There are no 96-bit result scalar loads, but widening to 128-bit should
4713 // always be legal. We may need to restore this to a 96-bit result if it turns
4714 // out this needs to be converted to a vector load during RegBankSelect.
4715 if (!isPowerOf2_32(Size)) {
4716 if (Ty.isVector())
4717 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4718 else
4719 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4722 Observer.changedInstr(MI);
4723 return true;
4726 // TODO: Move to selection
4727 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4728 MachineRegisterInfo &MRI,
4729 MachineIRBuilder &B) const {
4730 if (!ST.isTrapHandlerEnabled() ||
4731 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4732 return legalizeTrapEndpgm(MI, MRI, B);
4734 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4735 switch (*HsaAbiVer) {
4736 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4737 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4738 return legalizeTrapHsaQueuePtr(MI, MRI, B);
4739 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4740 return ST.supportsGetDoorbellID() ?
4741 legalizeTrapHsa(MI, MRI, B) :
4742 legalizeTrapHsaQueuePtr(MI, MRI, B);
4746 llvm_unreachable("Unknown trap handler");
4749 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4750 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4751 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4752 MI.eraseFromParent();
4753 return true;
4756 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4757 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4758 // Pass queue pointer to trap handler as input, and insert trap instruction
4759 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4760 Register LiveIn =
4761 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4762 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4763 return false;
4765 Register SGPR01(AMDGPU::SGPR0_SGPR1);
4766 B.buildCopy(SGPR01, LiveIn);
4767 B.buildInstr(AMDGPU::S_TRAP)
4768 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4769 .addReg(SGPR01, RegState::Implicit);
4771 MI.eraseFromParent();
4772 return true;
4775 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4776 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4777 B.buildInstr(AMDGPU::S_TRAP)
4778 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4779 MI.eraseFromParent();
4780 return true;
4783 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4784 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4785 // If this is a non-HSA path or the trap handler is disabled, report a
4786 // warning accordingly.
4787 if (!ST.isTrapHandlerEnabled() ||
4788 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4789 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4790 "debugtrap handler not supported",
4791 MI.getDebugLoc(), DS_Warning);
4792 LLVMContext &Ctx = B.getMF().getFunction().getContext();
4793 Ctx.diagnose(NoTrap);
4794 } else {
4795 // Insert debug-trap instruction
4796 B.buildInstr(AMDGPU::S_TRAP)
4797 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4800 MI.eraseFromParent();
4801 return true;
4804 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4805 MachineIRBuilder &B) const {
4806 MachineRegisterInfo &MRI = *B.getMRI();
4807 const LLT S16 = LLT::scalar(16);
4808 const LLT S32 = LLT::scalar(32);
4810 Register DstReg = MI.getOperand(0).getReg();
4811 Register NodePtr = MI.getOperand(2).getReg();
4812 Register RayExtent = MI.getOperand(3).getReg();
4813 Register RayOrigin = MI.getOperand(4).getReg();
4814 Register RayDir = MI.getOperand(5).getReg();
4815 Register RayInvDir = MI.getOperand(6).getReg();
4816 Register TDescr = MI.getOperand(7).getReg();
4818 if (!ST.hasGFX10_AEncoding()) {
4819 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4820 "intrinsic not supported on subtarget",
4821 MI.getDebugLoc());
4822 B.getMF().getFunction().getContext().diagnose(BadIntrin);
4823 return false;
4826 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4827 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4828 const unsigned NumVDataDwords = 4;
4829 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
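  // E.g. a 64-bit node pointer with 32-bit ray components needs
  // 2 (node) + 1 (extent) + 3 (origin) + 3 (dir) + 3 (inv dir) = 12 dwords;
  // with a16 the dir and inv dir halves pack into 3 dwords total, giving 9.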
4830 const bool UseNSA =
4831 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4832 const unsigned BaseOpcodes[2][2] = {
4833 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4834 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4835 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4836 int Opcode;
4837 if (UseNSA) {
4838 Opcode =
4839 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4840 NumVDataDwords, NumVAddrDwords);
4841 } else {
4842 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4843 AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4844 PowerOf2Ceil(NumVAddrDwords));
4846 assert(Opcode != -1);
4848 SmallVector<Register, 12> Ops;
4849 if (Is64) {
4850 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4851 Ops.push_back(Unmerge.getReg(0));
4852 Ops.push_back(Unmerge.getReg(1));
4853 } else {
4854 Ops.push_back(NodePtr);
4856 Ops.push_back(RayExtent);
4858 auto packLanes = [&Ops, &S32, &B] (Register Src) {
4859 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4860 Ops.push_back(Unmerge.getReg(0));
4861 Ops.push_back(Unmerge.getReg(1));
4862 Ops.push_back(Unmerge.getReg(2));
4865 packLanes(RayOrigin);
4866 if (IsA16) {
4867 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4868 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4869 Register R1 = MRI.createGenericVirtualRegister(S32);
4870 Register R2 = MRI.createGenericVirtualRegister(S32);
4871 Register R3 = MRI.createGenericVirtualRegister(S32);
4872 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4873 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4874 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4875 Ops.push_back(R1);
4876 Ops.push_back(R2);
4877 Ops.push_back(R3);
4878 } else {
4879 packLanes(RayDir);
4880 packLanes(RayInvDir);
4883 if (!UseNSA) {
4884 // Build a single vector containing all the operands prepared so far.
4885 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4886 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4887 Ops.clear();
4888 Ops.push_back(MergedOps);
4891 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4892 .addDef(DstReg)
4893 .addImm(Opcode);
4895 for (Register R : Ops) {
4896 MIB.addUse(R);
4899 MIB.addUse(TDescr)
4900 .addImm(IsA16 ? 1 : 0)
4901 .cloneMemRefs(MI);
4903 MI.eraseFromParent();
4904 return true;
4907 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4908 MachineInstr &MI) const {
4909 MachineIRBuilder &B = Helper.MIRBuilder;
4910 MachineRegisterInfo &MRI = *B.getMRI();
4912 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4913 auto IntrID = MI.getIntrinsicID();
4914 switch (IntrID) {
4915 case Intrinsic::amdgcn_if:
4916 case Intrinsic::amdgcn_else: {
4917 MachineInstr *Br = nullptr;
4918 MachineBasicBlock *UncondBrTarget = nullptr;
4919 bool Negated = false;
4920 if (MachineInstr *BrCond =
4921 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4922 const SIRegisterInfo *TRI
4923 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4925 Register Def = MI.getOperand(1).getReg();
4926 Register Use = MI.getOperand(3).getReg();
4928 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4930 if (Negated)
4931 std::swap(CondBrTarget, UncondBrTarget);
4933 B.setInsertPt(B.getMBB(), BrCond->getIterator());
4934 if (IntrID == Intrinsic::amdgcn_if) {
4935 B.buildInstr(AMDGPU::SI_IF)
4936 .addDef(Def)
4937 .addUse(Use)
4938 .addMBB(UncondBrTarget);
4939 } else {
4940 B.buildInstr(AMDGPU::SI_ELSE)
4941 .addDef(Def)
4942 .addUse(Use)
4943 .addMBB(UncondBrTarget);
4946 if (Br) {
4947 Br->getOperand(0).setMBB(CondBrTarget);
4948 } else {
4949 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4950 // since we're swapping branch targets it needs to be reinserted.
4951 // FIXME: IRTranslator should probably not do this
4952 B.buildBr(*CondBrTarget);
4955 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4956 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4957 MI.eraseFromParent();
4958 BrCond->eraseFromParent();
4959 return true;
4962 return false;
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
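      // SI_LOOP takes the accumulated exit mask and keeps branching back to
      // UncondBrTarget while any lanes remain active in the loop; once exec is
      // empty, control falls through toward CondBrTarget.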
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
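  // The workitem/workgroup IDs and the various dispatch/queue pointers are
  // values preloaded into input registers by the ABI; each of these
  // intrinsics just forwards the corresponding preloaded argument.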
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
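  // The wavefront size is a compile-time subtarget property, so the intrinsic
  // folds directly to a constant.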
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
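  // Raw and struct buffer loads/stores share one legalization path; the
  // boolean arguments select the typed (tbuffer) and format variants.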
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
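  // Every buffer atomic flavour funnels into legalizeBufferAtomic; the
  // intrinsic ID is passed along so the helper can pick the matching
  // machine opcode.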
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
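  // Image intrinsics are recognized by a table lookup rather than individual
  // cases; anything else is already legal as far as this hook is concerned.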
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;