[Codegen] Alter the default promotion for saturating adds and subs
[llvm-complete.git] / lib / Target / AMDGPU / AMDGPULegalizerInfo.cpp
blob 7fe0298f1c33c1b478bf5659a91de999acd615de
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
34 #define DEBUG_TYPE "amdgpu-legalinfo"
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94 // Increase the number of vector elements so that the total size reaches the
95 // next multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
105 assert(EltSize < 32);
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
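// The helpers above are the building blocks for the rule set constructed in
// the constructor below: a LegalityPredicate answers "does this rule apply to
// the queried types / memory descriptor?", and a LegalizeMutation answers
// "what should the offending type index become?". A rough, hypothetical
// sketch of how they compose (G_FOO is a placeholder, not a rule in this
// file):
//
//   getActionDefinitionsBuilder(G_FOO)
//       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
//       .legalIf(isRegisterType(0));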
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
225 const LLT CodePtr = FlatPtr;
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
247 setAction({G_BRCOND, S1}, Legal);
249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250 // elements for v3s16
251 getActionDefinitionsBuilder(G_PHI)
252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253 .legalFor(AllS32Vectors)
254 .legalFor(AllS64Vectors)
255 .legalFor(AddrSpaces64)
256 .legalFor(AddrSpaces32)
257 .clampScalar(0, S32, S256)
258 .widenScalarToNextPow2(0, 32)
259 .clampMaxNumElements(0, S32, 16)
260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261 .legalIf(isPointer(0));
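// Rough illustration of how the G_PHI rules above resolve (not exhaustive):
// an s8 phi is in no legalFor list, so clampScalar widens it to s32; a v3s16
// phi is a small odd vector, so moreElementsIf pads it to the explicitly
// legal v4s16. Rules are tried in the order they are listed.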
263 if (ST.has16BitInsts()) {
264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265 .legalFor({S32, S16})
266 .clampScalar(0, S16, S32)
267 .scalarize(0);
268 } else {
269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270 .legalFor({S32})
271 .clampScalar(0, S32, S32)
272 .scalarize(0);
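// For illustration: with 16-bit instructions an s8 add is clamped up to s16
// and a v2s16 add is scalarized into two s16 adds; without them, everything
// is widened to the single legal s32 form. This is just a reading of the
// rules above; the legalizer makes the actual decision when it queries them.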
275 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276 .legalFor({S32})
277 .clampScalar(0, S32, S32)
278 .scalarize(0);
280 // Report legal for any types we can handle anywhere. For the cases only legal
281 // on the SALU, RegBankSelect will be able to re-legalize.
282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284 .clampScalar(0, S32, S64)
285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287 .widenScalarToNextPow2(0)
288 .scalarize(0);
290 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292 .legalFor({{S32, S1}})
293 .clampScalar(0, S32, S32)
294 .scalarize(0); // TODO: Implement.
296 getActionDefinitionsBuilder(G_BITCAST)
297 // Don't worry about the size constraint.
298 .legalIf(all(isRegisterType(0), isRegisterType(1)))
299 // FIXME: Testing hack
300 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
302 getActionDefinitionsBuilder(G_FCONSTANT)
303 .legalFor({S32, S64, S16})
304 .clampScalar(0, S16, S64);
306 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
307 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
308 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
309 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
310 .clampScalarOrElt(0, S32, S1024)
311 .legalIf(isMultiple32(0))
312 .widenScalarToNextPow2(0, 32)
313 .clampMaxNumElements(0, S32, 16);
316 // FIXME: i1 operands to intrinsics should always be legal, but other i1
317 // values may not be legal. We need to figure out how to distinguish
318 // between these two scenarios.
319 getActionDefinitionsBuilder(G_CONSTANT)
320 .legalFor({S1, S32, S64, S16, GlobalPtr,
321 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
322 .clampScalar(0, S32, S64)
323 .widenScalarToNextPow2(0)
324 .legalIf(isPointer(0));
326 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
327 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
328 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
331 auto &FPOpActions = getActionDefinitionsBuilder(
332 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
333 .legalFor({S32, S64});
334 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
335 .customFor({S32, S64});
337 if (ST.has16BitInsts()) {
338 if (ST.hasVOP3PInsts())
339 FPOpActions.legalFor({S16, V2S16});
340 else
341 FPOpActions.legalFor({S16});
343 TrigActions.customFor({S16});
346 auto &MinNumMaxNum = getActionDefinitionsBuilder({
347 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
349 if (ST.hasVOP3PInsts()) {
350 MinNumMaxNum.customFor(FPTypesPK16)
351 .clampMaxNumElements(0, S16, 2)
352 .clampScalar(0, S16, S64)
353 .scalarize(0);
354 } else if (ST.has16BitInsts()) {
355 MinNumMaxNum.customFor(FPTypes16)
356 .clampScalar(0, S16, S64)
357 .scalarize(0);
358 } else {
359 MinNumMaxNum.customFor(FPTypesBase)
360 .clampScalar(0, S32, S64)
361 .scalarize(0);
364 if (ST.hasVOP3PInsts())
365 FPOpActions.clampMaxNumElements(0, S16, 2);
367 FPOpActions
368 .scalarize(0)
369 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
371 TrigActions
372 .scalarize(0)
373 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
375 getActionDefinitionsBuilder({G_FNEG, G_FABS})
376 .legalFor(FPTypesPK16)
377 .clampMaxNumElements(0, S16, 2)
378 .scalarize(0)
379 .clampScalar(0, S16, S64);
381 // TODO: Implement
382 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
384 if (ST.has16BitInsts()) {
385 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
386 .legalFor({S32, S64, S16})
387 .scalarize(0)
388 .clampScalar(0, S16, S64);
389 } else {
390 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
391 .legalFor({S32, S64})
392 .scalarize(0)
393 .clampScalar(0, S32, S64);
396 getActionDefinitionsBuilder(G_FPTRUNC)
397 .legalFor({{S32, S64}, {S16, S32}})
398 .scalarize(0);
400 getActionDefinitionsBuilder(G_FPEXT)
401 .legalFor({{S64, S32}, {S32, S16}})
402 .lowerFor({{S64, S16}}) // FIXME: Implement
403 .scalarize(0);
405 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
406 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
408 getActionDefinitionsBuilder(G_FSUB)
409 // Use actual fsub instruction
410 .legalFor({S32})
411 // Must use fadd + fneg
412 .lowerFor({S64, S16, V2S16})
413 .scalarize(0)
414 .clampScalar(0, S32, S64);
416 // Whether this is legal depends on the floating point mode for the function.
417 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
418 if (ST.hasMadF16())
419 FMad.customFor({S32, S16});
420 else
421 FMad.customFor({S32});
422 FMad.scalarize(0)
423 .lower();
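// G_FMAD is only worthwhile when denormals are flushed for the type, which
// is a per-function property, so it is marked custom here and the decision is
// deferred to legalizeFMad() below: keep the instruction when the relevant
// denormal mode is off, otherwise fall back to the generic mul + add
// lowering.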
425 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
426 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
427 {S32, S1}, {S64, S1}, {S16, S1},
428 {S96, S32},
429 // FIXME: Hack
430 {S64, LLT::scalar(33)},
431 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
432 .scalarize(0);
434 // TODO: Split s1->s64 during regbankselect for VALU.
435 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
436 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
437 .lowerFor({{S32, S64}})
438 .customFor({{S64, S64}});
439 if (ST.has16BitInsts())
440 IToFP.legalFor({{S16, S16}});
441 IToFP.clampScalar(1, S32, S64)
442 .scalarize(0);
444 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
445 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
446 if (ST.has16BitInsts())
447 FPToI.legalFor({{S16, S16}});
448 else
449 FPToI.minScalar(1, S32);
451 FPToI.minScalar(0, S32)
452 .scalarize(0);
454 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
455 .legalFor({S32, S64})
456 .scalarize(0);
458 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
459 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
460 .legalFor({S32, S64})
461 .clampScalar(0, S32, S64)
462 .scalarize(0);
463 } else {
464 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
465 .legalFor({S32})
466 .customFor({S64})
467 .clampScalar(0, S32, S64)
468 .scalarize(0);
471 getActionDefinitionsBuilder(G_GEP)
472 .legalForCartesianProduct(AddrSpaces64, {S64})
473 .legalForCartesianProduct(AddrSpaces32, {S32})
474 .scalarize(0);
476 getActionDefinitionsBuilder(G_PTR_MASK)
477 .scalarize(0)
478 .alwaysLegal();
480 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
482 auto &CmpBuilder =
483 getActionDefinitionsBuilder(G_ICMP)
484 .legalForCartesianProduct(
485 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
486 .legalFor({{S1, S32}, {S1, S64}});
487 if (ST.has16BitInsts()) {
488 CmpBuilder.legalFor({{S1, S16}});
491 CmpBuilder
492 .widenScalarToNextPow2(1)
493 .clampScalar(1, S32, S64)
494 .scalarize(0)
495 .legalIf(all(typeIs(0, S1), isPointer(1)));
497 getActionDefinitionsBuilder(G_FCMP)
498 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
499 .widenScalarToNextPow2(1)
500 .clampScalar(1, S32, S64)
501 .scalarize(0);
503 // FIXME: fexp, flog2, flog10 need to be custom lowered.
504 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
505 G_FLOG, G_FLOG2, G_FLOG10})
506 .legalFor({S32})
507 .scalarize(0);
509 // The 64-bit versions produce 32-bit results, but only on the SALU.
510 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
511 G_CTTZ, G_CTTZ_ZERO_UNDEF,
512 G_CTPOP})
513 .legalFor({{S32, S32}, {S32, S64}})
514 .clampScalar(0, S32, S32)
515 .clampScalar(1, S32, S64)
516 .scalarize(0)
517 .widenScalarToNextPow2(0, 32)
518 .widenScalarToNextPow2(1, 32);
520 // TODO: Expand for > s32
521 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
522 .legalFor({S32})
523 .clampScalar(0, S32, S32)
524 .scalarize(0);
526 if (ST.has16BitInsts()) {
527 if (ST.hasVOP3PInsts()) {
528 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
529 .legalFor({S32, S16, V2S16})
530 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
531 .clampMaxNumElements(0, S16, 2)
532 .clampScalar(0, S16, S32)
533 .widenScalarToNextPow2(0)
534 .scalarize(0);
535 } else {
536 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
537 .legalFor({S32, S16})
538 .widenScalarToNextPow2(0)
539 .clampScalar(0, S16, S32)
540 .scalarize(0);
542 } else {
543 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
544 .legalFor({S32})
545 .clampScalar(0, S32, S32)
546 .widenScalarToNextPow2(0)
547 .scalarize(0);
550 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
551 return [=](const LegalityQuery &Query) {
552 return Query.Types[TypeIdx0].getSizeInBits() <
553 Query.Types[TypeIdx1].getSizeInBits();
557 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
558 return [=](const LegalityQuery &Query) {
559 return Query.Types[TypeIdx0].getSizeInBits() >
560 Query.Types[TypeIdx1].getSizeInBits();
564 getActionDefinitionsBuilder(G_INTTOPTR)
565 // List the common cases
566 .legalForCartesianProduct(AddrSpaces64, {S64})
567 .legalForCartesianProduct(AddrSpaces32, {S32})
568 .scalarize(0)
569 // Accept any address space as long as the size matches
570 .legalIf(sameSize(0, 1))
571 .widenScalarIf(smallerThan(1, 0),
572 [](const LegalityQuery &Query) {
573 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
575 .narrowScalarIf(greaterThan(1, 0),
576 [](const LegalityQuery &Query) {
577 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
580 getActionDefinitionsBuilder(G_PTRTOINT)
581 // List the common cases
582 .legalForCartesianProduct(AddrSpaces64, {S64})
583 .legalForCartesianProduct(AddrSpaces32, {S32})
584 .scalarize(0)
585 // Accept any address space as long as the size matches
586 .legalIf(sameSize(0, 1))
587 .widenScalarIf(smallerThan(0, 1),
588 [](const LegalityQuery &Query) {
589 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
591 .narrowScalarIf(
592 greaterThan(0, 1),
593 [](const LegalityQuery &Query) {
594 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
597 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
598 .scalarize(0)
599 .custom();
601 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
602 // handle some operations by just promoting the register during
603 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
604 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
605 switch (AS) {
606 // FIXME: Private element size.
607 case AMDGPUAS::PRIVATE_ADDRESS:
608 return 32;
609 // FIXME: Check subtarget
610 case AMDGPUAS::LOCAL_ADDRESS:
611 return ST.useDS128() ? 128 : 64;
613 // Treat constant and global as identical. SMRD loads are sometimes usable
614 // for global loads (ideally constant address space should be eliminated)
615 // depending on the context. Legality cannot be context dependent, but
616 // RegBankSelect can split the load as necessary depending on the pointer
617 // register bank/uniformity and if the memory is invariant or not written in
618 // a kernel.
619 case AMDGPUAS::CONSTANT_ADDRESS:
620 case AMDGPUAS::GLOBAL_ADDRESS:
621 return 512;
622 default:
623 return 128;
627 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
628 const LLT DstTy = Query.Types[0];
630 // Split vector extloads.
631 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
632 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
633 return true;
635 const LLT PtrTy = Query.Types[1];
636 unsigned AS = PtrTy.getAddressSpace();
637 if (MemSize > maxSizeForAddrSpace(AS))
638 return true;
640 // Catch weird sized loads that don't evenly divide into the access sizes
641 // TODO: May be able to widen depending on alignment etc.
642 unsigned NumRegs = MemSize / 32;
643 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
644 return true;
646 unsigned Align = Query.MMODescrs[0].AlignInBits;
647 if (Align < MemSize) {
648 const SITargetLowering *TLI = ST.getTargetLowering();
649 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
652 return false;
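// Illustrative queries that would report true here: a 96-bit global load on
// a subtarget without dwordx3 load/stores, an access whose memory size
// exceeds maxSizeForAddrSpace() for its address space, or an access whose
// alignment is smaller than its memory size and that
// allowsMisalignedMemoryAccessesImpl() rejects.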
655 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
656 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
657 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
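// As used in the mem-desc lists below, the fourth field is the minimum
// alignment in bits for the rule to match; a value of 0 (when unaligned
// buffer access is available) appears to accept any alignment.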
659 // TODO: Refine based on subtargets which support unaligned access or 128-bit
660 // LDS
661 // TODO: Unsupported flat for SI.
663 for (unsigned Op : {G_LOAD, G_STORE}) {
664 const bool IsStore = Op == G_STORE;
666 auto &Actions = getActionDefinitionsBuilder(Op);
667 // Whitelist the common cases.
668 // TODO: Pointer loads
669 // TODO: Wide constant loads
670 // TODO: Only CI+ has 3x loads
671 // TODO: Loads to s16 on gfx9
672 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
673 {V2S32, GlobalPtr, 64, GlobalAlign32},
674 {V3S32, GlobalPtr, 96, GlobalAlign32},
675 {S96, GlobalPtr, 96, GlobalAlign32},
676 {V4S32, GlobalPtr, 128, GlobalAlign32},
677 {S128, GlobalPtr, 128, GlobalAlign32},
678 {S64, GlobalPtr, 64, GlobalAlign32},
679 {V2S64, GlobalPtr, 128, GlobalAlign32},
680 {V2S16, GlobalPtr, 32, GlobalAlign32},
681 {S32, GlobalPtr, 8, GlobalAlign8},
682 {S32, GlobalPtr, 16, GlobalAlign16},
684 {S32, LocalPtr, 32, 32},
685 {S64, LocalPtr, 64, 32},
686 {V2S32, LocalPtr, 64, 32},
687 {S32, LocalPtr, 8, 8},
688 {S32, LocalPtr, 16, 16},
689 {V2S16, LocalPtr, 32, 32},
691 {S32, PrivatePtr, 32, 32},
692 {S32, PrivatePtr, 8, 8},
693 {S32, PrivatePtr, 16, 16},
694 {V2S16, PrivatePtr, 32, 32},
696 {S32, FlatPtr, 32, GlobalAlign32},
697 {S32, FlatPtr, 16, GlobalAlign16},
698 {S32, FlatPtr, 8, GlobalAlign8},
699 {V2S16, FlatPtr, 32, GlobalAlign32},
701 {S32, ConstantPtr, 32, GlobalAlign32},
702 {V2S32, ConstantPtr, 64, GlobalAlign32},
703 {V3S32, ConstantPtr, 96, GlobalAlign32},
704 {V4S32, ConstantPtr, 128, GlobalAlign32},
705 {S64, ConstantPtr, 64, GlobalAlign32},
706 {S128, ConstantPtr, 128, GlobalAlign32},
707 {V2S32, ConstantPtr, 32, GlobalAlign32}});
708 Actions
709 .customIf(typeIs(1, Constant32Ptr))
710 .narrowScalarIf(
711 [=](const LegalityQuery &Query) -> bool {
712 return !Query.Types[0].isVector() && needToSplitLoad(Query);
714 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
715 const LLT DstTy = Query.Types[0];
716 const LLT PtrTy = Query.Types[1];
718 const unsigned DstSize = DstTy.getSizeInBits();
719 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
721 // Split extloads.
722 if (DstSize > MemSize)
723 return std::make_pair(0, LLT::scalar(MemSize));
725 if (DstSize > 32 && (DstSize % 32 != 0)) {
726 // FIXME: Need a way to specify non-extload of larger size if
727 // suitably aligned.
728 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
731 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
732 if (MemSize > MaxSize)
733 return std::make_pair(0, LLT::scalar(MaxSize));
735 unsigned Align = Query.MMODescrs[0].AlignInBits;
736 return std::make_pair(0, LLT::scalar(Align));
738 .fewerElementsIf(
739 [=](const LegalityQuery &Query) -> bool {
740 return Query.Types[0].isVector() && needToSplitLoad(Query);
742 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743 const LLT DstTy = Query.Types[0];
744 const LLT PtrTy = Query.Types[1];
746 LLT EltTy = DstTy.getElementType();
747 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
749 // Split if it's too large for the address space.
750 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
751 unsigned NumElts = DstTy.getNumElements();
752 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
754 // FIXME: Refine when odd breakdowns handled
755 // The scalars will need to be re-legalized.
756 if (NumPieces == 1 || NumPieces >= NumElts ||
757 NumElts % NumPieces != 0)
758 return std::make_pair(0, EltTy);
760 return std::make_pair(0,
761 LLT::vector(NumElts / NumPieces, EltTy));
764 // Need to split because of alignment.
765 unsigned Align = Query.MMODescrs[0].AlignInBits;
766 unsigned EltSize = EltTy.getSizeInBits();
767 if (EltSize > Align &&
768 (EltSize / Align < DstTy.getNumElements())) {
769 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
772 // May need relegalization for the scalars.
773 return std::make_pair(0, EltTy);
775 .minScalar(0, S32);
777 if (IsStore)
778 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
780 // TODO: Need a bitcast lower option?
781 Actions
782 .legalIf([=](const LegalityQuery &Query) {
783 const LLT Ty0 = Query.Types[0];
784 unsigned Size = Ty0.getSizeInBits();
785 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
786 unsigned Align = Query.MMODescrs[0].AlignInBits;
788 // No extending vector loads.
789 if (Size > MemSize && Ty0.isVector())
790 return false;
792 // FIXME: Widening store from alignment not valid.
793 if (MemSize < Size)
794 MemSize = std::max(MemSize, Align);
796 switch (MemSize) {
797 case 8:
798 case 16:
799 return Size == 32;
800 case 32:
801 case 64:
802 case 128:
803 return true;
804 case 96:
805 return ST.hasDwordx3LoadStores();
806 case 256:
807 case 512:
808 return true;
809 default:
810 return false;
813 .widenScalarToNextPow2(0)
814 // TODO: v3s32->v4s32 with alignment
815 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
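// Net effect of the legalIf above, roughly: 8/16-bit accesses are only
// accepted as 32-bit register operations, 96-bit accesses require dwordx3
// support, extending vector loads are rejected, and the remaining
// power-of-two sizes from 32 through 512 bits are accepted. For example,
// {s64, GlobalPtr} with a 64-bit memory size is legal, while a 96-bit access
// is legal only on subtargets with dwordx3 load/stores.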
818 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
819 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
820 {S32, GlobalPtr, 16, 2 * 8},
821 {S32, LocalPtr, 8, 8},
822 {S32, LocalPtr, 16, 16},
823 {S32, PrivatePtr, 8, 8},
824 {S32, PrivatePtr, 16, 16},
825 {S32, ConstantPtr, 8, 8},
826 {S32, ConstantPtr, 16, 2 * 8}});
827 if (ST.hasFlatAddressSpace()) {
828 ExtLoads.legalForTypesWithMemDesc(
829 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
832 ExtLoads.clampScalar(0, S32, S32)
833 .widenScalarToNextPow2(0)
834 .unsupportedIfMemSizeNotPow2()
835 .lower();
837 auto &Atomics = getActionDefinitionsBuilder(
838 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
839 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
840 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
841 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
842 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
843 {S64, GlobalPtr}, {S64, LocalPtr}});
844 if (ST.hasFlatAddressSpace()) {
845 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
848 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
849 .legalFor({{S32, LocalPtr}});
851 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
852 .lower();
854 // TODO: Pointer types, any 32-bit or 64-bit vector
855 getActionDefinitionsBuilder(G_SELECT)
856 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
857 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
858 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
859 .clampScalar(0, S16, S64)
860 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
862 .scalarize(1)
863 .clampMaxNumElements(0, S32, 2)
864 .clampMaxNumElements(0, LocalPtr, 2)
865 .clampMaxNumElements(0, PrivatePtr, 2)
866 .scalarize(0)
867 .widenScalarToNextPow2(0)
868 .legalIf(all(isPointer(0), typeIs(1, S1)));
870 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
871 // be more flexible with the shift amount type.
872 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
873 .legalFor({{S32, S32}, {S64, S32}});
874 if (ST.has16BitInsts()) {
875 if (ST.hasVOP3PInsts()) {
876 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
877 .clampMaxNumElements(0, S16, 2);
878 } else
879 Shifts.legalFor({{S16, S32}, {S16, S16}});
881 Shifts.clampScalar(1, S16, S32);
882 Shifts.clampScalar(0, S16, S64);
883 Shifts.widenScalarToNextPow2(0, 16);
884 } else {
885 // Make sure we legalize the shift amount type first, as the general
886 // expansion for the shifted type will produce much worse code if it hasn't
887 // been truncated already.
888 Shifts.clampScalar(1, S32, S32);
889 Shifts.clampScalar(0, S32, S64);
890 Shifts.widenScalarToNextPow2(0, 32);
892 Shifts.scalarize(0);
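// Illustration: with 16-bit instructions an s64 shift by an s64 amount keeps
// the s64 value but clamps the amount to s32, giving the legal {S64, S32}
// form; without 16-bit instructions an s16 shift is widened to the s32 forms.
// Clamping operand 1 first avoids expanding a wide shift whose amount would
// have been truncated anyway.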
894 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
895 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
896 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
897 unsigned IdxTypeIdx = 2;
899 getActionDefinitionsBuilder(Op)
900 .customIf([=](const LegalityQuery &Query) {
901 const LLT EltTy = Query.Types[EltTypeIdx];
902 const LLT VecTy = Query.Types[VecTypeIdx];
903 const LLT IdxTy = Query.Types[IdxTypeIdx];
904 return (EltTy.getSizeInBits() == 16 ||
905 EltTy.getSizeInBits() % 32 == 0) &&
906 VecTy.getSizeInBits() % 32 == 0 &&
907 VecTy.getSizeInBits() <= 1024 &&
908 IdxTy.getSizeInBits() == 32;
910 .clampScalar(EltTypeIdx, S32, S64)
911 .clampScalar(VecTypeIdx, S32, S64)
912 .clampScalar(IdxTypeIdx, S32, S32);
915 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
916 .unsupportedIf([=](const LegalityQuery &Query) {
917 const LLT &EltTy = Query.Types[1].getElementType();
918 return Query.Types[0] != EltTy;
921 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
922 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
923 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
925 // FIXME: Doesn't handle extract of illegal sizes.
926 getActionDefinitionsBuilder(Op)
927 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
928 // FIXME: Multiples of 16 should not be legal.
929 .legalIf([=](const LegalityQuery &Query) {
930 const LLT BigTy = Query.Types[BigTyIdx];
931 const LLT LitTy = Query.Types[LitTyIdx];
932 return (BigTy.getSizeInBits() % 32 == 0) &&
933 (LitTy.getSizeInBits() % 16 == 0);
935 .widenScalarIf(
936 [=](const LegalityQuery &Query) {
937 const LLT BigTy = Query.Types[BigTyIdx];
938 return (BigTy.getScalarSizeInBits() < 16);
940 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
941 .widenScalarIf(
942 [=](const LegalityQuery &Query) {
943 const LLT LitTy = Query.Types[LitTyIdx];
944 return (LitTy.getScalarSizeInBits() < 16);
946 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
947 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
948 .widenScalarToNextPow2(BigTyIdx, 32);
952 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
953 .legalForCartesianProduct(AllS32Vectors, {S32})
954 .legalForCartesianProduct(AllS64Vectors, {S64})
955 .clampNumElements(0, V16S32, V32S32)
956 .clampNumElements(0, V2S64, V16S64)
957 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
959 if (ST.hasScalarPackInsts())
960 BuildVector.legalFor({V2S16, S32});
962 BuildVector
963 .minScalarSameAs(1, 0)
964 .legalIf(isRegisterType(0))
965 .minScalarOrElt(0, S32);
967 if (ST.hasScalarPackInsts()) {
968 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
969 .legalFor({V2S16, S32})
970 .lower();
971 } else {
972 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
973 .lower();
976 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
977 .legalIf(isRegisterType(0));
979 // TODO: Don't fully scalarize v2s16 pieces
980 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
982 // Merge/Unmerge
983 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
984 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
985 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
987 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
988 const LLT &Ty = Query.Types[TypeIdx];
989 if (Ty.isVector()) {
990 const LLT &EltTy = Ty.getElementType();
991 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
992 return true;
993 if (!isPowerOf2_32(EltTy.getSizeInBits()))
994 return true;
996 return false;
999 auto &Builder = getActionDefinitionsBuilder(Op)
1000 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1001 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1002 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1003 // valid.
1004 .clampScalar(LitTyIdx, S16, S256)
1005 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1006 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1007 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1008 elementTypeIs(1, S16)),
1009 changeTo(1, V2S16))
1010 // Break up vectors with weird elements into scalars
1011 .fewerElementsIf(
1012 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1013 scalarize(0))
1014 .fewerElementsIf(
1015 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1016 scalarize(1))
1017 .clampScalar(BigTyIdx, S32, S1024)
1018 .lowerFor({{S16, V2S16}});
1020 if (Op == G_MERGE_VALUES) {
1021 Builder.widenScalarIf(
1022 // TODO: Use 16-bit shifts if legal for 8-bit values?
1023 [=](const LegalityQuery &Query) {
1024 const LLT Ty = Query.Types[LitTyIdx];
1025 return Ty.getSizeInBits() < 32;
1027 changeTo(LitTyIdx, S32));
1030 Builder.widenScalarIf(
1031 [=](const LegalityQuery &Query) {
1032 const LLT Ty = Query.Types[BigTyIdx];
1033 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1034 Ty.getSizeInBits() % 16 != 0;
1036 [=](const LegalityQuery &Query) {
1037 // Pick the next power of 2, or a multiple of 64 over 128,
1038 // whichever is smaller.
1039 const LLT &Ty = Query.Types[BigTyIdx];
1040 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1041 if (NewSizeInBits >= 256) {
1042 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1043 if (RoundedTo < NewSizeInBits)
1044 NewSizeInBits = RoundedTo;
1046 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1048 .legalIf([=](const LegalityQuery &Query) {
1049 const LLT &BigTy = Query.Types[BigTyIdx];
1050 const LLT &LitTy = Query.Types[LitTyIdx];
1052 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1053 return false;
1054 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1055 return false;
1057 return BigTy.getSizeInBits() % 16 == 0 &&
1058 LitTy.getSizeInBits() % 16 == 0 &&
1059 BigTy.getSizeInBits() <= 1024;
1061 // Any vectors left are the wrong size. Scalarize them.
1062 .scalarize(0)
1063 .scalarize(1);
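// Rough examples of the BigTy widening rule above: an s88 value is neither a
// power of 2 nor a multiple of 16, so it is widened to s128 (the next power
// of 2); an s300 value would be widened to s320, since past 256 bits the next
// multiple of 64 (320) is smaller than the next power of 2 (512).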
1066 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1068 computeTables();
1069 verify(*ST.getInstrInfo());
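// Once the tables are computed, passes consult them through the standard
// LegalizerInfo query interface. A hypothetical query, for illustration only:
//
//   LegalizeActionStep Step = getAction({G_ADD, {LLT::scalar(8)}});
//   // Expected: WidenScalar, to s16 or s32 depending on has16BitInsts().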
1072 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1073 MachineRegisterInfo &MRI,
1074 MachineIRBuilder &B,
1075 GISelChangeObserver &Observer) const {
1076 switch (MI.getOpcode()) {
1077 case TargetOpcode::G_ADDRSPACE_CAST:
1078 return legalizeAddrSpaceCast(MI, MRI, B);
1079 case TargetOpcode::G_FRINT:
1080 return legalizeFrint(MI, MRI, B);
1081 case TargetOpcode::G_FCEIL:
1082 return legalizeFceil(MI, MRI, B);
1083 case TargetOpcode::G_INTRINSIC_TRUNC:
1084 return legalizeIntrinsicTrunc(MI, MRI, B);
1085 case TargetOpcode::G_SITOFP:
1086 return legalizeITOFP(MI, MRI, B, true);
1087 case TargetOpcode::G_UITOFP:
1088 return legalizeITOFP(MI, MRI, B, false);
1089 case TargetOpcode::G_FMINNUM:
1090 case TargetOpcode::G_FMAXNUM:
1091 case TargetOpcode::G_FMINNUM_IEEE:
1092 case TargetOpcode::G_FMAXNUM_IEEE:
1093 return legalizeMinNumMaxNum(MI, MRI, B);
1094 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1095 return legalizeExtractVectorElt(MI, MRI, B);
1096 case TargetOpcode::G_INSERT_VECTOR_ELT:
1097 return legalizeInsertVectorElt(MI, MRI, B);
1098 case TargetOpcode::G_FSIN:
1099 case TargetOpcode::G_FCOS:
1100 return legalizeSinCos(MI, MRI, B);
1101 case TargetOpcode::G_GLOBAL_VALUE:
1102 return legalizeGlobalValue(MI, MRI, B);
1103 case TargetOpcode::G_LOAD:
1104 return legalizeLoad(MI, MRI, B, Observer);
1105 case TargetOpcode::G_FMAD:
1106 return legalizeFMad(MI, MRI, B);
1107 default:
1108 return false;
1111 llvm_unreachable("expected switch to return");
1114 Register AMDGPULegalizerInfo::getSegmentAperture(
1115 unsigned AS,
1116 MachineRegisterInfo &MRI,
1117 MachineIRBuilder &B) const {
1118 MachineFunction &MF = B.getMF();
1119 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1120 const LLT S32 = LLT::scalar(32);
1122 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1124 if (ST.hasApertureRegs()) {
1125 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1126 // getreg.
1127 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1128 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1129 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1130 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1131 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1132 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1133 unsigned Encoding =
1134 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1135 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1136 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1138 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1139 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1141 B.buildInstr(AMDGPU::S_GETREG_B32)
1142 .addDef(GetReg)
1143 .addImm(Encoding);
1144 MRI.setType(GetReg, S32);
1146 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1147 B.buildInstr(TargetOpcode::G_SHL)
1148 .addDef(ApertureReg)
1149 .addUse(GetReg)
1150 .addUse(ShiftAmt.getReg(0));
1152 return ApertureReg;
1155 Register QueuePtr = MRI.createGenericVirtualRegister(
1156 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1158 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1159 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1160 return Register();
1162 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1163 // private_segment_aperture_base_hi.
1164 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1166 // FIXME: Don't use undef
1167 Value *V = UndefValue::get(PointerType::get(
1168 Type::getInt8Ty(MF.getFunction().getContext()),
1169 AMDGPUAS::CONSTANT_ADDRESS));
1171 MachinePointerInfo PtrInfo(V, StructOffset);
1172 MachineMemOperand *MMO = MF.getMachineMemOperand(
1173 PtrInfo,
1174 MachineMemOperand::MOLoad |
1175 MachineMemOperand::MODereferenceable |
1176 MachineMemOperand::MOInvariant,
1178 MinAlign(64, StructOffset));
1180 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1181 Register LoadAddr;
1183 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1184 B.buildLoad(LoadResult, LoadAddr, *MMO);
1185 return LoadResult;
1188 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1189 MachineInstr &MI, MachineRegisterInfo &MRI,
1190 MachineIRBuilder &B) const {
1191 MachineFunction &MF = B.getMF();
1193 B.setInstr(MI);
1195 const LLT S32 = LLT::scalar(32);
1196 Register Dst = MI.getOperand(0).getReg();
1197 Register Src = MI.getOperand(1).getReg();
1199 LLT DstTy = MRI.getType(Dst);
1200 LLT SrcTy = MRI.getType(Src);
1201 unsigned DestAS = DstTy.getAddressSpace();
1202 unsigned SrcAS = SrcTy.getAddressSpace();
1204 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1205 // vector element.
1206 assert(!DstTy.isVector());
1208 const AMDGPUTargetMachine &TM
1209 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1213 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1214 return true;
1217 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1218 // Truncate.
1219 B.buildExtract(Dst, Src, 0);
1220 MI.eraseFromParent();
1221 return true;
1224 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1225 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1226 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1228 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1229 // another. Merge operands are required to be the same type, but creating an
1230 // extra ptrtoint would be kind of pointless.
1231 auto HighAddr = B.buildConstant(
1232 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1233 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1234 MI.eraseFromParent();
1235 return true;
1238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1239 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1240 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1241 unsigned NullVal = TM.getNullPointerValue(DestAS);
1243 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1244 auto FlatNull = B.buildConstant(SrcTy, 0);
1246 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1248 // Extract low 32-bits of the pointer.
1249 B.buildExtract(PtrLo32, Src, 0);
1251 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1252 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1253 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1255 MI.eraseFromParent();
1256 return true;
1259 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1260 return false;
1262 if (!ST.hasFlatAddressSpace())
1263 return false;
1265 auto SegmentNull =
1266 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1267 auto FlatNull =
1268 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1270 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1271 if (!ApertureReg.isValid())
1272 return false;
1274 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1275 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1277 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1279 // Coerce the type of the low half of the result so we can use merge_values.
1280 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1281 B.buildInstr(TargetOpcode::G_PTRTOINT)
1282 .addDef(SrcAsInt)
1283 .addUse(Src);
1285 // TODO: Should we allow mismatched types but matching sizes in merges to
1286 // avoid the ptrtoint?
1287 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1288 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1290 MI.eraseFromParent();
1291 return true;
1294 bool AMDGPULegalizerInfo::legalizeFrint(
1295 MachineInstr &MI, MachineRegisterInfo &MRI,
1296 MachineIRBuilder &B) const {
1297 B.setInstr(MI);
1299 Register Src = MI.getOperand(1).getReg();
1300 LLT Ty = MRI.getType(Src);
1301 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1303 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1304 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1306 auto C1 = B.buildFConstant(Ty, C1Val);
1307 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1309 // TODO: Should this propagate fast-math-flags?
1310 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1311 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1313 auto C2 = B.buildFConstant(Ty, C2Val);
1314 auto Fabs = B.buildFAbs(Ty, Src);
1316 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1317 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1318 return true;
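// The trick above: doubles with magnitude >= 2^52 (C1Val) have no fractional
// bits, so x + copysign(2^52, x) - copysign(2^52, x) rounds x to the nearest
// integer. For example 2.7 + 2^52 = 4503599627370498.7, which is stored as
// ...499.0, and subtracting 2^52 back leaves 3.0. Inputs whose magnitude
// already exceeds C2Val (just below 2^52) are integers and are returned
// unchanged by the select.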
1321 bool AMDGPULegalizerInfo::legalizeFceil(
1322 MachineInstr &MI, MachineRegisterInfo &MRI,
1323 MachineIRBuilder &B) const {
1324 B.setInstr(MI);
1326 const LLT S1 = LLT::scalar(1);
1327 const LLT S64 = LLT::scalar(64);
1329 Register Src = MI.getOperand(1).getReg();
1330 assert(MRI.getType(Src) == S64);
1332 // result = trunc(src)
1333 // if (src > 0.0 && src != result)
1334 // result += 1.0
1336 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1338 const auto Zero = B.buildFConstant(S64, 0.0);
1339 const auto One = B.buildFConstant(S64, 1.0);
1340 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1341 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1342 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1343 auto Add = B.buildSelect(S64, And, One, Zero);
1345 // TODO: Should this propagate fast-math-flags?
1346 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1347 return true;
1350 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1351 MachineIRBuilder &B) {
1352 const unsigned FractBits = 52;
1353 const unsigned ExpBits = 11;
1354 LLT S32 = LLT::scalar(32);
1356 auto Const0 = B.buildConstant(S32, FractBits - 32);
1357 auto Const1 = B.buildConstant(S32, ExpBits);
1359 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
.addUse(Hi) // amdgcn_ubfe operands: source, offset, width.
1360 .addUse(Const0.getReg(0))
1361 .addUse(Const1.getReg(0));
1363 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1366 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1367 MachineInstr &MI, MachineRegisterInfo &MRI,
1368 MachineIRBuilder &B) const {
1369 B.setInstr(MI);
1371 const LLT S1 = LLT::scalar(1);
1372 const LLT S32 = LLT::scalar(32);
1373 const LLT S64 = LLT::scalar(64);
1375 Register Src = MI.getOperand(1).getReg();
1376 assert(MRI.getType(Src) == S64);
1378 // TODO: Should this use extract since the low half is unused?
1379 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1380 Register Hi = Unmerge.getReg(1);
1382 // Extract the upper half, since this is where we will find the sign and
1383 // exponent.
1384 auto Exp = extractF64Exponent(Hi, B);
1386 const unsigned FractBits = 52;
1388 // Extract the sign bit.
1389 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1390 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1392 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1394 const auto Zero32 = B.buildConstant(S32, 0);
1396 // Extend back to 64-bits.
1397 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1399 auto Shr = B.buildAShr(S64, FractMask, Exp);
1400 auto Not = B.buildNot(S64, Shr);
1401 auto Tmp0 = B.buildAnd(S64, Src, Not);
1402 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1404 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1405 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1407 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1408 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1409 return true;
1412 bool AMDGPULegalizerInfo::legalizeITOFP(
1413 MachineInstr &MI, MachineRegisterInfo &MRI,
1414 MachineIRBuilder &B, bool Signed) const {
1415 B.setInstr(MI);
1417 Register Dst = MI.getOperand(0).getReg();
1418 Register Src = MI.getOperand(1).getReg();
1420 const LLT S64 = LLT::scalar(64);
1421 const LLT S32 = LLT::scalar(32);
1423 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1425 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1427 auto CvtHi = Signed ?
1428 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1429 B.buildUITOFP(S64, Unmerge.getReg(1));
1431 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1433 auto ThirtyTwo = B.buildConstant(S32, 32);
1434 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1435 .addUse(CvtHi.getReg(0))
1436 .addUse(ThirtyTwo.getReg(0));
1438 // TODO: Should this propagate fast-math-flags?
1439 B.buildFAdd(Dst, LdExp, CvtLo);
1440 MI.eraseFromParent();
1441 return true;
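// In other words, the 64-bit integer is converted as fp(hi) * 2^32 + fp(lo):
// ldexp performs the exact scaling by 2^32, the high half uses a signed or
// unsigned conversion depending on the opcode, and the low half is always
// treated as unsigned. For example 0x100000005 -> 1.0 * 2^32 + 5.0.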
1444 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1445 MachineInstr &MI, MachineRegisterInfo &MRI,
1446 MachineIRBuilder &B) const {
1447 MachineFunction &MF = B.getMF();
1448 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1450 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1451 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1453 // With ieee_mode disabled, the instructions have the correct behavior
1454 // already for G_FMINNUM/G_FMAXNUM
1455 if (!MFI->getMode().IEEE)
1456 return !IsIEEEOp;
1458 if (IsIEEEOp)
1459 return true;
1461 MachineIRBuilder HelperBuilder(MI);
1462 GISelObserverWrapper DummyObserver;
1463 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1464 HelperBuilder.setInstr(MI);
1465 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1468 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1469 MachineInstr &MI, MachineRegisterInfo &MRI,
1470 MachineIRBuilder &B) const {
1471 // TODO: Should move some of this into LegalizerHelper.
1473 // TODO: Promote dynamic indexing of s16 to s32
1474 // TODO: Dynamic s64 indexing is only legal for SGPR.
1475 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1476 if (!IdxVal) // Dynamic case will be selected to register indexing.
1477 return true;
1479 Register Dst = MI.getOperand(0).getReg();
1480 Register Vec = MI.getOperand(1).getReg();
1482 LLT VecTy = MRI.getType(Vec);
1483 LLT EltTy = VecTy.getElementType();
1484 assert(EltTy == MRI.getType(Dst));
1486 B.setInstr(MI);
1488 if (IdxVal.getValue() < VecTy.getNumElements())
1489 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1490 else
1491 B.buildUndef(Dst);
1493 MI.eraseFromParent();
1494 return true;
1497 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1498 MachineInstr &MI, MachineRegisterInfo &MRI,
1499 MachineIRBuilder &B) const {
1500 // TODO: Should move some of this into LegalizerHelper.
1502 // TODO: Promote dynamic indexing of s16 to s32
1503 // TODO: Dynamic s64 indexing is only legal for SGPR.
1504 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1505 if (!IdxVal) // Dynamic case will be selected to register indexing.
1506 return true;
1508 Register Dst = MI.getOperand(0).getReg();
1509 Register Vec = MI.getOperand(1).getReg();
1510 Register Ins = MI.getOperand(2).getReg();
1512 LLT VecTy = MRI.getType(Vec);
1513 LLT EltTy = VecTy.getElementType();
1514 assert(EltTy == MRI.getType(Ins));
1516 B.setInstr(MI);
1518 if (IdxVal.getValue() < VecTy.getNumElements())
1519 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1520 else
1521 B.buildUndef(Dst);
1523 MI.eraseFromParent();
1524 return true;
1527 bool AMDGPULegalizerInfo::legalizeSinCos(
1528 MachineInstr &MI, MachineRegisterInfo &MRI,
1529 MachineIRBuilder &B) const {
1530 B.setInstr(MI);
1532 Register DstReg = MI.getOperand(0).getReg();
1533 Register SrcReg = MI.getOperand(1).getReg();
1534 LLT Ty = MRI.getType(DstReg);
1535 unsigned Flags = MI.getFlags();
1537 Register TrigVal;
1538 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1539 if (ST.hasTrigReducedRange()) {
1540 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1541 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1542 .addUse(MulVal.getReg(0))
1543 .setMIFlags(Flags).getReg(0);
1544 } else
1545 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1547 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1548 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1549 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1550 .addUse(TrigVal)
1551 .setMIFlags(Flags);
1552 MI.eraseFromParent();
1553 return true;
1556 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1557 Register DstReg, LLT PtrTy,
1558 MachineIRBuilder &B, const GlobalValue *GV,
1559 unsigned Offset, unsigned GAFlags) const {
1560 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1561 // to the following code sequence:
1563 // For constant address space:
1564 // s_getpc_b64 s[0:1]
1565 // s_add_u32 s0, s0, $symbol
1566 // s_addc_u32 s1, s1, 0
1568 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1569 // a fixup or relocation is emitted to replace $symbol with a literal
1570 // constant, which is a pc-relative offset from the encoding of the $symbol
1571 // operand to the global variable.
1573 // For global address space:
1574 // s_getpc_b64 s[0:1]
1575 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1576 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1578 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1579 // fixups or relocations are emitted to replace $symbol@*@lo and
1580 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1581 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1582 // operand to the global variable.
1584 // What we want here is an offset from the value returned by s_getpc
1585 // (which is the address of the s_add_u32 instruction) to the global
1586 // variable, but since the encoding of $symbol starts 4 bytes after the start
1587 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1588 // small. This requires us to add 4 to the global variable offset in order to
1589 // compute the correct address.
1591 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1593 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1594 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1596 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1597 .addDef(PCReg);
1599 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1600 if (GAFlags == SIInstrInfo::MO_NONE)
1601 MIB.addImm(0);
1602 else
1603 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1605 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1607 if (PtrTy.getSizeInBits() == 32)
1608 B.buildExtract(DstReg, PCReg, 0);
1609 return true;
1612 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1613 MachineInstr &MI, MachineRegisterInfo &MRI,
1614 MachineIRBuilder &B) const {
1615 Register DstReg = MI.getOperand(0).getReg();
1616 LLT Ty = MRI.getType(DstReg);
1617 unsigned AS = Ty.getAddressSpace();
1619 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1620 MachineFunction &MF = B.getMF();
1621 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622 B.setInstr(MI);
1624 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1625 if (!MFI->isEntryFunction()) {
1626 const Function &Fn = MF.getFunction();
1627 DiagnosticInfoUnsupported BadLDSDecl(
1628 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1629 Fn.getContext().diagnose(BadLDSDecl);
1632 // TODO: We could emit code to handle the initialization somewhere.
1633 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1634 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1635 MI.eraseFromParent();
1636 return true;
1639 const Function &Fn = MF.getFunction();
1640 DiagnosticInfoUnsupported BadInit(
1641 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1642 Fn.getContext().diagnose(BadInit);
1643 return true;
1646 const SITargetLowering *TLI = ST.getTargetLowering();
1648 if (TLI->shouldEmitFixup(GV)) {
1649 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1650 MI.eraseFromParent();
1651 return true;
1654 if (TLI->shouldEmitPCReloc(GV)) {
1655 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1656 MI.eraseFromParent();
1657 return true;
1660 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1661 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1663 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1664 MachinePointerInfo::getGOT(MF),
1665 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1666 MachineMemOperand::MOInvariant,
1667 8 /*Size*/, 8 /*Align*/);
1669 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1671 if (Ty.getSizeInBits() == 32) {
1672 // Truncate if this is a 32-bit constant address.
1673 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1674 B.buildExtract(DstReg, Load, 0);
1675 } else
1676 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1678 MI.eraseFromParent();
1679 return true;
1682 bool AMDGPULegalizerInfo::legalizeLoad(
1683 MachineInstr &MI, MachineRegisterInfo &MRI,
1684 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1685 B.setInstr(MI);
1686 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1687 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1688 Observer.changingInstr(MI);
1689 MI.getOperand(1).setReg(Cast.getReg(0));
1690 Observer.changedInstr(MI);
1691 return true;
1694 bool AMDGPULegalizerInfo::legalizeFMad(
1695 MachineInstr &MI, MachineRegisterInfo &MRI,
1696 MachineIRBuilder &B) const {
1697 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1698 assert(Ty.isScalar());
1700 // TODO: Always legal with future ftz flag.
1701 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1702 return true;
1703 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1704 return true;
1706 MachineFunction &MF = B.getMF();
1708 MachineIRBuilder HelperBuilder(MI);
1709 GISelObserverWrapper DummyObserver;
1710 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1711 HelperBuilder.setMBB(*MI.getParent());
1712 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1713 }
1715 // Return the lone G_BRCOND in the same block that uses the condition, or null if the usage is invalid.
1716 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1717 MachineRegisterInfo &MRI) {
1718 Register CondDef = MI.getOperand(0).getReg();
1719 if (!MRI.hasOneNonDBGUse(CondDef))
1720 return nullptr;
1722 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1723 return UseMI.getParent() == MI.getParent() &&
1724 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1725 }
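/// Return the virtual register already bound to the live-in physical register
/// \p Reg, or create one of type \p Ty and record the live-in.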
1727 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1728 Register Reg, LLT Ty) const {
1729 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1730 if (LiveIn)
1731 return LiveIn;
1733 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1734 MRI.addLiveIn(Reg, NewReg);
1735 return NewReg;
1736 }
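/// Copy the preloaded argument described by \p Arg into \p DstReg, applying a
/// shift and mask for packed (masked) arguments, and emit the live-in copy in
/// the entry block if it does not already exist.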
1738 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1739 const ArgDescriptor *Arg) const {
1740 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1741 return false; // TODO: Handle these
1743 assert(Arg->getRegister().isPhysical());
1745 MachineRegisterInfo &MRI = *B.getMRI();
1747 LLT Ty = MRI.getType(DstReg);
1748 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1750 if (Arg->isMasked()) {
1751 // TODO: Should we try to emit this once in the entry block?
1752 const LLT S32 = LLT::scalar(32);
1753 const unsigned Mask = Arg->getMask();
1754 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1756 Register AndMaskSrc = LiveIn;
1758 if (Shift != 0) {
1759 auto ShiftAmt = B.buildConstant(S32, Shift);
1760 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1761 }
1763 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1764 } else
1765 B.buildCopy(DstReg, LiveIn);
1767 // Insert the argument copy if it doesn't already exist.
1768 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1769 if (!MRI.getVRegDef(LiveIn)) {
1770 // FIXME: Should have scoped insert pt
1771 MachineBasicBlock &OrigInsBB = B.getMBB();
1772 auto OrigInsPt = B.getInsertPt();
1774 MachineBasicBlock &EntryMBB = B.getMF().front();
1775 EntryMBB.addLiveIn(Arg->getRegister());
1776 B.setInsertPt(EntryMBB, EntryMBB.begin());
1777 B.buildCopy(LiveIn, Arg->getRegister());
1779 B.setInsertPt(OrigInsBB, OrigInsPt);
1780 }
1782 return true;
1783 }
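/// Replace an intrinsic that reads a preloaded argument with a copy from the
/// corresponding input register; fails if the argument was not allocated.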
1785 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1786 MachineInstr &MI,
1787 MachineRegisterInfo &MRI,
1788 MachineIRBuilder &B,
1789 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1790 B.setInstr(MI);
1792 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1794 const ArgDescriptor *Arg;
1795 const TargetRegisterClass *RC;
1796 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1797 if (!Arg) {
1798 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1799 return false;
1800 }
1802 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1803 MI.eraseFromParent();
1804 return true;
1805 }
1807 return false;
1808 }
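/// Expand amdgcn.fdiv.fast into a scaled reciprocal sequence: denominators
/// with magnitude above 2^96 are pre-scaled by 2^-32 so the reciprocal stays
/// in range, and the final product is multiplied by the same scale factor.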
1810 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1811 MachineRegisterInfo &MRI,
1812 MachineIRBuilder &B) const {
1813 B.setInstr(MI);
1814 Register Res = MI.getOperand(0).getReg();
1815 Register LHS = MI.getOperand(2).getReg();
1816 Register RHS = MI.getOperand(3).getReg();
1817 uint16_t Flags = MI.getFlags();
1819 LLT S32 = LLT::scalar(32);
1820 LLT S1 = LLT::scalar(1);
1822 auto Abs = B.buildFAbs(S32, RHS, Flags);
1823 const APFloat C0Val(1.0f);
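// 0x6f800000 is the bit pattern of 0x1.0p+96f and 0x2f800000 of 0x1.0p-32f.
// If |RHS| exceeds 2^96, scale it down by 2^-32 before taking the reciprocal
// and multiply the result by the same factor afterwards.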
1825 auto C0 = B.buildConstant(S32, 0x6f800000);
1826 auto C1 = B.buildConstant(S32, 0x2f800000);
1827 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1829 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1830 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1832 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1834 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1835 .addUse(Mul0.getReg(0))
1836 .setMIFlags(Flags);
1838 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1840 B.buildFMul(Res, Sel, Mul1, Flags);
1842 MI.eraseFromParent();
1843 return true;
1844 }
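/// In callable functions the implicit argument pointer is a preloaded input;
/// in kernels it is the kernarg segment pointer plus the implicit parameter
/// offset.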
1846 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1847 MachineRegisterInfo &MRI,
1848 MachineIRBuilder &B) const {
1849 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1850 if (!MFI->isEntryFunction()) {
1851 return legalizePreloadedArgIntrin(MI, MRI, B,
1852 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1853 }
1855 B.setInstr(MI);
1857 uint64_t Offset =
1858 ST.getTargetLowering()->getImplicitParameterOffset(
1859 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1860 Register DstReg = MI.getOperand(0).getReg();
1861 LLT DstTy = MRI.getType(DstReg);
1862 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1864 const ArgDescriptor *Arg;
1865 const TargetRegisterClass *RC;
1866 std::tie(Arg, RC)
1867 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1868 if (!Arg)
1869 return false;
1871 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1872 if (!loadInputValue(KernargPtrReg, B, Arg))
1873 return false;
1875 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1876 MI.eraseFromParent();
1877 return true;
1878 }
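/// amdgcn.is.shared / amdgcn.is.private: compare the high 32 bits of the flat
/// pointer against the aperture of the queried address space.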
1880 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1881 MachineRegisterInfo &MRI,
1882 MachineIRBuilder &B,
1883 unsigned AddrSpace) const {
1884 B.setInstr(MI);
1885 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1886 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1887 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1888 MI.eraseFromParent();
1889 return true;
1890 }
1892 /// Handle register layout difference for f16 images for some subtargets.
1893 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1894 MachineRegisterInfo &MRI,
1895 Register Reg) const {
1896 if (!ST.hasUnpackedD16VMem())
1897 return Reg;
1899 const LLT S16 = LLT::scalar(16);
1900 const LLT S32 = LLT::scalar(32);
1901 LLT StoreVT = MRI.getType(Reg);
1902 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1904 auto Unmerge = B.buildUnmerge(S16, Reg);
1906 SmallVector<Register, 4> WideRegs;
1907 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1908 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1910 int NumElts = StoreVT.getNumElements();
1912 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1913 }
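/// Fix up the value operand of a raw buffer store: sub-dword scalars are
/// any-extended to 32 bits, and packed f16 vectors of format stores are
/// widened on subtargets with unpacked D16 memory instructions.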
1915 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1916 MachineRegisterInfo &MRI,
1917 MachineIRBuilder &B,
1918 bool IsFormat) const {
1919 // TODO: Reject the f16 format on targets where it is unsupported.
1920 Register VData = MI.getOperand(1).getReg();
1921 LLT Ty = MRI.getType(VData);
1923 B.setInstr(MI);
1925 const LLT S32 = LLT::scalar(32);
1926 const LLT S16 = LLT::scalar(16);
1928 // Fix up illegal register types for sub-dword (i8 and i16) stores.
1929 if (Ty == LLT::scalar(8) || Ty == S16) {
1930 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1931 MI.getOperand(1).setReg(AnyExt);
1932 return true;
1933 }
1935 if (Ty.isVector()) {
1936 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1937 if (IsFormat)
1938 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1939 return true;
1940 }
1942 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1943 }
1945 return Ty == S32;
1946 }
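/// Custom legalization for AMDGPU intrinsics: control-flow intrinsics become
/// SI_IF/SI_LOOP pseudos, argument intrinsics become copies of their
/// preloaded registers, and the remaining cases are expanded in place.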
1948 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1949 MachineRegisterInfo &MRI,
1950 MachineIRBuilder &B) const {
1951 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
1952 switch (MI.getIntrinsicID()) {
1953 case Intrinsic::amdgcn_if: {
1954 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1955 const SIRegisterInfo *TRI
1956 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1958 B.setInstr(*BrCond);
1959 Register Def = MI.getOperand(1).getReg();
1960 Register Use = MI.getOperand(3).getReg();
1961 B.buildInstr(AMDGPU::SI_IF)
1962 .addDef(Def)
1963 .addUse(Use)
1964 .addMBB(BrCond->getOperand(1).getMBB());
1966 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1967 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1968 MI.eraseFromParent();
1969 BrCond->eraseFromParent();
1970 return true;
1971 }
1973 return false;
1974 }
1975 case Intrinsic::amdgcn_loop: {
1976 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1977 const SIRegisterInfo *TRI
1978 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1980 B.setInstr(*BrCond);
1981 Register Reg = MI.getOperand(2).getReg();
1982 B.buildInstr(AMDGPU::SI_LOOP)
1983 .addUse(Reg)
1984 .addMBB(BrCond->getOperand(1).getMBB());
1985 MI.eraseFromParent();
1986 BrCond->eraseFromParent();
1987 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1988 return true;
1989 }
1991 return false;
1992 }
1993 case Intrinsic::amdgcn_kernarg_segment_ptr:
1994 return legalizePreloadedArgIntrin(
1995 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1996 case Intrinsic::amdgcn_implicitarg_ptr:
1997 return legalizeImplicitArgPtr(MI, MRI, B);
1998 case Intrinsic::amdgcn_workitem_id_x:
1999 return legalizePreloadedArgIntrin(MI, MRI, B,
2000 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2001 case Intrinsic::amdgcn_workitem_id_y:
2002 return legalizePreloadedArgIntrin(MI, MRI, B,
2003 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2004 case Intrinsic::amdgcn_workitem_id_z:
2005 return legalizePreloadedArgIntrin(MI, MRI, B,
2006 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2007 case Intrinsic::amdgcn_workgroup_id_x:
2008 return legalizePreloadedArgIntrin(MI, MRI, B,
2009 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2010 case Intrinsic::amdgcn_workgroup_id_y:
2011 return legalizePreloadedArgIntrin(MI, MRI, B,
2012 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2013 case Intrinsic::amdgcn_workgroup_id_z:
2014 return legalizePreloadedArgIntrin(MI, MRI, B,
2015 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2016 case Intrinsic::amdgcn_dispatch_ptr:
2017 return legalizePreloadedArgIntrin(MI, MRI, B,
2018 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2019 case Intrinsic::amdgcn_queue_ptr:
2020 return legalizePreloadedArgIntrin(MI, MRI, B,
2021 AMDGPUFunctionArgInfo::QUEUE_PTR);
2022 case Intrinsic::amdgcn_implicit_buffer_ptr:
2023 return legalizePreloadedArgIntrin(
2024 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2025 case Intrinsic::amdgcn_dispatch_id:
2026 return legalizePreloadedArgIntrin(MI, MRI, B,
2027 AMDGPUFunctionArgInfo::DISPATCH_ID);
2028 case Intrinsic::amdgcn_fdiv_fast:
2029 return legalizeFDIVFast(MI, MRI, B);
2030 case Intrinsic::amdgcn_is_shared:
2031 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2032 case Intrinsic::amdgcn_is_private:
2033 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2034 case Intrinsic::amdgcn_wavefrontsize: {
2035 B.setInstr(MI);
2036 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2037 MI.eraseFromParent();
2038 return true;
2039 }
2040 case Intrinsic::amdgcn_raw_buffer_store:
2041 return legalizeRawBufferStore(MI, MRI, B, false);
2042 case Intrinsic::amdgcn_raw_buffer_store_format:
2043 return legalizeRawBufferStore(MI, MRI, B, true);
2044 default:
2045 return true;
2046 }
2048 return true;