1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
34 #define DEBUG_TYPE "amdgpu-legalinfo"
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 1024) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52 return [=](const LegalityQuery &Query) {
53 return Query.Types[TypeIdx].getSizeInBits() == Size;
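// Sub-32-bit element vectors with an odd element count whose total size is
// not a multiple of 32 bits, e.g. v3s16 (48 bits) or v5s8 (40 bits).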
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58 return [=](const LegalityQuery &Query) {
59 const LLT Ty = Query.Types[TypeIdx];
60 return Ty.isVector() &&
61 Ty.getNumElements() % 2 != 0 &&
62 Ty.getElementType().getSizeInBits() < 32 &&
63 Ty.getSizeInBits() % 32 != 0;
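// Vectors of 16-bit elements with more than two elements, e.g. v3s16 or
// v4s16; v2s16 itself is excluded.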
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68 return [=](const LegalityQuery &Query) {
69 const LLT Ty = Query.Types[TypeIdx];
70 const LLT EltTy = Ty.getScalarType();
71 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 const LLT EltTy = Ty.getElementType();
79 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
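// Reduce the number of vector elements so each piece fits in 64 bits,
// e.g. v4s32 (128 bits) becomes v2s32, and v3s32 (96 bits) becomes v2s32.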
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84 return [=](const LegalityQuery &Query) {
85 const LLT Ty = Query.Types[TypeIdx];
86 const LLT EltTy = Ty.getElementType();
87 unsigned Size = Ty.getSizeInBits();
88 unsigned Pieces = (Size + 63) / 64;
89 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94 // Increase the number of vector elements to reach the next multiple of 32-bit
95 // type.
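// For example, v3s8 (24 bits) is widened to v4s8 (32 bits), and v5s16
// (80 bits) to v6s16 (96 bits).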
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
100 const LLT EltTy = Ty.getElementType();
101 const int Size = Ty.getSizeInBits();
102 const int EltSize = EltTy.getSizeInBits();
103 const int NextMul32 = (Size + 31) / 32;
105 assert(EltSize < 32);
107 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113 return [=](const LegalityQuery &Query) {
114 const LLT QueryTy = Query.Types[TypeIdx];
115 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120 return [=](const LegalityQuery &Query) {
121 const LLT QueryTy = Query.Types[TypeIdx];
122 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127 return [=](const LegalityQuery &Query) {
128 const LLT QueryTy = Query.Types[TypeIdx];
129 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
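// For example, s32, s64, v2s32, v4s16 and v16s32 qualify, while s24 and
// v3s8 do not.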
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136 return [=](const LegalityQuery &Query) {
137 const LLT Ty = Query.Types[TypeIdx];
138 if (Ty.isVector()) {
139 const int EltSize = Ty.getElementType().getSizeInBits();
140 return EltSize == 32 || EltSize == 64 ||
141 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142 EltSize == 128 || EltSize == 256;
145 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150 return [=](const LegalityQuery &Query) {
151 return Query.Types[TypeIdx].getElementType() == Type;
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156 return [=](const LegalityQuery &Query) {
157 const LLT Ty = Query.Types[TypeIdx];
158 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164 const GCNTargetMachine &TM)
165 : ST(ST_) {
166 using namespace TargetOpcode;
168 auto GetAddrSpacePtr = [&TM](unsigned AS) {
169 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
172 const LLT S1 = LLT::scalar(1);
173 const LLT S8 = LLT::scalar(8);
174 const LLT S16 = LLT::scalar(16);
175 const LLT S32 = LLT::scalar(32);
176 const LLT S64 = LLT::scalar(64);
177 const LLT S96 = LLT::scalar(96);
178 const LLT S128 = LLT::scalar(128);
179 const LLT S256 = LLT::scalar(256);
180 const LLT S1024 = LLT::scalar(1024);
182 const LLT V2S16 = LLT::vector(2, 16);
183 const LLT V4S16 = LLT::vector(4, 16);
185 const LLT V2S32 = LLT::vector(2, 32);
186 const LLT V3S32 = LLT::vector(3, 32);
187 const LLT V4S32 = LLT::vector(4, 32);
188 const LLT V5S32 = LLT::vector(5, 32);
189 const LLT V6S32 = LLT::vector(6, 32);
190 const LLT V7S32 = LLT::vector(7, 32);
191 const LLT V8S32 = LLT::vector(8, 32);
192 const LLT V9S32 = LLT::vector(9, 32);
193 const LLT V10S32 = LLT::vector(10, 32);
194 const LLT V11S32 = LLT::vector(11, 32);
195 const LLT V12S32 = LLT::vector(12, 32);
196 const LLT V13S32 = LLT::vector(13, 32);
197 const LLT V14S32 = LLT::vector(14, 32);
198 const LLT V15S32 = LLT::vector(15, 32);
199 const LLT V16S32 = LLT::vector(16, 32);
200 const LLT V32S32 = LLT::vector(32, 32);
202 const LLT V2S64 = LLT::vector(2, 64);
203 const LLT V3S64 = LLT::vector(3, 64);
204 const LLT V4S64 = LLT::vector(4, 64);
205 const LLT V5S64 = LLT::vector(5, 64);
206 const LLT V6S64 = LLT::vector(6, 64);
207 const LLT V7S64 = LLT::vector(7, 64);
208 const LLT V8S64 = LLT::vector(8, 64);
209 const LLT V16S64 = LLT::vector(16, 64);
211 std::initializer_list<LLT> AllS32Vectors =
212 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214 std::initializer_list<LLT> AllS64Vectors =
215 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
217 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
225 const LLT CodePtr = FlatPtr;
227 const std::initializer_list<LLT> AddrSpaces64 = {
228 GlobalPtr, ConstantPtr, FlatPtr
231 const std::initializer_list<LLT> AddrSpaces32 = {
232 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
235 const std::initializer_list<LLT> FPTypesBase = {
236 S32, S64
239 const std::initializer_list<LLT> FPTypes16 = {
240 S32, S64, S16
243 const std::initializer_list<LLT> FPTypesPK16 = {
244 S32, S64, S16, V2S16
247 setAction({G_BRCOND, S1}, Legal);
249 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250 // elements for v3s16
251 getActionDefinitionsBuilder(G_PHI)
252 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253 .legalFor(AllS32Vectors)
254 .legalFor(AllS64Vectors)
255 .legalFor(AddrSpaces64)
256 .legalFor(AddrSpaces32)
257 .clampScalar(0, S32, S256)
258 .widenScalarToNextPow2(0, 32)
259 .clampMaxNumElements(0, S32, 16)
260 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261 .legalIf(isPointer(0));
263 if (ST.has16BitInsts()) {
264 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265 .legalFor({S32, S16})
266 .clampScalar(0, S16, S32)
267 .scalarize(0);
268 } else {
269 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270 .legalFor({S32})
271 .clampScalar(0, S32, S32)
272 .scalarize(0);
275 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276 .legalFor({S32})
277 .clampScalar(0, S32, S32)
278 .scalarize(0);
280 // Report legal for any types we can handle anywhere. For the cases only legal
281 // on the SALU, RegBankSelect will be able to re-legalize.
282 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284 .clampScalar(0, S32, S64)
285 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287 .widenScalarToNextPow2(0)
288 .scalarize(0);
290 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292 .legalFor({{S32, S1}})
293 .clampScalar(0, S32, S32)
294 .scalarize(0); // TODO: Implement.
296 getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297 .lower();
299 getActionDefinitionsBuilder(G_BITCAST)
300 // Don't worry about the size constraint.
301 .legalIf(all(isRegisterType(0), isRegisterType(1)))
302 // FIXME: Testing hack
303 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
305 getActionDefinitionsBuilder(G_FCONSTANT)
306 .legalFor({S32, S64, S16})
307 .clampScalar(0, S16, S64);
309 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313 .clampScalarOrElt(0, S32, S1024)
314 .legalIf(isMultiple32(0))
315 .widenScalarToNextPow2(0, 32)
316 .clampMaxNumElements(0, S32, 16);
319 // FIXME: i1 operands to intrinsics should always be legal, but other i1
320 // values may not be legal. We need to figure out how to distinguish
321 // between these two scenarios.
322 getActionDefinitionsBuilder(G_CONSTANT)
323 .legalFor({S1, S32, S64, S16, GlobalPtr,
324 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325 .clampScalar(0, S32, S64)
326 .widenScalarToNextPow2(0)
327 .legalIf(isPointer(0));
329 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331 .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
334 auto &FPOpActions = getActionDefinitionsBuilder(
335 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336 .legalFor({S32, S64});
337 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338 .customFor({S32, S64});
339 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340 .customFor({S32, S64});
342 if (ST.has16BitInsts()) {
343 if (ST.hasVOP3PInsts())
344 FPOpActions.legalFor({S16, V2S16});
345 else
346 FPOpActions.legalFor({S16});
348 TrigActions.customFor({S16});
349 FDIVActions.customFor({S16});
352 auto &MinNumMaxNum = getActionDefinitionsBuilder({
353 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
355 if (ST.hasVOP3PInsts()) {
356 MinNumMaxNum.customFor(FPTypesPK16)
357 .clampMaxNumElements(0, S16, 2)
358 .clampScalar(0, S16, S64)
359 .scalarize(0);
360 } else if (ST.has16BitInsts()) {
361 MinNumMaxNum.customFor(FPTypes16)
362 .clampScalar(0, S16, S64)
363 .scalarize(0);
364 } else {
365 MinNumMaxNum.customFor(FPTypesBase)
366 .clampScalar(0, S32, S64)
367 .scalarize(0);
370 if (ST.hasVOP3PInsts())
371 FPOpActions.clampMaxNumElements(0, S16, 2);
373 FPOpActions
374 .scalarize(0)
375 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
377 TrigActions
378 .scalarize(0)
379 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
381 FDIVActions
382 .scalarize(0)
383 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 getActionDefinitionsBuilder({G_FNEG, G_FABS})
386 .legalFor(FPTypesPK16)
387 .clampMaxNumElements(0, S16, 2)
388 .scalarize(0)
389 .clampScalar(0, S16, S64);
391 // TODO: Implement
392 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
394 if (ST.has16BitInsts()) {
395 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396 .legalFor({S32, S64, S16})
397 .scalarize(0)
398 .clampScalar(0, S16, S64);
399 } else {
400 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401 .legalFor({S32, S64})
402 .scalarize(0)
403 .clampScalar(0, S32, S64);
406 getActionDefinitionsBuilder(G_FPTRUNC)
407 .legalFor({{S32, S64}, {S16, S32}})
408 .scalarize(0);
410 getActionDefinitionsBuilder(G_FPEXT)
411 .legalFor({{S64, S32}, {S32, S16}})
412 .lowerFor({{S64, S16}}) // FIXME: Implement
413 .scalarize(0);
415 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
418 getActionDefinitionsBuilder(G_FSUB)
419 // Use actual fsub instruction
420 .legalFor({S32})
421 // Must use fadd + fneg
422 .lowerFor({S64, S16, V2S16})
423 .scalarize(0)
424 .clampScalar(0, S32, S64);
426 // Whether this is legal depends on the floating point mode for the function.
427 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428 if (ST.hasMadF16())
429 FMad.customFor({S32, S16});
430 else
431 FMad.customFor({S32});
432 FMad.scalarize(0)
433 .lower();
435 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437 {S32, S1}, {S64, S1}, {S16, S1},
438 {S96, S32},
439 // FIXME: Hack
440 {S64, LLT::scalar(33)},
441 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442 .scalarize(0);
444 // TODO: Split s1->s64 during regbankselect for VALU.
445 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
447 .lowerFor({{S32, S64}})
448 .customFor({{S64, S64}});
449 if (ST.has16BitInsts())
450 IToFP.legalFor({{S16, S16}});
451 IToFP.clampScalar(1, S32, S64)
452 .scalarize(0);
454 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
455 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
456 if (ST.has16BitInsts())
457 FPToI.legalFor({{S16, S16}});
458 else
459 FPToI.minScalar(1, S32);
461 FPToI.minScalar(0, S32)
462 .scalarize(0);
464 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
465 .legalFor({S32, S64})
466 .scalarize(0);
468 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
469 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
470 .legalFor({S32, S64})
471 .clampScalar(0, S32, S64)
472 .scalarize(0);
473 } else {
474 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475 .legalFor({S32})
476 .customFor({S64})
477 .clampScalar(0, S32, S64)
478 .scalarize(0);
481 getActionDefinitionsBuilder(G_GEP)
482 .legalForCartesianProduct(AddrSpaces64, {S64})
483 .legalForCartesianProduct(AddrSpaces32, {S32})
484 .scalarize(0);
486 getActionDefinitionsBuilder(G_PTR_MASK)
487 .scalarize(0)
488 .alwaysLegal();
490 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
492 auto &CmpBuilder =
493 getActionDefinitionsBuilder(G_ICMP)
494 .legalForCartesianProduct(
495 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
496 .legalFor({{S1, S32}, {S1, S64}});
497 if (ST.has16BitInsts()) {
498 CmpBuilder.legalFor({{S1, S16}});
501 CmpBuilder
502 .widenScalarToNextPow2(1)
503 .clampScalar(1, S32, S64)
504 .scalarize(0)
505 .legalIf(all(typeIs(0, S1), isPointer(1)));
507 getActionDefinitionsBuilder(G_FCMP)
508 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
509 .widenScalarToNextPow2(1)
510 .clampScalar(1, S32, S64)
511 .scalarize(0);
513 // FIXME: fexp, flog2, flog10 need to be custom lowered.
514 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
515 G_FLOG, G_FLOG2, G_FLOG10})
516 .legalFor({S32})
517 .scalarize(0);
519 // The 64-bit versions produce 32-bit results, but only on the SALU.
520 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
521 G_CTTZ, G_CTTZ_ZERO_UNDEF,
522 G_CTPOP})
523 .legalFor({{S32, S32}, {S32, S64}})
524 .clampScalar(0, S32, S32)
525 .clampScalar(1, S32, S64)
526 .scalarize(0)
527 .widenScalarToNextPow2(0, 32)
528 .widenScalarToNextPow2(1, 32);
530 // TODO: Expand for > s32
531 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
532 .legalFor({S32})
533 .clampScalar(0, S32, S32)
534 .scalarize(0);
536 if (ST.has16BitInsts()) {
537 if (ST.hasVOP3PInsts()) {
538 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
539 .legalFor({S32, S16, V2S16})
540 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541 .clampMaxNumElements(0, S16, 2)
542 .clampScalar(0, S16, S32)
543 .widenScalarToNextPow2(0)
544 .scalarize(0);
545 } else {
546 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547 .legalFor({S32, S16})
548 .widenScalarToNextPow2(0)
549 .clampScalar(0, S16, S32)
550 .scalarize(0);
552 } else {
553 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554 .legalFor({S32})
555 .clampScalar(0, S32, S32)
556 .widenScalarToNextPow2(0)
557 .scalarize(0);
560 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561 return [=](const LegalityQuery &Query) {
562 return Query.Types[TypeIdx0].getSizeInBits() <
563 Query.Types[TypeIdx1].getSizeInBits();
567 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568 return [=](const LegalityQuery &Query) {
569 return Query.Types[TypeIdx0].getSizeInBits() >
570 Query.Types[TypeIdx1].getSizeInBits();
574 getActionDefinitionsBuilder(G_INTTOPTR)
575 // List the common cases
576 .legalForCartesianProduct(AddrSpaces64, {S64})
577 .legalForCartesianProduct(AddrSpaces32, {S32})
578 .scalarize(0)
579 // Accept any address space as long as the size matches
580 .legalIf(sameSize(0, 1))
581 .widenScalarIf(smallerThan(1, 0),
582 [](const LegalityQuery &Query) {
583 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
585 .narrowScalarIf(greaterThan(1, 0),
586 [](const LegalityQuery &Query) {
587 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
590 getActionDefinitionsBuilder(G_PTRTOINT)
591 // List the common cases
592 .legalForCartesianProduct(AddrSpaces64, {S64})
593 .legalForCartesianProduct(AddrSpaces32, {S32})
594 .scalarize(0)
595 // Accept any address space as long as the size matches
596 .legalIf(sameSize(0, 1))
597 .widenScalarIf(smallerThan(0, 1),
598 [](const LegalityQuery &Query) {
599 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
601 .narrowScalarIf(
602 greaterThan(0, 1),
603 [](const LegalityQuery &Query) {
604 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
607 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
608 .scalarize(0)
609 .custom();
611 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
612 // handle some operations by just promoting the register during
613 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
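// Maximum memory access size, in bits, that a single load/store may use in
// the given address space; wider accesses are split by the rules below.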
614 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
615 switch (AS) {
616 // FIXME: Private element size.
617 case AMDGPUAS::PRIVATE_ADDRESS:
618 return 32;
619 // FIXME: Check subtarget
620 case AMDGPUAS::LOCAL_ADDRESS:
621 return ST.useDS128() ? 128 : 64;
623 // Treat constant and global as identical. SMRD loads are sometimes usable
624 // for global loads (ideally constant address space should be eliminated)
625 // depending on the context. Legality cannot be context dependent, but
626 // RegBankSelect can split the load as necessary depending on the pointer
627 // register bank/uniformity and if the memory is invariant or not written in
628 // a kernel.
629 case AMDGPUAS::CONSTANT_ADDRESS:
630 case AMDGPUAS::GLOBAL_ADDRESS:
631 return 512;
632 default:
633 return 128;
637 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
638 const LLT DstTy = Query.Types[0];
640 // Split vector extloads.
641 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
642 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
643 return true;
645 const LLT PtrTy = Query.Types[1];
646 unsigned AS = PtrTy.getAddressSpace();
647 if (MemSize > maxSizeForAddrSpace(AS))
648 return true;
650 // Catch weird sized loads that don't evenly divide into the access sizes
651 // TODO: May be able to widen depending on alignment etc.
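// A three-dword (96-bit) access needs dwordx3 instructions, which not all
// subtargets provide.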
652 unsigned NumRegs = MemSize / 32;
653 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
654 return true;
656 unsigned Align = Query.MMODescrs[0].AlignInBits;
657 if (Align < MemSize) {
658 const SITargetLowering *TLI = ST.getTargetLowering();
659 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
662 return false;
665 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
666 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
667 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
669 // TODO: Refine based on subtargets which support unaligned access or 128-bit
670 // LDS
671 // TODO: Unsupported flat for SI.
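// Each legalForTypesWithMemDesc entry below is {value type, pointer type,
// memory size in bits, minimum alignment in bits}; the Global* alignments
// above drop to 0 (no constraint) when unaligned buffer access is available.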
673 for (unsigned Op : {G_LOAD, G_STORE}) {
674 const bool IsStore = Op == G_STORE;
676 auto &Actions = getActionDefinitionsBuilder(Op);
677 // Whitelist the common cases.
678 // TODO: Pointer loads
679 // TODO: Wide constant loads
680 // TODO: Only CI+ has 3x loads
681 // TODO: Loads to s16 on gfx9
682 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683 {V2S32, GlobalPtr, 64, GlobalAlign32},
684 {V3S32, GlobalPtr, 96, GlobalAlign32},
685 {S96, GlobalPtr, 96, GlobalAlign32},
686 {V4S32, GlobalPtr, 128, GlobalAlign32},
687 {S128, GlobalPtr, 128, GlobalAlign32},
688 {S64, GlobalPtr, 64, GlobalAlign32},
689 {V2S64, GlobalPtr, 128, GlobalAlign32},
690 {V2S16, GlobalPtr, 32, GlobalAlign32},
691 {S32, GlobalPtr, 8, GlobalAlign8},
692 {S32, GlobalPtr, 16, GlobalAlign16},
694 {S32, LocalPtr, 32, 32},
695 {S64, LocalPtr, 64, 32},
696 {V2S32, LocalPtr, 64, 32},
697 {S32, LocalPtr, 8, 8},
698 {S32, LocalPtr, 16, 16},
699 {V2S16, LocalPtr, 32, 32},
701 {S32, PrivatePtr, 32, 32},
702 {S32, PrivatePtr, 8, 8},
703 {S32, PrivatePtr, 16, 16},
704 {V2S16, PrivatePtr, 32, 32},
706 {S32, FlatPtr, 32, GlobalAlign32},
707 {S32, FlatPtr, 16, GlobalAlign16},
708 {S32, FlatPtr, 8, GlobalAlign8},
709 {V2S16, FlatPtr, 32, GlobalAlign32},
711 {S32, ConstantPtr, 32, GlobalAlign32},
712 {V2S32, ConstantPtr, 64, GlobalAlign32},
713 {V3S32, ConstantPtr, 96, GlobalAlign32},
714 {V4S32, ConstantPtr, 128, GlobalAlign32},
715 {S64, ConstantPtr, 64, GlobalAlign32},
716 {S128, ConstantPtr, 128, GlobalAlign32},
717 {V2S32, ConstantPtr, 32, GlobalAlign32}});
718 Actions
719 .customIf(typeIs(1, Constant32Ptr))
720 .narrowScalarIf(
721 [=](const LegalityQuery &Query) -> bool {
722 return !Query.Types[0].isVector() && needToSplitLoad(Query);
724 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725 const LLT DstTy = Query.Types[0];
726 const LLT PtrTy = Query.Types[1];
728 const unsigned DstSize = DstTy.getSizeInBits();
729 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
731 // Split extloads.
732 if (DstSize > MemSize)
733 return std::make_pair(0, LLT::scalar(MemSize));
735 if (DstSize > 32 && (DstSize % 32 != 0)) {
736 // FIXME: Need a way to specify non-extload of larger size if
737 // suitably aligned.
738 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
741 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742 if (MemSize > MaxSize)
743 return std::make_pair(0, LLT::scalar(MaxSize));
745 unsigned Align = Query.MMODescrs[0].AlignInBits;
746 return std::make_pair(0, LLT::scalar(Align));
748 .fewerElementsIf(
749 [=](const LegalityQuery &Query) -> bool {
750 return Query.Types[0].isVector() && needToSplitLoad(Query);
752 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753 const LLT DstTy = Query.Types[0];
754 const LLT PtrTy = Query.Types[1];
756 LLT EltTy = DstTy.getElementType();
757 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
759 // Split if it's too large for the address space.
760 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761 unsigned NumElts = DstTy.getNumElements();
762 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
764 // FIXME: Refine when odd breakdowns handled
765 // The scalars will need to be re-legalized.
766 if (NumPieces == 1 || NumPieces >= NumElts ||
767 NumElts % NumPieces != 0)
768 return std::make_pair(0, EltTy);
770 return std::make_pair(0,
771 LLT::vector(NumElts / NumPieces, EltTy));
774 // Need to split because of alignment.
775 unsigned Align = Query.MMODescrs[0].AlignInBits;
776 unsigned EltSize = EltTy.getSizeInBits();
777 if (EltSize > Align &&
778 (EltSize / Align < DstTy.getNumElements())) {
779 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
782 // May need relegalization for the scalars.
783 return std::make_pair(0, EltTy);
785 .minScalar(0, S32);
787 if (IsStore)
788 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
790 // TODO: Need a bitcast lower option?
791 Actions
792 .legalIf([=](const LegalityQuery &Query) {
793 const LLT Ty0 = Query.Types[0];
794 unsigned Size = Ty0.getSizeInBits();
795 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796 unsigned Align = Query.MMODescrs[0].AlignInBits;
798 // No extending vector loads.
799 if (Size > MemSize && Ty0.isVector())
800 return false;
802 // FIXME: Widening store from alignment not valid.
803 if (MemSize < Size)
804 MemSize = std::max(MemSize, Align);
806 switch (MemSize) {
807 case 8:
808 case 16:
809 return Size == 32;
810 case 32:
811 case 64:
812 case 128:
813 return true;
814 case 96:
815 return ST.hasDwordx3LoadStores();
816 case 256:
817 case 512:
818 return true;
819 default:
820 return false;
823 .widenScalarToNextPow2(0)
824 // TODO: v3s32->v4s32 with alignment
825 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
828 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
829 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
830 {S32, GlobalPtr, 16, 2 * 8},
831 {S32, LocalPtr, 8, 8},
832 {S32, LocalPtr, 16, 16},
833 {S32, PrivatePtr, 8, 8},
834 {S32, PrivatePtr, 16, 16},
835 {S32, ConstantPtr, 8, 8},
836 {S32, ConstantPtr, 16, 2 * 8}});
837 if (ST.hasFlatAddressSpace()) {
838 ExtLoads.legalForTypesWithMemDesc(
839 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
842 ExtLoads.clampScalar(0, S32, S32)
843 .widenScalarToNextPow2(0)
844 .unsupportedIfMemSizeNotPow2()
845 .lower();
847 auto &Atomics = getActionDefinitionsBuilder(
848 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
849 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
850 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
851 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
852 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
853 {S64, GlobalPtr}, {S64, LocalPtr}});
854 if (ST.hasFlatAddressSpace()) {
855 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
858 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
859 .legalFor({{S32, LocalPtr}});
861 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
862 .lower();
864 // TODO: Pointer types, any 32-bit or 64-bit vector
865 getActionDefinitionsBuilder(G_SELECT)
866 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
867 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
868 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
869 .clampScalar(0, S16, S64)
870 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
871 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
872 .scalarize(1)
873 .clampMaxNumElements(0, S32, 2)
874 .clampMaxNumElements(0, LocalPtr, 2)
875 .clampMaxNumElements(0, PrivatePtr, 2)
876 .scalarize(0)
877 .widenScalarToNextPow2(0)
878 .legalIf(all(isPointer(0), typeIs(1, S1)));
880 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
881 // be more flexible with the shift amount type.
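// (4, 5 and 6 bits correspond to 16-, 32- and 64-bit shifts respectively.)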
882 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
883 .legalFor({{S32, S32}, {S64, S32}});
884 if (ST.has16BitInsts()) {
885 if (ST.hasVOP3PInsts()) {
886 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
887 .clampMaxNumElements(0, S16, 2);
888 } else
889 Shifts.legalFor({{S16, S32}, {S16, S16}});
891 Shifts.clampScalar(1, S16, S32);
892 Shifts.clampScalar(0, S16, S64);
893 Shifts.widenScalarToNextPow2(0, 16);
894 } else {
895 // Make sure we legalize the shift amount type first, as the general
896 // expansion for the shifted type will produce much worse code if it hasn't
897 // been truncated already.
898 Shifts.clampScalar(1, S32, S32);
899 Shifts.clampScalar(0, S32, S64);
900 Shifts.widenScalarToNextPow2(0, 32);
902 Shifts.scalarize(0);
904 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
905 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
906 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
907 unsigned IdxTypeIdx = 2;
909 getActionDefinitionsBuilder(Op)
910 .customIf([=](const LegalityQuery &Query) {
911 const LLT EltTy = Query.Types[EltTypeIdx];
912 const LLT VecTy = Query.Types[VecTypeIdx];
913 const LLT IdxTy = Query.Types[IdxTypeIdx];
914 return (EltTy.getSizeInBits() == 16 ||
915 EltTy.getSizeInBits() % 32 == 0) &&
916 VecTy.getSizeInBits() % 32 == 0 &&
917 VecTy.getSizeInBits() <= 1024 &&
918 IdxTy.getSizeInBits() == 32;
920 .clampScalar(EltTypeIdx, S32, S64)
921 .clampScalar(VecTypeIdx, S32, S64)
922 .clampScalar(IdxTypeIdx, S32, S32);
925 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
926 .unsupportedIf([=](const LegalityQuery &Query) {
927 const LLT &EltTy = Query.Types[1].getElementType();
928 return Query.Types[0] != EltTy;
931 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
932 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
933 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
935 // FIXME: Doesn't handle extract of illegal sizes.
936 getActionDefinitionsBuilder(Op)
937 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
938 // FIXME: Multiples of 16 should not be legal.
939 .legalIf([=](const LegalityQuery &Query) {
940 const LLT BigTy = Query.Types[BigTyIdx];
941 const LLT LitTy = Query.Types[LitTyIdx];
942 return (BigTy.getSizeInBits() % 32 == 0) &&
943 (LitTy.getSizeInBits() % 16 == 0);
945 .widenScalarIf(
946 [=](const LegalityQuery &Query) {
947 const LLT BigTy = Query.Types[BigTyIdx];
948 return (BigTy.getScalarSizeInBits() < 16);
950 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
951 .widenScalarIf(
952 [=](const LegalityQuery &Query) {
953 const LLT LitTy = Query.Types[LitTyIdx];
954 return (LitTy.getScalarSizeInBits() < 16);
956 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
957 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
958 .widenScalarToNextPow2(BigTyIdx, 32);
962 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
963 .legalForCartesianProduct(AllS32Vectors, {S32})
964 .legalForCartesianProduct(AllS64Vectors, {S64})
965 .clampNumElements(0, V16S32, V32S32)
966 .clampNumElements(0, V2S64, V16S64)
967 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
969 if (ST.hasScalarPackInsts())
970 BuildVector.legalFor({V2S16, S32});
972 BuildVector
973 .minScalarSameAs(1, 0)
974 .legalIf(isRegisterType(0))
975 .minScalarOrElt(0, S32);
977 if (ST.hasScalarPackInsts()) {
978 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
979 .legalFor({V2S16, S32})
980 .lower();
981 } else {
982 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
983 .lower();
986 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
987 .legalIf(isRegisterType(0));
989 // TODO: Don't fully scalarize v2s16 pieces
990 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
992 // Merge/Unmerge
993 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
994 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
995 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
997 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
998 const LLT &Ty = Query.Types[TypeIdx];
999 if (Ty.isVector()) {
1000 const LLT &EltTy = Ty.getElementType();
1001 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1002 return true;
1003 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1004 return true;
1006 return false;
1009 auto &Builder = getActionDefinitionsBuilder(Op)
1010 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1011 // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1012 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1013 // valid.
1014 .clampScalar(LitTyIdx, S16, S256)
1015 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1016 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1017 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1018 elementTypeIs(1, S16)),
1019 changeTo(1, V2S16))
1020 // Break up vectors with weird elements into scalars
1021 .fewerElementsIf(
1022 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1023 scalarize(0))
1024 .fewerElementsIf(
1025 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1026 scalarize(1))
1027 .clampScalar(BigTyIdx, S32, S1024)
1028 .lowerFor({{S16, V2S16}});
1030 if (Op == G_MERGE_VALUES) {
1031 Builder.widenScalarIf(
1032 // TODO: Use 16-bit shifts if legal for 8-bit values?
1033 [=](const LegalityQuery &Query) {
1034 const LLT Ty = Query.Types[LitTyIdx];
1035 return Ty.getSizeInBits() < 32;
1037 changeTo(LitTyIdx, S32));
1040 Builder.widenScalarIf(
1041 [=](const LegalityQuery &Query) {
1042 const LLT Ty = Query.Types[BigTyIdx];
1043 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1044 Ty.getSizeInBits() % 16 != 0;
1046 [=](const LegalityQuery &Query) {
1047 // Pick the next power of 2, or a multiple of 64 once past 128 bits,
1048 // whichever is smaller.
1049 const LLT &Ty = Query.Types[BigTyIdx];
1050 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1051 if (NewSizeInBits >= 256) {
1052 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1053 if (RoundedTo < NewSizeInBits)
1054 NewSizeInBits = RoundedTo;
1056 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1058 .legalIf([=](const LegalityQuery &Query) {
1059 const LLT &BigTy = Query.Types[BigTyIdx];
1060 const LLT &LitTy = Query.Types[LitTyIdx];
1062 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1063 return false;
1064 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1065 return false;
1067 return BigTy.getSizeInBits() % 16 == 0 &&
1068 LitTy.getSizeInBits() % 16 == 0 &&
1069 BigTy.getSizeInBits() <= 1024;
1071 // Any vectors left are the wrong size. Scalarize them.
1072 .scalarize(0)
1073 .scalarize(1);
1076 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1078 computeTables();
1079 verify(*ST.getInstrInfo());
1082 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1083 MachineRegisterInfo &MRI,
1084 MachineIRBuilder &B,
1085 GISelChangeObserver &Observer) const {
1086 switch (MI.getOpcode()) {
1087 case TargetOpcode::G_ADDRSPACE_CAST:
1088 return legalizeAddrSpaceCast(MI, MRI, B);
1089 case TargetOpcode::G_FRINT:
1090 return legalizeFrint(MI, MRI, B);
1091 case TargetOpcode::G_FCEIL:
1092 return legalizeFceil(MI, MRI, B);
1093 case TargetOpcode::G_INTRINSIC_TRUNC:
1094 return legalizeIntrinsicTrunc(MI, MRI, B);
1095 case TargetOpcode::G_SITOFP:
1096 return legalizeITOFP(MI, MRI, B, true);
1097 case TargetOpcode::G_UITOFP:
1098 return legalizeITOFP(MI, MRI, B, false);
1099 case TargetOpcode::G_FMINNUM:
1100 case TargetOpcode::G_FMAXNUM:
1101 case TargetOpcode::G_FMINNUM_IEEE:
1102 case TargetOpcode::G_FMAXNUM_IEEE:
1103 return legalizeMinNumMaxNum(MI, MRI, B);
1104 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1105 return legalizeExtractVectorElt(MI, MRI, B);
1106 case TargetOpcode::G_INSERT_VECTOR_ELT:
1107 return legalizeInsertVectorElt(MI, MRI, B);
1108 case TargetOpcode::G_FSIN:
1109 case TargetOpcode::G_FCOS:
1110 return legalizeSinCos(MI, MRI, B);
1111 case TargetOpcode::G_GLOBAL_VALUE:
1112 return legalizeGlobalValue(MI, MRI, B);
1113 case TargetOpcode::G_LOAD:
1114 return legalizeLoad(MI, MRI, B, Observer);
1115 case TargetOpcode::G_FMAD:
1116 return legalizeFMad(MI, MRI, B);
1117 case TargetOpcode::G_FDIV:
1118 return legalizeFDIV(MI, MRI, B);
1119 default:
1120 return false;
1123 llvm_unreachable("expected switch to return");
1126 Register AMDGPULegalizerInfo::getSegmentAperture(
1127 unsigned AS,
1128 MachineRegisterInfo &MRI,
1129 MachineIRBuilder &B) const {
1130 MachineFunction &MF = B.getMF();
1131 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1132 const LLT S32 = LLT::scalar(32);
1134 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1136 if (ST.hasApertureRegs()) {
1137 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1138 // getreg.
1139 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1140 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1141 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1142 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1143 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1144 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
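// Pack the hardware register index, bit offset and width-minus-one fields
// into the immediate operand of s_getreg_b32.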
1145 unsigned Encoding =
1146 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1147 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1148 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1150 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1151 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1153 B.buildInstr(AMDGPU::S_GETREG_B32)
1154 .addDef(GetReg)
1155 .addImm(Encoding);
1156 MRI.setType(GetReg, S32);
1158 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1159 B.buildInstr(TargetOpcode::G_SHL)
1160 .addDef(ApertureReg)
1161 .addUse(GetReg)
1162 .addUse(ShiftAmt.getReg(0));
1164 return ApertureReg;
1167 Register QueuePtr = MRI.createGenericVirtualRegister(
1168 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1170 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1171 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1172 return Register();
1174 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1175 // private_segment_aperture_base_hi.
1176 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1178 // FIXME: Don't use undef
1179 Value *V = UndefValue::get(PointerType::get(
1180 Type::getInt8Ty(MF.getFunction().getContext()),
1181 AMDGPUAS::CONSTANT_ADDRESS));
1183 MachinePointerInfo PtrInfo(V, StructOffset);
1184 MachineMemOperand *MMO = MF.getMachineMemOperand(
1185 PtrInfo,
1186 MachineMemOperand::MOLoad |
1187 MachineMemOperand::MODereferenceable |
1188 MachineMemOperand::MOInvariant,
1190 MinAlign(64, StructOffset));
1192 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1193 Register LoadAddr;
1195 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1196 B.buildLoad(LoadResult, LoadAddr, *MMO);
1197 return LoadResult;
1200 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1201 MachineInstr &MI, MachineRegisterInfo &MRI,
1202 MachineIRBuilder &B) const {
1203 MachineFunction &MF = B.getMF();
1205 B.setInstr(MI);
1207 const LLT S32 = LLT::scalar(32);
1208 Register Dst = MI.getOperand(0).getReg();
1209 Register Src = MI.getOperand(1).getReg();
1211 LLT DstTy = MRI.getType(Dst);
1212 LLT SrcTy = MRI.getType(Src);
1213 unsigned DestAS = DstTy.getAddressSpace();
1214 unsigned SrcAS = SrcTy.getAddressSpace();
1216 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1217 // vector element.
1218 assert(!DstTy.isVector());
1220 const AMDGPUTargetMachine &TM
1221 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1223 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1224 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1225 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1226 return true;
1229 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1230 // Truncate.
1231 B.buildExtract(Dst, Src, 0);
1232 MI.eraseFromParent();
1233 return true;
1236 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1237 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1238 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1240 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1241 // another. Merge operands are required to be the same type, but creating an
1242 // extra ptrtoint would be kind of pointless.
1243 auto HighAddr = B.buildConstant(
1244 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1245 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1246 MI.eraseFromParent();
1247 return true;
1250 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1251 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1252 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1253 unsigned NullVal = TM.getNullPointerValue(DestAS);
1255 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1256 auto FlatNull = B.buildConstant(SrcTy, 0);
1258 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1260 // Extract low 32-bits of the pointer.
1261 B.buildExtract(PtrLo32, Src, 0);
1263 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1264 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1265 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1267 MI.eraseFromParent();
1268 return true;
1271 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1272 return false;
1274 if (!ST.hasFlatAddressSpace())
1275 return false;
1277 auto SegmentNull =
1278 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1279 auto FlatNull =
1280 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1282 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1283 if (!ApertureReg.isValid())
1284 return false;
1286 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1287 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1289 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1291 // Coerce the type of the low half of the result so we can use merge_values.
1292 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1293 B.buildInstr(TargetOpcode::G_PTRTOINT)
1294 .addDef(SrcAsInt)
1295 .addUse(Src);
1297 // TODO: Should we allow mismatched types but matching sizes in merges to
1298 // avoid the ptrtoint?
1299 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1300 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1302 MI.eraseFromParent();
1303 return true;
1306 bool AMDGPULegalizerInfo::legalizeFrint(
1307 MachineInstr &MI, MachineRegisterInfo &MRI,
1308 MachineIRBuilder &B) const {
1309 B.setInstr(MI);
1311 Register Src = MI.getOperand(1).getReg();
1312 LLT Ty = MRI.getType(Src);
1313 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1315 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1316 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1318 auto C1 = B.buildFConstant(Ty, C1Val);
1319 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1321 // TODO: Should this propagate fast-math-flags?
1322 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1323 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1325 auto C2 = B.buildFConstant(Ty, C2Val);
1326 auto Fabs = B.buildFAbs(Ty, Src);
1328 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1329 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1330 return true;
1333 bool AMDGPULegalizerInfo::legalizeFceil(
1334 MachineInstr &MI, MachineRegisterInfo &MRI,
1335 MachineIRBuilder &B) const {
1336 B.setInstr(MI);
1338 const LLT S1 = LLT::scalar(1);
1339 const LLT S64 = LLT::scalar(64);
1341 Register Src = MI.getOperand(1).getReg();
1342 assert(MRI.getType(Src) == S64);
1344 // result = trunc(src)
1345 // if (src > 0.0 && src != result)
1346 // result += 1.0
1348 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1350 const auto Zero = B.buildFConstant(S64, 0.0);
1351 const auto One = B.buildFConstant(S64, 1.0);
1352 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1353 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1354 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1355 auto Add = B.buildSelect(S64, And, One, Zero);
1357 // TODO: Should this propagate fast-math-flags?
1358 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1359 return true;
1362 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1363 MachineIRBuilder &B) {
1364 const unsigned FractBits = 52;
1365 const unsigned ExpBits = 11;
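// IEEE double layout: 1 sign bit, 11 exponent bits, 52 fraction bits. The
// exponent field occupies bits [30:20] of the high 32-bit word, which is
// what the ubfe below extracts before subtracting the bias of 1023.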
1366 LLT S32 = LLT::scalar(32);
1368 auto Const0 = B.buildConstant(S32, FractBits - 32);
1369 auto Const1 = B.buildConstant(S32, ExpBits);
1371 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1372 .addUse(Hi)
1373 .addUse(Const0.getReg(0))
1374 .addUse(Const1.getReg(0));
1375 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1378 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1379 MachineInstr &MI, MachineRegisterInfo &MRI,
1380 MachineIRBuilder &B) const {
1381 B.setInstr(MI);
1383 const LLT S1 = LLT::scalar(1);
1384 const LLT S32 = LLT::scalar(32);
1385 const LLT S64 = LLT::scalar(64);
1387 Register Src = MI.getOperand(1).getReg();
1388 assert(MRI.getType(Src) == S64);
1390 // TODO: Should this use extract since the low half is unused?
1391 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1392 Register Hi = Unmerge.getReg(1);
1394 // Extract the upper half, since this is where we will find the sign and
1395 // exponent.
1396 auto Exp = extractF64Exponent(Hi, B);
1398 const unsigned FractBits = 52;
1400 // Extract the sign bit.
1401 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1402 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1404 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1406 const auto Zero32 = B.buildConstant(S32, 0);
1408 // Extend back to 64-bits.
1409 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1411 auto Shr = B.buildAShr(S64, FractMask, Exp);
1412 auto Not = B.buildNot(S64, Shr);
1413 auto Tmp0 = B.buildAnd(S64, Src, Not);
1414 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1416 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1417 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1419 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1420 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1421 return true;
1424 bool AMDGPULegalizerInfo::legalizeITOFP(
1425 MachineInstr &MI, MachineRegisterInfo &MRI,
1426 MachineIRBuilder &B, bool Signed) const {
1427 B.setInstr(MI);
1429 Register Dst = MI.getOperand(0).getReg();
1430 Register Src = MI.getOperand(1).getReg();
1432 const LLT S64 = LLT::scalar(64);
1433 const LLT S32 = LLT::scalar(32);
1435 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1437 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1439 auto CvtHi = Signed ?
1440 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1441 B.buildUITOFP(S64, Unmerge.getReg(1));
1443 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1445 auto ThirtyTwo = B.buildConstant(S32, 32);
1446 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1447 .addUse(CvtHi.getReg(0))
1448 .addUse(ThirtyTwo.getReg(0));
1450 // TODO: Should this propagate fast-math-flags?
1451 B.buildFAdd(Dst, LdExp, CvtLo);
1452 MI.eraseFromParent();
1453 return true;
1456 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1457 MachineInstr &MI, MachineRegisterInfo &MRI,
1458 MachineIRBuilder &B) const {
1459 MachineFunction &MF = B.getMF();
1460 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1462 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1463 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1465 // With ieee_mode disabled, the instructions have the correct behavior
1466 // already for G_FMINNUM/G_FMAXNUM
1467 if (!MFI->getMode().IEEE)
1468 return !IsIEEEOp;
1470 if (IsIEEEOp)
1471 return true;
1473 MachineIRBuilder HelperBuilder(MI);
1474 GISelObserverWrapper DummyObserver;
1475 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1476 HelperBuilder.setInstr(MI);
1477 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1480 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1481 MachineInstr &MI, MachineRegisterInfo &MRI,
1482 MachineIRBuilder &B) const {
1483 // TODO: Should move some of this into LegalizerHelper.
1485 // TODO: Promote dynamic indexing of s16 to s32
1486 // TODO: Dynamic s64 indexing is only legal for SGPR.
1487 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1488 if (!IdxVal) // Dynamic case will be selected to register indexing.
1489 return true;
1491 Register Dst = MI.getOperand(0).getReg();
1492 Register Vec = MI.getOperand(1).getReg();
1494 LLT VecTy = MRI.getType(Vec);
1495 LLT EltTy = VecTy.getElementType();
1496 assert(EltTy == MRI.getType(Dst));
1498 B.setInstr(MI);
1500 if (IdxVal.getValue() < VecTy.getNumElements())
1501 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1502 else
1503 B.buildUndef(Dst);
1505 MI.eraseFromParent();
1506 return true;
1509 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1510 MachineInstr &MI, MachineRegisterInfo &MRI,
1511 MachineIRBuilder &B) const {
1512 // TODO: Should move some of this into LegalizerHelper.
1514 // TODO: Promote dynamic indexing of s16 to s32
1515 // TODO: Dynamic s64 indexing is only legal for SGPR.
1516 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1517 if (!IdxVal) // Dynamic case will be selected to register indexing.
1518 return true;
1520 Register Dst = MI.getOperand(0).getReg();
1521 Register Vec = MI.getOperand(1).getReg();
1522 Register Ins = MI.getOperand(2).getReg();
1524 LLT VecTy = MRI.getType(Vec);
1525 LLT EltTy = VecTy.getElementType();
1526 assert(EltTy == MRI.getType(Ins));
1528 B.setInstr(MI);
1530 if (IdxVal.getValue() < VecTy.getNumElements())
1531 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1532 else
1533 B.buildUndef(Dst);
1535 MI.eraseFromParent();
1536 return true;
1539 bool AMDGPULegalizerInfo::legalizeSinCos(
1540 MachineInstr &MI, MachineRegisterInfo &MRI,
1541 MachineIRBuilder &B) const {
1542 B.setInstr(MI);
1544 Register DstReg = MI.getOperand(0).getReg();
1545 Register SrcReg = MI.getOperand(1).getReg();
1546 LLT Ty = MRI.getType(DstReg);
1547 unsigned Flags = MI.getFlags();
1549 Register TrigVal;
1550 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1551 if (ST.hasTrigReducedRange()) {
1552 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1553 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1554 .addUse(MulVal.getReg(0))
1555 .setMIFlags(Flags).getReg(0);
1556 } else
1557 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1559 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1560 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1561 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1562 .addUse(TrigVal)
1563 .setMIFlags(Flags);
1564 MI.eraseFromParent();
1565 return true;
1568 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1569 Register DstReg, LLT PtrTy,
1570 MachineIRBuilder &B, const GlobalValue *GV,
1571 unsigned Offset, unsigned GAFlags) const {
1572 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1573 // to the following code sequence:
1575 // For constant address space:
1576 // s_getpc_b64 s[0:1]
1577 // s_add_u32 s0, s0, $symbol
1578 // s_addc_u32 s1, s1, 0
1580 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1581 // a fixup or relocation is emitted to replace $symbol with a literal
1582 // constant, which is a pc-relative offset from the encoding of the $symbol
1583 // operand to the global variable.
1585 // For global address space:
1586 // s_getpc_b64 s[0:1]
1587 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1588 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1590 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591 // fixups or relocations are emitted to replace $symbol@*@lo and
1592 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1593 // which is a 64-bit pc-relative offset from the encoding of the $symbol
1594 // operand to the global variable.
1596 // What we want here is an offset from the value returned by s_getpc
1597 // (which is the address of the s_add_u32 instruction) to the global
1598 // variable, but since the encoding of $symbol starts 4 bytes after the start
1599 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1600 // small. This requires us to add 4 to the global variable offset in order to
1601 // compute the correct address.
1603 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1605 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1606 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1608 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1609 .addDef(PCReg);
1611 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1612 if (GAFlags == SIInstrInfo::MO_NONE)
1613 MIB.addImm(0);
1614 else
1615 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1617 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1619 if (PtrTy.getSizeInBits() == 32)
1620 B.buildExtract(DstReg, PCReg, 0);
1621 return true;
1624 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1625 MachineInstr &MI, MachineRegisterInfo &MRI,
1626 MachineIRBuilder &B) const {
1627 Register DstReg = MI.getOperand(0).getReg();
1628 LLT Ty = MRI.getType(DstReg);
1629 unsigned AS = Ty.getAddressSpace();
1631 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1632 MachineFunction &MF = B.getMF();
1633 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1634 B.setInstr(MI);
1636 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1637 if (!MFI->isEntryFunction()) {
1638 const Function &Fn = MF.getFunction();
1639 DiagnosticInfoUnsupported BadLDSDecl(
1640 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1641 Fn.getContext().diagnose(BadLDSDecl);
1644 // TODO: We could emit code to handle the initialization somewhere.
1645 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1646 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1647 MI.eraseFromParent();
1648 return true;
1651 const Function &Fn = MF.getFunction();
1652 DiagnosticInfoUnsupported BadInit(
1653 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1654 Fn.getContext().diagnose(BadInit);
1655 return true;
1658 const SITargetLowering *TLI = ST.getTargetLowering();
1660 if (TLI->shouldEmitFixup(GV)) {
1661 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1662 MI.eraseFromParent();
1663 return true;
1666 if (TLI->shouldEmitPCReloc(GV)) {
1667 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1668 MI.eraseFromParent();
1669 return true;
1672 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1673 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1675 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1676 MachinePointerInfo::getGOT(MF),
1677 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1678 MachineMemOperand::MOInvariant,
1679 8 /*Size*/, 8 /*Align*/);
1681 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1683 if (Ty.getSizeInBits() == 32) {
1684 // Truncate if this is a 32-bit constant address.
1685 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1686 B.buildExtract(DstReg, Load, 0);
1687 } else
1688 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1690 MI.eraseFromParent();
1691 return true;
1694 bool AMDGPULegalizerInfo::legalizeLoad(
1695 MachineInstr &MI, MachineRegisterInfo &MRI,
1696 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1697 B.setInstr(MI);
1698 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1699 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1700 Observer.changingInstr(MI);
1701 MI.getOperand(1).setReg(Cast.getReg(0));
1702 Observer.changedInstr(MI);
1703 return true;
1706 bool AMDGPULegalizerInfo::legalizeFMad(
1707 MachineInstr &MI, MachineRegisterInfo &MRI,
1708 MachineIRBuilder &B) const {
1709 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1710 assert(Ty.isScalar());
1712 // TODO: Always legal with future ftz flag.
1713 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1714 return true;
1715 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1716 return true;
1718 MachineFunction &MF = B.getMF();
1720 MachineIRBuilder HelperBuilder(MI);
1721 GISelObserverWrapper DummyObserver;
1722 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1723 HelperBuilder.setMBB(*MI.getParent());
1724 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1725 }
1727 // Return the branch instruction that uses the condition, or null if the use is invalid.
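// A rough sketch of the MIR this expects (register names are illustrative):
//   %cond:_(s1), ... = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), ...
//   G_BRCOND %cond(s1), %bb.target
// where the G_BRCOND is the sole non-debug user of %cond and is in the same
// block as the intrinsic.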
1728 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1729 MachineRegisterInfo &MRI) {
1730 Register CondDef = MI.getOperand(0).getReg();
1731 if (!MRI.hasOneNonDBGUse(CondDef))
1732 return nullptr;
1734 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1735 return UseMI.getParent() == MI.getParent() &&
1736 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1737 }
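// Return the virtual register already associated with the physical live-in
// \p Reg, or create one of type \p Ty and record the live-in if none exists.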
1739 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1740 Register Reg, LLT Ty) const {
1741 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1742 if (LiveIn)
1743 return LiveIn;
1745 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1746 MRI.addLiveIn(Reg, NewReg);
1747 return NewReg;
1748 }
1750 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1751 const ArgDescriptor *Arg) const {
1752 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1753 return false; // TODO: Handle these
1755 assert(Arg->getRegister().isPhysical());
1757 MachineRegisterInfo &MRI = *B.getMRI();
1759 LLT Ty = MRI.getType(DstReg);
1760 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1762 if (Arg->isMasked()) {
1763 // TODO: Should we try to emit this once in the entry block?
1764 const LLT S32 = LLT::scalar(32);
1765 const unsigned Mask = Arg->getMask();
1766 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
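// Masked arguments share one physical register (e.g. packed workitem IDs);
// the field is recovered as (LiveIn >> Shift) & (Mask >> Shift) below.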
1768 Register AndMaskSrc = LiveIn;
1770 if (Shift != 0) {
1771 auto ShiftAmt = B.buildConstant(S32, Shift);
1772 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1773 }
1775 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1776 } else
1777 B.buildCopy(DstReg, LiveIn);
1779 // Insert the argument copy if it doesn't already exist.
1780 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1781 if (!MRI.getVRegDef(LiveIn)) {
1782 // FIXME: Should have scoped insert pt
1783 MachineBasicBlock &OrigInsBB = B.getMBB();
1784 auto OrigInsPt = B.getInsertPt();
1786 MachineBasicBlock &EntryMBB = B.getMF().front();
1787 EntryMBB.addLiveIn(Arg->getRegister());
1788 B.setInsertPt(EntryMBB, EntryMBB.begin());
1789 B.buildCopy(LiveIn, Arg->getRegister());
1791 B.setInsertPt(OrigInsBB, OrigInsPt);
1792 }
1794 return true;
1795 }
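// Legalize intrinsics that simply read a preloaded argument register
// (workitem/workgroup IDs, dispatch and queue pointers, etc.) by copying the
// corresponding live-in value into the result.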
1797 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1798 MachineInstr &MI,
1799 MachineRegisterInfo &MRI,
1800 MachineIRBuilder &B,
1801 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1802 B.setInstr(MI);
1804 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1806 const ArgDescriptor *Arg;
1807 const TargetRegisterClass *RC;
1808 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1809 if (!Arg) {
1810 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1811 return false;
1812 }
1814 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1815 MI.eraseFromParent();
1816 return true;
1817 }
1819 return false;
1820 }
1822 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1823 MachineRegisterInfo &MRI,
1824 MachineIRBuilder &B) const {
1825 B.setInstr(MI);
1827 if (legalizeFastUnsafeFDIV(MI, MRI, B))
1828 return true;
1830 return false;
1831 }
1833 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1834 MachineRegisterInfo &MRI,
1835 MachineIRBuilder &B) const {
1836 Register Res = MI.getOperand(0).getReg();
1837 Register LHS = MI.getOperand(1).getReg();
1838 Register RHS = MI.getOperand(2).getReg();
1840 uint16_t Flags = MI.getFlags();
1842 LLT ResTy = MRI.getType(Res);
1843 LLT S32 = LLT::scalar(32);
1844 LLT S64 = LLT::scalar(64);
1846 const MachineFunction &MF = B.getMF();
1847 bool Unsafe =
1848 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1850 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1851 return false;
1853 if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1854 return false;
1856 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1857 // 1 / x -> RCP(x)
1858 if (CLHS->isExactlyValue(1.0)) {
1859 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1860 .addUse(RHS)
1861 .setMIFlags(Flags);
1863 MI.eraseFromParent();
1864 return true;
1865 }
1867 // -1 / x -> RCP( FNEG(x) )
1868 if (CLHS->isExactlyValue(-1.0)) {
1869 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1870 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1871 .addUse(FNeg.getReg(0))
1872 .setMIFlags(Flags);
1874 MI.eraseFromParent();
1875 return true;
1876 }
1877 }
1879 // x / y -> x * (1.0 / y)
1880 if (Unsafe) {
1881 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1882 .addUse(RHS)
1883 .setMIFlags(Flags);
1884 B.buildFMul(Res, LHS, RCP, Flags);
1886 MI.eraseFromParent();
1887 return true;
1888 }
1890 return false;
1891 }
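// Roughly, the sequence built below computes
//   s = |y| > 0x1.0p+96 ? 0x1.0p-32 : 1.0
//   x / y ~= (x * rcp(y * s)) * s
// i.e. large denominators are pre-scaled so rcp stays in range and the result
// is scaled back afterwards (a sketch of the intent, not a precision claim).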
1893 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
1894 MachineRegisterInfo &MRI,
1895 MachineIRBuilder &B) const {
1896 B.setInstr(MI);
1897 Register Res = MI.getOperand(0).getReg();
1898 Register LHS = MI.getOperand(2).getReg();
1899 Register RHS = MI.getOperand(3).getReg();
1900 uint16_t Flags = MI.getFlags();
1902 LLT S32 = LLT::scalar(32);
1903 LLT S1 = LLT::scalar(1);
1905 auto Abs = B.buildFAbs(S32, RHS, Flags);
1906 const APFloat C0Val(1.0f);
1908 auto C0 = B.buildConstant(S32, 0x6f800000);
1909 auto C1 = B.buildConstant(S32, 0x2f800000);
1910 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1912 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1913 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1915 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1917 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1918 .addUse(Mul0.getReg(0))
1919 .setMIFlags(Flags);
1921 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1923 B.buildFMul(Res, Sel, Mul1, Flags);
1925 MI.eraseFromParent();
1926 return true;
1927 }
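// For entry functions the implicit argument pointer is formed by offsetting
// the kernarg segment pointer past the explicit kernel arguments; for other
// functions it is expected to arrive as a preloaded argument register.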
1929 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1930 MachineRegisterInfo &MRI,
1931 MachineIRBuilder &B) const {
1932 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1933 if (!MFI->isEntryFunction()) {
1934 return legalizePreloadedArgIntrin(MI, MRI, B,
1935 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1936 }
1938 B.setInstr(MI);
1940 uint64_t Offset =
1941 ST.getTargetLowering()->getImplicitParameterOffset(
1942 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1943 Register DstReg = MI.getOperand(0).getReg();
1944 LLT DstTy = MRI.getType(DstReg);
1945 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1947 const ArgDescriptor *Arg;
1948 const TargetRegisterClass *RC;
1949 std::tie(Arg, RC)
1950 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1951 if (!Arg)
1952 return false;
1954 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1955 if (!loadInputValue(KernargPtrReg, B, Arg))
1956 return false;
1958 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1959 MI.eraseFromParent();
1960 return true;
1961 }
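// amdgcn.is.shared / amdgcn.is.private: a flat pointer targets LDS or private
// memory exactly when its high 32 bits match the corresponding aperture base,
// so compare the extracted high half against the segment aperture register.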
1963 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1964 MachineRegisterInfo &MRI,
1965 MachineIRBuilder &B,
1966 unsigned AddrSpace) const {
1967 B.setInstr(MI);
1968 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1969 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1970 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1971 MI.eraseFromParent();
1972 return true;
1973 }
1975 /// Handle the register layout difference for f16 image data on subtargets with unpacked D16 VMEM instructions.
1976 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1977 MachineRegisterInfo &MRI,
1978 Register Reg) const {
1979 if (!ST.hasUnpackedD16VMem())
1980 return Reg;
1982 const LLT S16 = LLT::scalar(16);
1983 const LLT S32 = LLT::scalar(32);
1984 LLT StoreVT = MRI.getType(Reg);
1985 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
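// e.g. a <4 x s16> data operand becomes a <4 x s32> build_vector of
// any-extended elements when D16 values are stored unpacked.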
1987 auto Unmerge = B.buildUnmerge(S16, Reg);
1989 SmallVector<Register, 4> WideRegs;
1990 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1991 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1993 int NumElts = StoreVT.getNumElements();
1995 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1996 }
1998 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1999 MachineRegisterInfo &MRI,
2000 MachineIRBuilder &B,
2001 bool IsFormat) const {
2002 // TODO: Reject the f16 format on targets where it is unsupported.
2003 Register VData = MI.getOperand(1).getReg();
2004 LLT Ty = MRI.getType(VData);
2006 B.setInstr(MI);
2008 const LLT S32 = LLT::scalar(32);
2009 const LLT S16 = LLT::scalar(16);
2011 // Fix up illegal register types for i8 and i16 stores.
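// The data operand is passed to the buffer instruction in a full 32-bit VGPR,
// so narrow scalars are any-extended; the high bits are assumed to be ignored
// by the byte/short store variants.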
2012 if (Ty == LLT::scalar(8) || Ty == S16) {
2013 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2014 MI.getOperand(1).setReg(AnyExt);
2015 return true;
2016 }
2018 if (Ty.isVector()) {
2019 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2020 if (IsFormat)
2021 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2022 return true;
2023 }
2025 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2026 }
2028 return Ty == S32;
2029 }
2031 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2032 MachineRegisterInfo &MRI,
2033 MachineIRBuilder &B) const {
2034 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
2035 switch (MI.getIntrinsicID()) {
2036 case Intrinsic::amdgcn_if: {
2037 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2038 const SIRegisterInfo *TRI
2039 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2041 B.setInstr(*BrCond);
2042 Register Def = MI.getOperand(1).getReg();
2043 Register Use = MI.getOperand(3).getReg();
2044 B.buildInstr(AMDGPU::SI_IF)
2045 .addDef(Def)
2046 .addUse(Use)
2047 .addMBB(BrCond->getOperand(1).getMBB());
2049 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2050 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2051 MI.eraseFromParent();
2052 BrCond->eraseFromParent();
2053 return true;
2054 }
2056 return false;
2057 }
2058 case Intrinsic::amdgcn_loop: {
2059 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2060 const SIRegisterInfo *TRI
2061 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2063 B.setInstr(*BrCond);
2064 Register Reg = MI.getOperand(2).getReg();
2065 B.buildInstr(AMDGPU::SI_LOOP)
2066 .addUse(Reg)
2067 .addMBB(BrCond->getOperand(1).getMBB());
2068 MI.eraseFromParent();
2069 BrCond->eraseFromParent();
2070 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2071 return true;
2072 }
2074 return false;
2075 }
2076 case Intrinsic::amdgcn_kernarg_segment_ptr:
2077 return legalizePreloadedArgIntrin(
2078 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2079 case Intrinsic::amdgcn_implicitarg_ptr:
2080 return legalizeImplicitArgPtr(MI, MRI, B);
2081 case Intrinsic::amdgcn_workitem_id_x:
2082 return legalizePreloadedArgIntrin(MI, MRI, B,
2083 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2084 case Intrinsic::amdgcn_workitem_id_y:
2085 return legalizePreloadedArgIntrin(MI, MRI, B,
2086 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2087 case Intrinsic::amdgcn_workitem_id_z:
2088 return legalizePreloadedArgIntrin(MI, MRI, B,
2089 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2090 case Intrinsic::amdgcn_workgroup_id_x:
2091 return legalizePreloadedArgIntrin(MI, MRI, B,
2092 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2093 case Intrinsic::amdgcn_workgroup_id_y:
2094 return legalizePreloadedArgIntrin(MI, MRI, B,
2095 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2096 case Intrinsic::amdgcn_workgroup_id_z:
2097 return legalizePreloadedArgIntrin(MI, MRI, B,
2098 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2099 case Intrinsic::amdgcn_dispatch_ptr:
2100 return legalizePreloadedArgIntrin(MI, MRI, B,
2101 AMDGPUFunctionArgInfo::DISPATCH_PTR);
2102 case Intrinsic::amdgcn_queue_ptr:
2103 return legalizePreloadedArgIntrin(MI, MRI, B,
2104 AMDGPUFunctionArgInfo::QUEUE_PTR);
2105 case Intrinsic::amdgcn_implicit_buffer_ptr:
2106 return legalizePreloadedArgIntrin(
2107 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2108 case Intrinsic::amdgcn_dispatch_id:
2109 return legalizePreloadedArgIntrin(MI, MRI, B,
2110 AMDGPUFunctionArgInfo::DISPATCH_ID);
2111 case Intrinsic::amdgcn_fdiv_fast:
2112 return legalizeFDIVFastIntrin(MI, MRI, B);
2113 case Intrinsic::amdgcn_is_shared:
2114 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2115 case Intrinsic::amdgcn_is_private:
2116 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2117 case Intrinsic::amdgcn_wavefrontsize: {
2118 B.setInstr(MI);
2119 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2120 MI.eraseFromParent();
2121 return true;
2122 }
2123 case Intrinsic::amdgcn_raw_buffer_store:
2124 return legalizeRawBufferStore(MI, MRI, B, false);
2125 case Intrinsic::amdgcn_raw_buffer_store_format:
2126 return legalizeRawBufferStore(MI, MRI, B, true);
2127 default:
2128 return true;
2129 }
2131 return true;
2132 }