//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
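// For example, a 96-bit <3 x s32> gives Pieces = (96 + 63) / 64 = 2 and
// NewNumElts = (3 + 1) / 2 = 2, so it is rewritten in <2 x s32> pieces.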
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));
  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);
  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
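          // (For instance, an s65 goes to s128, while an s300 goes to the
          // 64-bit multiple s320 rather than all the way to s512.)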
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
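    // Pack the hardware register id, bit offset and field width (minus one)
    // into the single immediate operand S_GETREG_B32 expects.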
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, MIRBuilder, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    MIRBuilder.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = MIRBuilder.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    MIRBuilder.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;
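  // Below: a 32-bit segment (local/private) pointer is widened to a flat
  // pointer by placing the segment's aperture base in the high half, with the
  // segment null pointer mapped explicitly to the flat null pointer.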
  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
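  // C1 is 2^52: adding and then subtracting a value of that magnitude (with
  // the sign of Src) rounds Src to an integer in round-to-nearest mode. C2 is
  // the largest magnitude for which this is needed; anything bigger is already
  // integral and is passed through by the select below.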
  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi) // Source operand: the high 32 bits of the f64 value.
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);
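  // Overall approach: read the unbiased exponent from the high word. If it is
  // negative, the magnitude is below 1.0 and the result is +/-0.0; if it is
  // greater than 51, the value is already integral; otherwise clear the
  // fraction bits that sit below the exponent.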
  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
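  // Convert the two 32-bit halves separately and recombine:
  //   result = (fp)hi * 2^32 + (fp)lo
  // Only the high half uses a signed conversion for G_SITOFP.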
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
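  // The hardware sin/cos intrinsics take their operand pre-scaled by
  // 1 / (2 * pi); subtargets with a reduced valid input range additionally
  // wrap the scaled value into [0, 1) with amdgcn.fract first.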
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
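  // A masked argument is a bit-field packed into a wider physical register
  // (e.g. packed work-item IDs), so it is extracted with a shift and mask.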
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
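  // C0 and C1 are the bit patterns of 2^+96 and 2^-32. If |RHS| is very large,
  // the denominator is pre-scaled by 2^-32 before the rcp (so the reciprocal
  // does not flush to zero) and the quotient is rescaled by the same factor.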
  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
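  // amdgcn.is.shared / amdgcn.is.private: a flat pointer lies in the given
  // segment exactly when its high 32 bits match that segment's aperture base.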
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  default:
    return true;
  }

  return true;
}