llvm-complete.git: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
blob 37222d9988e9a8c611691241fa31f06e95e496f8
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
26 #define DEBUG_TYPE "amdgpu-legalinfo"
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
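// Predicate: the type at TypeIdx is at most MaxSize bits wide and its scalar
// element size is a multiple of 32 bits.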
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35 unsigned MaxSize = 512) {
36 return [=](const LegalityQuery &Query) {
37 const LLT Ty = Query.Types[TypeIdx];
38 const LLT EltTy = Ty.getScalarType();
39 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
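// Predicate: a vector with an odd number of elements whose element type is
// narrower than 32 bits.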
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 return Ty.isVector() &&
47 Ty.getNumElements() % 2 != 0 &&
48 Ty.getElementType().getSizeInBits() < 32;
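// Mutation: grow the vector at TypeIdx by one element, used to round small
// odd-element vectors up to an even element count.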
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53 return [=](const LegalityQuery &Query) {
54 const LLT Ty = Query.Types[TypeIdx];
55 const LLT EltTy = Ty.getElementType();
56 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
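// Mutation: reduce the element count of the vector at TypeIdx so that each
// resulting piece is roughly 64 bits wide.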
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61 return [=](const LegalityQuery &Query) {
62 const LLT Ty = Query.Types[TypeIdx];
63 const LLT EltTy = Ty.getElementType();
64 unsigned Size = Ty.getSizeInBits();
65 unsigned Pieces = (Size + 63) / 64;
66 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
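// Predicate: the type at TypeIdx is a vector wider than Size bits.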
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72 return [=](const LegalityQuery &Query) {
73 const LLT QueryTy = Query.Types[TypeIdx];
74 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
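// Predicate: the type at TypeIdx is a vector with an odd number of elements.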
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79 return [=](const LegalityQuery &Query) {
80 const LLT QueryTy = Query.Types[TypeIdx];
81 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
85 // Any combination of 32- or 64-bit elements up to 512 bits, and multiples of
86 // v2s16.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 if (Ty.isVector()) {
91 const int EltSize = Ty.getElementType().getSizeInBits();
92 return EltSize == 32 || EltSize == 64 ||
93 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94 EltSize == 128 || EltSize == 256;
97 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102 const GCNTargetMachine &TM)
103 : ST(ST_) {
104 using namespace TargetOpcode;
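// Build an LLT pointer type for the given address space, using the pointer
// width the target machine reports for that space.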
106 auto GetAddrSpacePtr = [&TM](unsigned AS) {
107 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
110 const LLT S1 = LLT::scalar(1);
111 const LLT S8 = LLT::scalar(8);
112 const LLT S16 = LLT::scalar(16);
113 const LLT S32 = LLT::scalar(32);
114 const LLT S64 = LLT::scalar(64);
115 const LLT S128 = LLT::scalar(128);
116 const LLT S256 = LLT::scalar(256);
117 const LLT S512 = LLT::scalar(512);
119 const LLT V2S16 = LLT::vector(2, 16);
120 const LLT V4S16 = LLT::vector(4, 16);
122 const LLT V2S32 = LLT::vector(2, 32);
123 const LLT V3S32 = LLT::vector(3, 32);
124 const LLT V4S32 = LLT::vector(4, 32);
125 const LLT V5S32 = LLT::vector(5, 32);
126 const LLT V6S32 = LLT::vector(6, 32);
127 const LLT V7S32 = LLT::vector(7, 32);
128 const LLT V8S32 = LLT::vector(8, 32);
129 const LLT V9S32 = LLT::vector(9, 32);
130 const LLT V10S32 = LLT::vector(10, 32);
131 const LLT V11S32 = LLT::vector(11, 32);
132 const LLT V12S32 = LLT::vector(12, 32);
133 const LLT V13S32 = LLT::vector(13, 32);
134 const LLT V14S32 = LLT::vector(14, 32);
135 const LLT V15S32 = LLT::vector(15, 32);
136 const LLT V16S32 = LLT::vector(16, 32);
138 const LLT V2S64 = LLT::vector(2, 64);
139 const LLT V3S64 = LLT::vector(3, 64);
140 const LLT V4S64 = LLT::vector(4, 64);
141 const LLT V5S64 = LLT::vector(5, 64);
142 const LLT V6S64 = LLT::vector(6, 64);
143 const LLT V7S64 = LLT::vector(7, 64);
144 const LLT V8S64 = LLT::vector(8, 64);
146 std::initializer_list<LLT> AllS32Vectors =
147 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149 std::initializer_list<LLT> AllS64Vectors =
150 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
152 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
155 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
156 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
157 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
158 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
160 const LLT CodePtr = FlatPtr;
162 const std::initializer_list<LLT> AddrSpaces64 = {
163 GlobalPtr, ConstantPtr, FlatPtr
166 const std::initializer_list<LLT> AddrSpaces32 = {
167 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
170 const std::initializer_list<LLT> FPTypesBase = {
171 S32, S64
174 const std::initializer_list<LLT> FPTypes16 = {
175 S32, S64, S16
178 const std::initializer_list<LLT> FPTypesPK16 = {
179 S32, S64, S16, V2S16
182 setAction({G_BRCOND, S1}, Legal);
184 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
185 // elements for v3s16
186 getActionDefinitionsBuilder(G_PHI)
187 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
188 .legalFor(AllS32Vectors)
189 .legalFor(AllS64Vectors)
190 .legalFor(AddrSpaces64)
191 .legalFor(AddrSpaces32)
192 .clampScalar(0, S32, S256)
193 .widenScalarToNextPow2(0, 32)
194 .clampMaxNumElements(0, S32, 16)
195 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
196 .legalIf(isPointer(0));
198 if (ST.has16BitInsts()) {
199 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
200 .legalFor({S32, S16})
201 .clampScalar(0, S16, S32)
202 .scalarize(0);
203 } else {
204 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
205 .legalFor({S32})
206 .clampScalar(0, S32, S32)
207 .scalarize(0);
210 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
211 .legalFor({S32})
212 .clampScalar(0, S32, S32)
213 .scalarize(0);
215 // Report legal for any types we can handle anywhere. For the cases only legal
216 // on the SALU, RegBankSelect will be able to re-legalize.
217 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
218 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
219 .clampScalar(0, S32, S64)
220 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
221 .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
222 .widenScalarToNextPow2(0)
223 .scalarize(0);
225 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
226 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
227 .legalFor({{S32, S1}})
228 .clampScalar(0, S32, S32);
230 getActionDefinitionsBuilder(G_BITCAST)
231 .legalForCartesianProduct({S32, V2S16})
232 .legalForCartesianProduct({S64, V2S32, V4S16})
233 .legalForCartesianProduct({V2S64, V4S32})
234 // Don't worry about the size constraint.
235 .legalIf(all(isPointer(0), isPointer(1)));
237 if (ST.has16BitInsts()) {
238 getActionDefinitionsBuilder(G_FCONSTANT)
239 .legalFor({S32, S64, S16})
240 .clampScalar(0, S16, S64);
241 } else {
242 getActionDefinitionsBuilder(G_FCONSTANT)
243 .legalFor({S32, S64})
244 .clampScalar(0, S32, S64);
247 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
248 .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
249 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
250 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
251 .clampScalarOrElt(0, S32, S512)
252 .legalIf(isMultiple32(0))
253 .widenScalarToNextPow2(0, 32)
254 .clampMaxNumElements(0, S32, 16);
257 // FIXME: i1 operands to intrinsics should always be legal, but other i1
258 // values may not be legal. We need to figure out how to distinguish
259 // between these two scenarios.
260 getActionDefinitionsBuilder(G_CONSTANT)
261 .legalFor({S1, S32, S64, GlobalPtr,
262 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
263 .clampScalar(0, S32, S64)
264 .widenScalarToNextPow2(0)
265 .legalIf(isPointer(0));
267 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
269 auto &FPOpActions = getActionDefinitionsBuilder(
270 { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
271 .legalFor({S32, S64});
273 if (ST.has16BitInsts()) {
274 if (ST.hasVOP3PInsts())
275 FPOpActions.legalFor({S16, V2S16});
276 else
277 FPOpActions.legalFor({S16});
280 auto &MinNumMaxNum = getActionDefinitionsBuilder({
281 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
283 if (ST.hasVOP3PInsts()) {
284 MinNumMaxNum.customFor(FPTypesPK16)
285 .clampMaxNumElements(0, S16, 2)
286 .clampScalar(0, S16, S64)
287 .scalarize(0);
288 } else if (ST.has16BitInsts()) {
289 MinNumMaxNum.customFor(FPTypes16)
290 .clampScalar(0, S16, S64)
291 .scalarize(0);
292 } else {
293 MinNumMaxNum.customFor(FPTypesBase)
294 .clampScalar(0, S32, S64)
295 .scalarize(0);
298 // TODO: Implement
299 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
301 if (ST.hasVOP3PInsts())
302 FPOpActions.clampMaxNumElements(0, S16, 2);
303 FPOpActions
304 .scalarize(0)
305 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
307 if (ST.has16BitInsts()) {
308 getActionDefinitionsBuilder(G_FSQRT)
309 .legalFor({S32, S64, S16})
310 .scalarize(0)
311 .clampScalar(0, S16, S64);
312 } else {
313 getActionDefinitionsBuilder(G_FSQRT)
314 .legalFor({S32, S64})
315 .scalarize(0)
316 .clampScalar(0, S32, S64);
319 getActionDefinitionsBuilder(G_FPTRUNC)
320 .legalFor({{S32, S64}, {S16, S32}})
321 .scalarize(0);
323 getActionDefinitionsBuilder(G_FPEXT)
324 .legalFor({{S64, S32}, {S32, S16}})
325 .lowerFor({{S64, S16}}) // FIXME: Implement
326 .scalarize(0);
328 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
329 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
331 getActionDefinitionsBuilder(G_FSUB)
332 // Use actual fsub instruction
333 .legalFor({S32})
334 // Must use fadd + fneg
335 .lowerFor({S64, S16, V2S16})
336 .scalarize(0)
337 .clampScalar(0, S32, S64);
339 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
340 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
341 {S32, S1}, {S64, S1}, {S16, S1},
342 // FIXME: Hack
343 {S64, LLT::scalar(33)},
344 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
345 .scalarize(0);
347 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
348 .legalFor({{S32, S32}, {S64, S32}})
349 .lowerFor({{S32, S64}})
350 .customFor({{S64, S64}})
351 .scalarize(0);
353 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
354 .legalFor({{S32, S32}, {S32, S64}})
355 .scalarize(0);
357 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
358 .legalFor({S32, S64})
359 .scalarize(0);
361 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
362 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
363 .legalFor({S32, S64})
364 .clampScalar(0, S32, S64)
365 .scalarize(0);
366 } else {
367 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
368 .legalFor({S32})
369 .customFor({S64})
370 .clampScalar(0, S32, S64)
371 .scalarize(0);
374 getActionDefinitionsBuilder(G_GEP)
375 .legalForCartesianProduct(AddrSpaces64, {S64})
376 .legalForCartesianProduct(AddrSpaces32, {S32})
377 .scalarize(0);
379 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
381 auto &CmpBuilder =
382 getActionDefinitionsBuilder(G_ICMP)
383 .legalForCartesianProduct(
384 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
385 .legalFor({{S1, S32}, {S1, S64}});
386 if (ST.has16BitInsts()) {
387 CmpBuilder.legalFor({{S1, S16}});
390 CmpBuilder
391 .widenScalarToNextPow2(1)
392 .clampScalar(1, S32, S64)
393 .scalarize(0)
394 .legalIf(all(typeIs(0, S1), isPointer(1)));
396 getActionDefinitionsBuilder(G_FCMP)
397 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
398 .widenScalarToNextPow2(1)
399 .clampScalar(1, S32, S64)
400 .scalarize(0);
402 // FIXME: fexp, flog2, flog10 need to be custom lowered.
403 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
404 G_FLOG, G_FLOG2, G_FLOG10})
405 .legalFor({S32})
406 .scalarize(0);
408 // The 64-bit versions produce 32-bit results, but only on the SALU.
409 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
410 G_CTTZ, G_CTTZ_ZERO_UNDEF,
411 G_CTPOP})
412 .legalFor({{S32, S32}, {S32, S64}})
413 .clampScalar(0, S32, S32)
414 .clampScalar(1, S32, S64)
415 .scalarize(0)
416 .widenScalarToNextPow2(0, 32)
417 .widenScalarToNextPow2(1, 32);
419 // TODO: Expand for > s32
420 getActionDefinitionsBuilder(G_BSWAP)
421 .legalFor({S32})
422 .clampScalar(0, S32, S32)
423 .scalarize(0);
425 if (ST.has16BitInsts()) {
426 if (ST.hasVOP3PInsts()) {
427 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
428 .legalFor({S32, S16, V2S16})
429 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
430 .clampMaxNumElements(0, S16, 2)
431 .clampScalar(0, S16, S32)
432 .widenScalarToNextPow2(0)
433 .scalarize(0);
434 } else {
435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
436 .legalFor({S32, S16})
437 .widenScalarToNextPow2(0)
438 .clampScalar(0, S16, S32)
439 .scalarize(0);
441 } else {
442 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
443 .legalFor({S32})
444 .clampScalar(0, S32, S32)
445 .widenScalarToNextPow2(0)
446 .scalarize(0);
449 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
450 return [=](const LegalityQuery &Query) {
451 return Query.Types[TypeIdx0].getSizeInBits() <
452 Query.Types[TypeIdx1].getSizeInBits();
456 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
457 return [=](const LegalityQuery &Query) {
458 return Query.Types[TypeIdx0].getSizeInBits() >
459 Query.Types[TypeIdx1].getSizeInBits();
463 getActionDefinitionsBuilder(G_INTTOPTR)
464 // List the common cases
465 .legalForCartesianProduct(AddrSpaces64, {S64})
466 .legalForCartesianProduct(AddrSpaces32, {S32})
467 .scalarize(0)
468 // Accept any address space as long as the size matches
469 .legalIf(sameSize(0, 1))
470 .widenScalarIf(smallerThan(1, 0),
471 [](const LegalityQuery &Query) {
472 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
474 .narrowScalarIf(greaterThan(1, 0),
475 [](const LegalityQuery &Query) {
476 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
479 getActionDefinitionsBuilder(G_PTRTOINT)
480 // List the common cases
481 .legalForCartesianProduct(AddrSpaces64, {S64})
482 .legalForCartesianProduct(AddrSpaces32, {S32})
483 .scalarize(0)
484 // Accept any address space as long as the size matches
485 .legalIf(sameSize(0, 1))
486 .widenScalarIf(smallerThan(0, 1),
487 [](const LegalityQuery &Query) {
488 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
490 .narrowScalarIf(
491 greaterThan(0, 1),
492 [](const LegalityQuery &Query) {
493 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
496 if (ST.hasFlatAddressSpace()) {
497 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
498 .scalarize(0)
499 .custom();
502 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
503 // handle some operations by just promoting the register during
504 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
505 getActionDefinitionsBuilder({G_LOAD, G_STORE})
506 .narrowScalarIf([](const LegalityQuery &Query) {
507 unsigned Size = Query.Types[0].getSizeInBits();
508 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
509 return (Size > 32 && MemSize < Size);
511 [](const LegalityQuery &Query) {
512 return std::make_pair(0, LLT::scalar(32));
514 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
515 .fewerElementsIf([=](const LegalityQuery &Query) {
516 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
517 return (MemSize == 96) &&
518 Query.Types[0].isVector() &&
519 !ST.hasDwordx3LoadStores();
521 [=](const LegalityQuery &Query) {
522 return std::make_pair(0, V2S32);
524 .legalIf([=](const LegalityQuery &Query) {
525 const LLT &Ty0 = Query.Types[0];
527 unsigned Size = Ty0.getSizeInBits();
528 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
529 if (Size < 32 || (Size > 32 && MemSize < Size))
530 return false;
532 if (Ty0.isVector() && Size != MemSize)
533 return false;
535 // TODO: Decompose private loads into 4-byte components.
536 // TODO: Illegal flat loads on SI
537 switch (MemSize) {
538 case 8:
539 case 16:
540 return Size == 32;
541 case 32:
542 case 64:
543 case 128:
544 return true;
546 case 96:
547 return ST.hasDwordx3LoadStores();
549 case 256:
550 case 512:
551 // TODO: Possibly support loads of i256 and i512. This will require
552 // adding i256 and i512 types to MVT in order to be able to use
553 // TableGen.
554 // TODO: Add support for other vector types, this will require
555 // defining more value mappings for the new types.
556 return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
557 Ty0.getScalarType().getSizeInBits() == 64);
559 default:
560 return false;
563 .clampScalar(0, S32, S64);
566 // FIXME: Handle alignment requirements.
567 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
568 .legalForTypesWithMemDesc({
569 {S32, GlobalPtr, 8, 8},
570 {S32, GlobalPtr, 16, 8},
571 {S32, LocalPtr, 8, 8},
572 {S32, LocalPtr, 16, 8},
573 {S32, PrivatePtr, 8, 8},
574 {S32, PrivatePtr, 16, 8}});
575 if (ST.hasFlatAddressSpace()) {
576 ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
577 {S32, FlatPtr, 16, 8}});
580 ExtLoads.clampScalar(0, S32, S32)
581 .widenScalarToNextPow2(0)
582 .unsupportedIfMemSizeNotPow2()
583 .lower();
585 auto &Atomics = getActionDefinitionsBuilder(
586 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
587 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
588 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
589 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
590 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
591 {S64, GlobalPtr}, {S64, LocalPtr}});
592 if (ST.hasFlatAddressSpace()) {
593 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
596 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
597 .legalFor({{S32, LocalPtr}});
599 // TODO: Pointer types, any 32-bit or 64-bit vector
600 getActionDefinitionsBuilder(G_SELECT)
601 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
602 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
603 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
604 .clampScalar(0, S16, S64)
605 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
606 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
607 .scalarize(1)
608 .clampMaxNumElements(0, S32, 2)
609 .clampMaxNumElements(0, LocalPtr, 2)
610 .clampMaxNumElements(0, PrivatePtr, 2)
611 .scalarize(0)
612 .widenScalarToNextPow2(0)
613 .legalIf(all(isPointer(0), typeIs(1, S1)));
615 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
616 // be more flexible with the shift amount type.
617 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
618 .legalFor({{S32, S32}, {S64, S32}});
619 if (ST.has16BitInsts()) {
620 if (ST.hasVOP3PInsts()) {
621 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
622 .clampMaxNumElements(0, S16, 2);
623 } else
624 Shifts.legalFor({{S16, S32}, {S16, S16}});
626 Shifts.clampScalar(1, S16, S32);
627 Shifts.clampScalar(0, S16, S64);
628 Shifts.widenScalarToNextPow2(0, 16);
629 } else {
630 // Make sure we legalize the shift amount type first, as the general
631 // expansion for the shifted type will produce much worse code if it hasn't
632 // been truncated already.
633 Shifts.clampScalar(1, S32, S32);
634 Shifts.clampScalar(0, S32, S64);
635 Shifts.widenScalarToNextPow2(0, 32);
637 Shifts.scalarize(0);
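// Element insert/extract is custom-legalized when the element type is 16 bits
// or a multiple of 32 bits, the vector fits in 512 bits, and the index is a
// 32-bit scalar.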
639 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
640 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
641 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
642 unsigned IdxTypeIdx = 2;
644 getActionDefinitionsBuilder(Op)
645 .customIf([=](const LegalityQuery &Query) {
646 const LLT EltTy = Query.Types[EltTypeIdx];
647 const LLT VecTy = Query.Types[VecTypeIdx];
648 const LLT IdxTy = Query.Types[IdxTypeIdx];
649 return (EltTy.getSizeInBits() == 16 ||
650 EltTy.getSizeInBits() % 32 == 0) &&
651 VecTy.getSizeInBits() % 32 == 0 &&
652 VecTy.getSizeInBits() <= 512 &&
653 IdxTy.getSizeInBits() == 32;
655 .clampScalar(EltTypeIdx, S32, S64)
656 .clampScalar(VecTypeIdx, S32, S64)
657 .clampScalar(IdxTypeIdx, S32, S32);
660 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
661 .unsupportedIf([=](const LegalityQuery &Query) {
662 const LLT &EltTy = Query.Types[1].getElementType();
663 return Query.Types[0] != EltTy;
666 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
667 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
668 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
670 // FIXME: Doesn't handle extract of illegal sizes.
671 getActionDefinitionsBuilder(Op)
672 .legalIf([=](const LegalityQuery &Query) {
673 const LLT BigTy = Query.Types[BigTyIdx];
674 const LLT LitTy = Query.Types[LitTyIdx];
675 return (BigTy.getSizeInBits() % 32 == 0) &&
676 (LitTy.getSizeInBits() % 16 == 0);
678 .widenScalarIf(
679 [=](const LegalityQuery &Query) {
680 const LLT BigTy = Query.Types[BigTyIdx];
681 return (BigTy.getScalarSizeInBits() < 16);
683 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
684 .widenScalarIf(
685 [=](const LegalityQuery &Query) {
686 const LLT LitTy = Query.Types[LitTyIdx];
687 return (LitTy.getScalarSizeInBits() < 16);
689 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
690 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
691 .widenScalarToNextPow2(BigTyIdx, 32);
695 getActionDefinitionsBuilder(G_BUILD_VECTOR)
696 .legalForCartesianProduct(AllS32Vectors, {S32})
697 .legalForCartesianProduct(AllS64Vectors, {S64})
698 .clampNumElements(0, V16S32, V16S32)
699 .clampNumElements(0, V2S64, V8S64)
700 .minScalarSameAs(1, 0)
701 .legalIf(isRegisterType(0))
702 .minScalarOrElt(0, S32);
704 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
705 .legalIf(isRegisterType(0));
707 // Merge/Unmerge
708 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
709 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
710 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
712 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
713 const LLT &Ty = Query.Types[TypeIdx];
714 if (Ty.isVector()) {
715 const LLT &EltTy = Ty.getElementType();
716 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
717 return true;
718 if (!isPowerOf2_32(EltTy.getSizeInBits()))
719 return true;
721 return false;
724 getActionDefinitionsBuilder(Op)
725 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
726 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
727 // worth considering the multiples of 64 since 2*192 and 2*384 are not
728 // valid.
729 .clampScalar(LitTyIdx, S16, S256)
730 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
732 // Break up vectors with weird elements into scalars
733 .fewerElementsIf(
734 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
735 scalarize(0))
736 .fewerElementsIf(
737 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
738 scalarize(1))
739 .clampScalar(BigTyIdx, S32, S512)
740 .lowerFor({{S16, V2S16}})
741 .widenScalarIf(
742 [=](const LegalityQuery &Query) {
743 const LLT &Ty = Query.Types[BigTyIdx];
744 return !isPowerOf2_32(Ty.getSizeInBits()) &&
745 Ty.getSizeInBits() % 16 != 0;
747 [=](const LegalityQuery &Query) {
748 // Pick the next power of 2, or a multiple of 64 over 128,
749 // whichever is smaller.
750 const LLT &Ty = Query.Types[BigTyIdx];
751 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
752 if (NewSizeInBits >= 256) {
753 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
754 if (RoundedTo < NewSizeInBits)
755 NewSizeInBits = RoundedTo;
757 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
759 .legalIf([=](const LegalityQuery &Query) {
760 const LLT &BigTy = Query.Types[BigTyIdx];
761 const LLT &LitTy = Query.Types[LitTyIdx];
763 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
764 return false;
765 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
766 return false;
768 return BigTy.getSizeInBits() % 16 == 0 &&
769 LitTy.getSizeInBits() % 16 == 0 &&
770 BigTy.getSizeInBits() <= 512;
772 // Any vectors left are the wrong size. Scalarize them.
773 .scalarize(0)
774 .scalarize(1);
777 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
779 computeTables();
780 verify(*ST.getInstrInfo());
783 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
784 MachineRegisterInfo &MRI,
785 MachineIRBuilder &MIRBuilder,
786 GISelChangeObserver &Observer) const {
787 switch (MI.getOpcode()) {
788 case TargetOpcode::G_ADDRSPACE_CAST:
789 return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
790 case TargetOpcode::G_FRINT:
791 return legalizeFrint(MI, MRI, MIRBuilder);
792 case TargetOpcode::G_FCEIL:
793 return legalizeFceil(MI, MRI, MIRBuilder);
794 case TargetOpcode::G_INTRINSIC_TRUNC:
795 return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
796 case TargetOpcode::G_SITOFP:
797 return legalizeITOFP(MI, MRI, MIRBuilder, true);
798 case TargetOpcode::G_UITOFP:
799 return legalizeITOFP(MI, MRI, MIRBuilder, false);
800 case TargetOpcode::G_FMINNUM:
801 case TargetOpcode::G_FMAXNUM:
802 case TargetOpcode::G_FMINNUM_IEEE:
803 case TargetOpcode::G_FMAXNUM_IEEE:
804 return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
805 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
806 return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
807 case TargetOpcode::G_INSERT_VECTOR_ELT:
808 return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
809 default:
810 return false;
813 llvm_unreachable("expected switch to return");
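// Return a 32-bit register holding the aperture (the high half of the flat
// address range) for the LDS or private segment: read it from the hardware
// aperture registers when available, otherwise load it from the queue pointer.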
816 Register AMDGPULegalizerInfo::getSegmentAperture(
817 unsigned AS,
818 MachineRegisterInfo &MRI,
819 MachineIRBuilder &MIRBuilder) const {
820 MachineFunction &MF = MIRBuilder.getMF();
821 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
822 const LLT S32 = LLT::scalar(32);
824 if (ST.hasApertureRegs()) {
825 // FIXME: Use inline constants (src_{shared, private}_base) instead of
826 // getreg.
827 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
828 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
829 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
830 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
831 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
832 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
833 unsigned Encoding =
834 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
835 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
836 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
838 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
839 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
841 MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
842 .addDef(GetReg)
843 .addImm(Encoding);
844 MRI.setType(GetReg, S32);
846 auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
847 MIRBuilder.buildInstr(TargetOpcode::G_SHL)
848 .addDef(ApertureReg)
849 .addUse(GetReg)
850 .addUse(ShiftAmt.getReg(0));
852 return ApertureReg;
855 Register QueuePtr = MRI.createGenericVirtualRegister(
856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
858 // FIXME: Placeholder until we can track the input registers.
859 MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
861 // Offset into amd_queue_t for group_segment_aperture_base_hi /
862 // private_segment_aperture_base_hi.
863 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
865 // FIXME: Don't use undef
866 Value *V = UndefValue::get(PointerType::get(
867 Type::getInt8Ty(MF.getFunction().getContext()),
868 AMDGPUAS::CONSTANT_ADDRESS));
870 MachinePointerInfo PtrInfo(V, StructOffset);
871 MachineMemOperand *MMO = MF.getMachineMemOperand(
872 PtrInfo,
873 MachineMemOperand::MOLoad |
874 MachineMemOperand::MODereferenceable |
875 MachineMemOperand::MOInvariant,
877 MinAlign(64, StructOffset));
879 Register LoadResult = MRI.createGenericVirtualRegister(S32);
880 Register LoadAddr;
882 MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
883 MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
884 return LoadResult;
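// Custom lowering for G_ADDRSPACE_CAST. No-op casts become G_BITCAST. Casting
// flat to LDS/private keeps the low 32 bits, and casting LDS/private to flat
// merges the 32-bit offset with the segment aperture; both directions guard
// against the null pointer. Roughly, for local/private -> flat:
//   %lo = G_PTRTOINT %src                          ; 32-bit offset
//   %flat = G_MERGE_VALUES %lo, %aperture
//   %dst = G_SELECT (%src != segment-null), %flat, flat-null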
887 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
888 MachineInstr &MI, MachineRegisterInfo &MRI,
889 MachineIRBuilder &MIRBuilder) const {
890 MachineFunction &MF = MIRBuilder.getMF();
892 MIRBuilder.setInstr(MI);
894 Register Dst = MI.getOperand(0).getReg();
895 Register Src = MI.getOperand(1).getReg();
897 LLT DstTy = MRI.getType(Dst);
898 LLT SrcTy = MRI.getType(Src);
899 unsigned DestAS = DstTy.getAddressSpace();
900 unsigned SrcAS = SrcTy.getAddressSpace();
902 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
903 // vector element.
904 assert(!DstTy.isVector());
906 const AMDGPUTargetMachine &TM
907 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
909 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
910 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
911 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
912 return true;
915 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
916 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
917 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
918 unsigned NullVal = TM.getNullPointerValue(DestAS);
920 auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
921 auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
923 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
925 // Extract low 32-bits of the pointer.
926 MIRBuilder.buildExtract(PtrLo32, Src, 0);
928 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
929 MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
930 MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
932 MI.eraseFromParent();
933 return true;
936 assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
937 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
939 auto SegmentNull =
940 MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
941 auto FlatNull =
942 MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
944 Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
946 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
947 MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
949 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
951 // Coerce the type of the low half of the result so we can use merge_values.
952 Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
953 MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
954 .addDef(SrcAsInt)
955 .addUse(Src);
957 // TODO: Should we allow mismatched types but matching sizes in merges to
958 // avoid the ptrtoint?
959 MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
960 MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
962 MI.eraseFromParent();
963 return true;
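// Custom f64 G_FRINT expansion: adding and then subtracting 2^52 (copysigned
// from the source) lets the FP hardware round to nearest; values already too
// large to have a fractional part are passed through unchanged.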
966 bool AMDGPULegalizerInfo::legalizeFrint(
967 MachineInstr &MI, MachineRegisterInfo &MRI,
968 MachineIRBuilder &MIRBuilder) const {
969 MIRBuilder.setInstr(MI);
971 Register Src = MI.getOperand(1).getReg();
972 LLT Ty = MRI.getType(Src);
973 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
975 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
976 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
978 auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
979 auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
981 // TODO: Should this propagate fast-math-flags?
982 auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
983 auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
985 auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
986 auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
988 auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
989 MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
990 return true;
993 bool AMDGPULegalizerInfo::legalizeFceil(
994 MachineInstr &MI, MachineRegisterInfo &MRI,
995 MachineIRBuilder &B) const {
996 B.setInstr(MI);
998 const LLT S1 = LLT::scalar(1);
999 const LLT S64 = LLT::scalar(64);
1001 Register Src = MI.getOperand(1).getReg();
1002 assert(MRI.getType(Src) == S64);
1004 // result = trunc(src)
1005 // if (src > 0.0 && src != result)
1006 // result += 1.0
1008 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1010 const auto Zero = B.buildFConstant(S64, 0.0);
1011 const auto One = B.buildFConstant(S64, 1.0);
1012 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1013 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1014 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1015 auto Add = B.buildSelect(S64, And, One, Zero);
1017 // TODO: Should this propagate fast-math-flags?
1018 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1019 return true;
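// Extract the 11-bit biased exponent from the high word of an f64 and
// subtract the bias (1023).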
1022 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1023 MachineIRBuilder &B) {
1024 const unsigned FractBits = 52;
1025 const unsigned ExpBits = 11;
1026 LLT S32 = LLT::scalar(32);
1028 auto Const0 = B.buildConstant(S32, FractBits - 32);
1029 auto Const1 = B.buildConstant(S32, ExpBits);
1031 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1032 .addUse(Const0.getReg(0))
1033 .addUse(Const1.getReg(0));
1035 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
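// Custom f64 G_INTRINSIC_TRUNC expansion: clear the fraction bits that sit
// below the binary point. A negative exponent leaves only the sign bit; an
// exponent greater than 51 means the value is already an integer.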
1038 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1039 MachineInstr &MI, MachineRegisterInfo &MRI,
1040 MachineIRBuilder &B) const {
1041 B.setInstr(MI);
1043 const LLT S1 = LLT::scalar(1);
1044 const LLT S32 = LLT::scalar(32);
1045 const LLT S64 = LLT::scalar(64);
1047 Register Src = MI.getOperand(1).getReg();
1048 assert(MRI.getType(Src) == S64);
1050 // TODO: Should this use extract since the low half is unused?
1051 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1052 Register Hi = Unmerge.getReg(1);
1054 // Extract the upper half, since this is where we will find the sign and
1055 // exponent.
1056 auto Exp = extractF64Exponent(Hi, B);
1058 const unsigned FractBits = 52;
1060 // Extract the sign bit.
1061 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1062 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1064 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1066 const auto Zero32 = B.buildConstant(S32, 0);
1068 // Extend back to 64-bits.
1069 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1071 auto Shr = B.buildAShr(S64, FractMask, Exp);
1072 auto Not = B.buildNot(S64, Shr);
1073 auto Tmp0 = B.buildAnd(S64, Src, Not);
1074 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1076 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1077 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1079 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1080 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1081 return true;
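// Convert a 64-bit integer to f64 by converting each 32-bit half separately:
// the high half (signed or unsigned) is scaled by 2^32 with ldexp and added
// to the always-unsigned low half.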
1084 bool AMDGPULegalizerInfo::legalizeITOFP(
1085 MachineInstr &MI, MachineRegisterInfo &MRI,
1086 MachineIRBuilder &B, bool Signed) const {
1087 B.setInstr(MI);
1089 Register Dst = MI.getOperand(0).getReg();
1090 Register Src = MI.getOperand(1).getReg();
1092 const LLT S64 = LLT::scalar(64);
1093 const LLT S32 = LLT::scalar(32);
1095 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1097 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1099 auto CvtHi = Signed ?
1100 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1101 B.buildUITOFP(S64, Unmerge.getReg(1));
1103 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1105 auto ThirtyTwo = B.buildConstant(S32, 32);
1106 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1107 .addUse(CvtHi.getReg(0))
1108 .addUse(ThirtyTwo.getReg(0));
1110 // TODO: Should this propagate fast-math-flags?
1111 B.buildFAdd(Dst, LdExp, CvtLo);
1112 MI.eraseFromParent();
1113 return true;
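// With ieee_mode enabled the IEEE variants map directly onto the instructions
// and the non-IEEE forms are expanded; with ieee_mode disabled the non-IEEE
// forms are already correct and the IEEE variants are rejected.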
1116 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1117 MachineInstr &MI, MachineRegisterInfo &MRI,
1118 MachineIRBuilder &B) const {
1119 MachineFunction &MF = B.getMF();
1120 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1122 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1123 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1125 // With ieee_mode disabled, the instructions have the correct behavior
1126 // already for G_FMINNUM/G_FMAXNUM
1127 if (!MFI->getMode().IEEE)
1128 return !IsIEEEOp;
1130 if (IsIEEEOp)
1131 return true;
1133 MachineIRBuilder HelperBuilder(MI);
1134 GISelObserverWrapper DummyObserver;
1135 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1136 HelperBuilder.setMBB(*MI.getParent());
1137 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
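// Fold an extract with a constant index into a static G_EXTRACT (or undef
// when the index is out of bounds); dynamic indices are left for register
// indexing during selection.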
1140 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1141 MachineInstr &MI, MachineRegisterInfo &MRI,
1142 MachineIRBuilder &B) const {
1143 // TODO: Should move some of this into LegalizerHelper.
1145 // TODO: Promote dynamic indexing of s16 to s32
1146 // TODO: Dynamic s64 indexing is only legal for SGPR.
1147 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1148 if (!IdxVal) // Dynamic case will be selected to register indexing.
1149 return true;
1151 Register Dst = MI.getOperand(0).getReg();
1152 Register Vec = MI.getOperand(1).getReg();
1154 LLT VecTy = MRI.getType(Vec);
1155 LLT EltTy = VecTy.getElementType();
1156 assert(EltTy == MRI.getType(Dst));
1158 B.setInstr(MI);
1160 if (IdxVal.getValue() < VecTy.getNumElements())
1161 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1162 else
1163 B.buildUndef(Dst);
1165 MI.eraseFromParent();
1166 return true;
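// Same idea as the extract case above: a constant index becomes a static
// G_INSERT, an out-of-bounds index becomes undef, and dynamic indices are
// deferred to selection.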
1169 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1170 MachineInstr &MI, MachineRegisterInfo &MRI,
1171 MachineIRBuilder &B) const {
1172 // TODO: Should move some of this into LegalizerHelper.
1174 // TODO: Promote dynamic indexing of s16 to s32
1175 // TODO: Dynamic s64 indexing is only legal for SGPR.
1176 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1177 if (!IdxVal) // Dynamic case will be selected to register indexing.
1178 return true;
1180 Register Dst = MI.getOperand(0).getReg();
1181 Register Vec = MI.getOperand(1).getReg();
1182 Register Ins = MI.getOperand(2).getReg();
1184 LLT VecTy = MRI.getType(Vec);
1185 LLT EltTy = VecTy.getElementType();
1186 assert(EltTy == MRI.getType(Ins));
1188 B.setInstr(MI);
1190 if (IdxVal.getValue() < VecTy.getNumElements())
1191 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1192 else
1193 B.buildUndef(Dst);
1195 MI.eraseFromParent();
1196 return true;
1199 // Return the use branch instruction, or null if the usage is invalid.
1200 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1201 MachineRegisterInfo &MRI) {
1202 Register CondDef = MI.getOperand(0).getReg();
1203 if (!MRI.hasOneNonDBGUse(CondDef))
1204 return nullptr;
1206 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1207 return UseMI.getParent() == MI.getParent() &&
1208 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
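// Return the virtual register tracking the given physical live-in, creating a
// new one of type Ty if the live-in has not been registered yet.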
1211 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1212 Register Reg, LLT Ty) const {
1213 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1214 if (LiveIn)
1215 return LiveIn;
1217 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1218 MRI.addLiveIn(Reg, NewReg);
1219 return NewReg;
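// Copy a preloaded argument register into DstReg. Masked arguments are
// shifted and masked out of the register that contains them.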
1222 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1223 const ArgDescriptor *Arg) const {
1224 if (!Arg->isRegister())
1225 return false; // TODO: Handle these
1227 assert(Arg->getRegister() != 0);
1228 assert(Arg->getRegister().isPhysical());
1230 MachineRegisterInfo &MRI = *B.getMRI();
1232 LLT Ty = MRI.getType(DstReg);
1233 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1235 if (Arg->isMasked()) {
1236 // TODO: Should we try to emit this once in the entry block?
1237 const LLT S32 = LLT::scalar(32);
1238 const unsigned Mask = Arg->getMask();
1239 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1241 auto ShiftAmt = B.buildConstant(S32, Shift);
1242 auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1243 B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1244 } else
1245 B.buildCopy(DstReg, LiveIn);
1247 // Insert the argument copy if it doesn't already exist.
1248 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1249 if (!MRI.getVRegDef(LiveIn)) {
1250 MachineBasicBlock &EntryMBB = B.getMF().front();
1251 EntryMBB.addLiveIn(Arg->getRegister());
1252 B.setInsertPt(EntryMBB, EntryMBB.begin());
1253 B.buildCopy(LiveIn, Arg->getRegister());
1256 return true;
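// Lower an intrinsic that reads a preloaded value by copying the
// corresponding live-in argument register into the result.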
1259 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1260 MachineInstr &MI,
1261 MachineRegisterInfo &MRI,
1262 MachineIRBuilder &B,
1263 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1264 B.setInstr(MI);
1266 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1268 const ArgDescriptor *Arg;
1269 const TargetRegisterClass *RC;
1270 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1271 if (!Arg) {
1272 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1273 return false;
1276 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1277 MI.eraseFromParent();
1278 return true;
1281 return false;
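// Expand the amdgcn.fdiv.fast intrinsic: pre-scale the denominator when its
// magnitude is large, take the hardware reciprocal, multiply by the numerator,
// and apply the same scale to the result.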
1284 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1285 MachineRegisterInfo &MRI,
1286 MachineIRBuilder &B) const {
1287 B.setInstr(MI);
1288 Register Res = MI.getOperand(0).getReg();
1289 Register LHS = MI.getOperand(2).getReg();
1290 Register RHS = MI.getOperand(3).getReg();
1291 uint16_t Flags = MI.getFlags();
1293 LLT S32 = LLT::scalar(32);
1294 LLT S1 = LLT::scalar(1);
1296 auto Abs = B.buildFAbs(S32, RHS, Flags);
1297 const APFloat C0Val(1.0f);
1299 auto C0 = B.buildConstant(S32, 0x6f800000);
1300 auto C1 = B.buildConstant(S32, 0x2f800000);
1301 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1303 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1304 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1306 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1308 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1309 .addUse(Mul0.getReg(0))
1310 .setMIFlags(Flags);
1312 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1314 B.buildFMul(Res, Sel, Mul1, Flags);
1316 MI.eraseFromParent();
1317 return true;
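// For entry functions the implicit argument pointer is the kernarg segment
// pointer plus the implicit-argument offset; for other functions it is taken
// from the preloaded IMPLICIT_ARG_PTR argument.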
1320 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1321 MachineRegisterInfo &MRI,
1322 MachineIRBuilder &B) const {
1323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1324 if (!MFI->isEntryFunction()) {
1325 return legalizePreloadedArgIntrin(MI, MRI, B,
1326 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1329 B.setInstr(MI);
1331 uint64_t Offset =
1332 ST.getTargetLowering()->getImplicitParameterOffset(
1333 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1334 Register DstReg = MI.getOperand(0).getReg();
1335 LLT DstTy = MRI.getType(DstReg);
1336 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1338 const ArgDescriptor *Arg;
1339 const TargetRegisterClass *RC;
1340 std::tie(Arg, RC)
1341 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1342 if (!Arg)
1343 return false;
1345 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1346 if (!loadInputValue(KernargPtrReg, B, Arg))
1347 return false;
1349 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1350 MI.eraseFromParent();
1351 return true;
1354 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1355 MachineRegisterInfo &MRI,
1356 MachineIRBuilder &B) const {
1357 // Replace the G_BRCOND that uses the condition with the exec-manipulating branch pseudos.
1358 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1359 case Intrinsic::amdgcn_if: {
1360 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1361 const SIRegisterInfo *TRI
1362 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1364 B.setInstr(*BrCond);
1365 Register Def = MI.getOperand(1).getReg();
1366 Register Use = MI.getOperand(3).getReg();
1367 B.buildInstr(AMDGPU::SI_IF)
1368 .addDef(Def)
1369 .addUse(Use)
1370 .addMBB(BrCond->getOperand(1).getMBB());
1372 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1373 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1374 MI.eraseFromParent();
1375 BrCond->eraseFromParent();
1376 return true;
1379 return false;
1381 case Intrinsic::amdgcn_loop: {
1382 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1383 const SIRegisterInfo *TRI
1384 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1386 B.setInstr(*BrCond);
1387 Register Reg = MI.getOperand(2).getReg();
1388 B.buildInstr(AMDGPU::SI_LOOP)
1389 .addUse(Reg)
1390 .addMBB(BrCond->getOperand(1).getMBB());
1391 MI.eraseFromParent();
1392 BrCond->eraseFromParent();
1393 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1394 return true;
1397 return false;
1399 case Intrinsic::amdgcn_kernarg_segment_ptr:
1400 return legalizePreloadedArgIntrin(
1401 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1402 case Intrinsic::amdgcn_implicitarg_ptr:
1403 return legalizeImplicitArgPtr(MI, MRI, B);
1404 case Intrinsic::amdgcn_workitem_id_x:
1405 return legalizePreloadedArgIntrin(MI, MRI, B,
1406 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1407 case Intrinsic::amdgcn_workitem_id_y:
1408 return legalizePreloadedArgIntrin(MI, MRI, B,
1409 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1410 case Intrinsic::amdgcn_workitem_id_z:
1411 return legalizePreloadedArgIntrin(MI, MRI, B,
1412 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1413 case Intrinsic::amdgcn_workgroup_id_x:
1414 return legalizePreloadedArgIntrin(MI, MRI, B,
1415 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1416 case Intrinsic::amdgcn_workgroup_id_y:
1417 return legalizePreloadedArgIntrin(MI, MRI, B,
1418 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1419 case Intrinsic::amdgcn_workgroup_id_z:
1420 return legalizePreloadedArgIntrin(MI, MRI, B,
1421 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1422 case Intrinsic::amdgcn_dispatch_ptr:
1423 return legalizePreloadedArgIntrin(MI, MRI, B,
1424 AMDGPUFunctionArgInfo::DISPATCH_PTR);
1425 case Intrinsic::amdgcn_queue_ptr:
1426 return legalizePreloadedArgIntrin(MI, MRI, B,
1427 AMDGPUFunctionArgInfo::QUEUE_PTR);
1428 case Intrinsic::amdgcn_implicit_buffer_ptr:
1429 return legalizePreloadedArgIntrin(
1430 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1431 case Intrinsic::amdgcn_dispatch_id:
1432 return legalizePreloadedArgIntrin(MI, MRI, B,
1433 AMDGPUFunctionArgInfo::DISPATCH_ID);
1434 case Intrinsic::amdgcn_fdiv_fast:
1435 return legalizeFDIVFast(MI, MRI, B);
1436 default:
1437 return true;
1440 return true;