//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
26 #define DEBUG_TYPE "amdgpu-legalinfo"
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
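// Legality predicate: scalar or vector types whose element size is a multiple
// of 32 bits and whose total size is at most MaxSize bits.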
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
          unsigned Size = Query.Types[0].getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (Size > 32 && MemSize < Size);
        },
        [](const LegalityQuery &Query) {
          return std::make_pair(0, LLT::scalar(32));
        })
    .fewerElementsIf([=](const LegalityQuery &Query) {
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          return (MemSize == 96) &&
                 Query.Types[0].isVector() &&
                 !ST.hasDwordx3LoadStores();
        },
        [=](const LegalityQuery &Query) {
          return std::make_pair(0, V2S32);
        })
    .legalIf([=](const LegalityQuery &Query) {
          const LLT &Ty0 = Query.Types[0];

          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          if (Size < 32 || (Size > 32 && MemSize < Size))
            return false;

          if (Ty0.isVector() && Size != MemSize)
            return false;

          // TODO: Decompose private loads into 4-byte components.
          // TODO: Illegal flat loads on SI
          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;

          case 96:
            return ST.hasDwordx3LoadStores();

          case 256:
          case 512:
            // TODO: Possibly support loads of i256 and i512. This will require
            // adding i256 and i512 types to MVT in order to be able to use
            // TableGen.
            // TODO: Add support for other vector types, this will require
            // defining more value mappings for the new types.
            return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                      Ty0.getScalarType().getSizeInBits() == 64);

          default:
            return false;
          }
        })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

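// Return a 32-bit value holding the high half (the "aperture") of the 64-bit
// base address for the local or private segment. On subtargets with aperture
// registers this is read with S_GETREG_B32; otherwise it is loaded relative to
// the queue pointer (currently a placeholder constant, see the FIXME below).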
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, // The aperture is a single 32-bit (4-byte) field.
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

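// Custom lowering for G_ADDRSPACE_CAST. No-op casts become G_BITCAST. A cast
// from flat to local/private keeps the low 32 bits of the pointer and maps the
// flat null value to the segment null value; a cast from local/private to flat
// rebuilds the 64-bit pointer from the 32-bit offset and the segment aperture,
// again with a null-pointer check.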
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

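// Expand 64-bit G_FRINT by adding and then subtracting copysign(2^52, src),
// which forces the FP unit to round to an integer. Values whose magnitude is
// at least 2^52 are already integral and are passed through unchanged via the
// final compare and select.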
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

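// Extract the 11-bit biased exponent from the high 32 bits of an f64 value
// using llvm.amdgcn.ubfe, and subtract the bias (1023) to produce the unbiased
// exponent.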
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi) // Bitfield source: the high word passed in by the caller.
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

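// Expand 64-bit G_INTRINSIC_TRUNC without a native instruction by masking off
// the fraction bits that lie below the exponent. An exponent below zero yields
// a signed zero (sign bit only); an exponent above 51 means the value is
// already integral and is returned unchanged.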
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

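// Convert a 64-bit integer to f64 by converting each 32-bit half separately:
// scale the converted high half by 2^32 with llvm.amdgcn.ldexp and add the
// (always unsigned) converted low half.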
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

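// Lower extracts with a constant index to a G_EXTRACT at the corresponding bit
// offset. Out-of-bounds constant indices become undef; dynamic indices are
// left alone and handled with register indexing during selection.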
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

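// Copy a preloaded argument register into DstReg, creating the live-in virtual
// register and the entry-block copy if they do not exist yet. Masked arguments
// are unpacked with a shift and mask.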
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

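// For entry functions the implicit argument pointer is computed as the kernarg
// segment pointer plus the implicit parameter offset; other functions fall
// back to the preloaded IMPLICIT_ARG_PTR argument.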
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  default:
    return true;
  }

  return true;
}