1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
34 #define DEBUG_TYPE "amdgpu-legalinfo"
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43 unsigned MaxSize = 512) {
44 return [=](const LegalityQuery &Query) {
45 const LLT Ty = Query.Types[TypeIdx];
46 const LLT EltTy = Ty.getScalarType();
47 return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
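// For example, s64 and <2 x s32> satisfy this predicate; s48 and <4 x s16>
// do not, since their scalar type is not a multiple of 32 bits.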
51 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
52 return [=](const LegalityQuery &Query) {
53 const LLT Ty = Query.Types[TypeIdx];
54 return Ty.isVector() &&
55 Ty.getNumElements() % 2 != 0 &&
56 Ty.getElementType().getSizeInBits() < 32;
60 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
61 return [=](const LegalityQuery &Query) {
62 const LLT Ty = Query.Types[TypeIdx];
63 const LLT EltTy = Ty.getElementType();
64 return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
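// e.g. <3 x s16> becomes <4 x s16> and <5 x s8> becomes <6 x s8>.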
68 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
69 return [=](const LegalityQuery &Query) {
70 const LLT Ty = Query.Types[TypeIdx];
71 const LLT EltTy = Ty.getElementType();
72 unsigned Size = Ty.getSizeInBits();
73 unsigned Pieces = (Size + 63) / 64;
74 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
75 return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
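// e.g. <4 x s32> (128 bits) is split into <2 x s32> pieces and <8 x s16>
// (128 bits) into <4 x s16> pieces.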
79 // Increase the number of vector elements so that the total type size reaches
80 // the next multiple of 32 bits.
81 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
82 return [=](const LegalityQuery &Query) {
83 const LLT Ty = Query.Types[TypeIdx];
85 const LLT EltTy = Ty.getElementType();
86 const int Size = Ty.getSizeInBits();
87 const int EltSize = EltTy.getSizeInBits();
88 const int NextMul32 = (Size + 31) / 32;
90 assert(EltSize < 32);
92 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
93 return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
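// e.g. <3 x s8> (24 bits) is widened to <4 x s8> (32 bits) and <5 x s16>
// (80 bits) to <6 x s16> (96 bits).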
97 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
98 return [=](const LegalityQuery &Query) {
99 const LLT QueryTy = Query.Types[TypeIdx];
100 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
104 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
105 return [=](const LegalityQuery &Query) {
106 const LLT QueryTy = Query.Types[TypeIdx];
107 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
111 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
112 return [=](const LegalityQuery &Query) {
113 const LLT QueryTy = Query.Types[TypeIdx];
114 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
118 // Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
119 // v2s16.
120 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
121 return [=](const LegalityQuery &Query) {
122 const LLT Ty = Query.Types[TypeIdx];
123 if (Ty.isVector()) {
124 const int EltSize = Ty.getElementType().getSizeInBits();
125 return EltSize == 32 || EltSize == 64 ||
126 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
127 EltSize == 128 || EltSize == 256;
130 return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
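// e.g. s96, s256, <4 x s32>, <2 x s64> and <6 x s16> qualify; s48 and
// <3 x s16> do not.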
134 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
135 return [=](const LegalityQuery &Query) {
136 return Query.Types[TypeIdx].getElementType() == Type;
140 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
141 return [=](const LegalityQuery &Query) {
142 const LLT Ty = Query.Types[TypeIdx];
143 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
144 Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
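// e.g. a truncating store of an s64 value through a 32-bit memory access
// matches; a plain s32 store does not.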
148 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
149 const GCNTargetMachine &TM)
150 : ST(ST_) {
151 using namespace TargetOpcode;
153 auto GetAddrSpacePtr = [&TM](unsigned AS) {
154 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
157 const LLT S1 = LLT::scalar(1);
158 const LLT S8 = LLT::scalar(8);
159 const LLT S16 = LLT::scalar(16);
160 const LLT S32 = LLT::scalar(32);
161 const LLT S64 = LLT::scalar(64);
162 const LLT S96 = LLT::scalar(96);
163 const LLT S128 = LLT::scalar(128);
164 const LLT S256 = LLT::scalar(256);
165 const LLT S512 = LLT::scalar(512);
167 const LLT V2S16 = LLT::vector(2, 16);
168 const LLT V4S16 = LLT::vector(4, 16);
170 const LLT V2S32 = LLT::vector(2, 32);
171 const LLT V3S32 = LLT::vector(3, 32);
172 const LLT V4S32 = LLT::vector(4, 32);
173 const LLT V5S32 = LLT::vector(5, 32);
174 const LLT V6S32 = LLT::vector(6, 32);
175 const LLT V7S32 = LLT::vector(7, 32);
176 const LLT V8S32 = LLT::vector(8, 32);
177 const LLT V9S32 = LLT::vector(9, 32);
178 const LLT V10S32 = LLT::vector(10, 32);
179 const LLT V11S32 = LLT::vector(11, 32);
180 const LLT V12S32 = LLT::vector(12, 32);
181 const LLT V13S32 = LLT::vector(13, 32);
182 const LLT V14S32 = LLT::vector(14, 32);
183 const LLT V15S32 = LLT::vector(15, 32);
184 const LLT V16S32 = LLT::vector(16, 32);
186 const LLT V2S64 = LLT::vector(2, 64);
187 const LLT V3S64 = LLT::vector(3, 64);
188 const LLT V4S64 = LLT::vector(4, 64);
189 const LLT V5S64 = LLT::vector(5, 64);
190 const LLT V6S64 = LLT::vector(6, 64);
191 const LLT V7S64 = LLT::vector(7, 64);
192 const LLT V8S64 = LLT::vector(8, 64);
194 std::initializer_list<LLT> AllS32Vectors =
195 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
196 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
197 std::initializer_list<LLT> AllS64Vectors =
198 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
200 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
201 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
202 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
203 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
204 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
205 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
206 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
208 const LLT CodePtr = FlatPtr;
210 const std::initializer_list<LLT> AddrSpaces64 = {
211 GlobalPtr, ConstantPtr, FlatPtr
214 const std::initializer_list<LLT> AddrSpaces32 = {
215 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
218 const std::initializer_list<LLT> FPTypesBase = {
219 S32, S64
222 const std::initializer_list<LLT> FPTypes16 = {
223 S32, S64, S16
226 const std::initializer_list<LLT> FPTypesPK16 = {
227 S32, S64, S16, V2S16
230 setAction({G_BRCOND, S1}, Legal);
232 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
233 // elements for v3s16
234 getActionDefinitionsBuilder(G_PHI)
235 .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
236 .legalFor(AllS32Vectors)
237 .legalFor(AllS64Vectors)
238 .legalFor(AddrSpaces64)
239 .legalFor(AddrSpaces32)
240 .clampScalar(0, S32, S256)
241 .widenScalarToNextPow2(0, 32)
242 .clampMaxNumElements(0, S32, 16)
243 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
244 .legalIf(isPointer(0));
246 if (ST.has16BitInsts()) {
247 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
248 .legalFor({S32, S16})
249 .clampScalar(0, S16, S32)
250 .scalarize(0);
251 } else {
252 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
253 .legalFor({S32})
254 .clampScalar(0, S32, S32)
255 .scalarize(0);
258 getActionDefinitionsBuilder({G_UMULH, G_SMULH})
259 .legalFor({S32})
260 .clampScalar(0, S32, S32)
261 .scalarize(0);
263 // Report legal for any types we can handle anywhere. For the cases only legal
264 // on the SALU, RegBankSelect will be able to re-legalize.
265 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
266 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
267 .clampScalar(0, S32, S64)
268 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
269 .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
270 .widenScalarToNextPow2(0)
271 .scalarize(0);
273 getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
274 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
275 .legalFor({{S32, S1}})
276 .clampScalar(0, S32, S32);
278 getActionDefinitionsBuilder(G_BITCAST)
279 .legalForCartesianProduct({S32, V2S16})
280 .legalForCartesianProduct({S64, V2S32, V4S16})
281 .legalForCartesianProduct({V2S64, V4S32})
282 // Don't worry about the size constraint.
283 .legalIf(all(isPointer(0), isPointer(1)))
284 // FIXME: Testing hack
285 .legalForCartesianProduct({S16, LLT::vector(2, 8), });
287 getActionDefinitionsBuilder(G_FCONSTANT)
288 .legalFor({S32, S64, S16})
289 .clampScalar(0, S16, S64);
291 getActionDefinitionsBuilder(G_IMPLICIT_DEF)
292 .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
293 ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
294 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
295 .clampScalarOrElt(0, S32, S512)
296 .legalIf(isMultiple32(0))
297 .widenScalarToNextPow2(0, 32)
298 .clampMaxNumElements(0, S32, 16);
301 // FIXME: i1 operands to intrinsics should always be legal, but other i1
302 // values may not be legal. We need to figure out how to distinguish
303 // between these two scenarios.
304 getActionDefinitionsBuilder(G_CONSTANT)
305 .legalFor({S1, S32, S64, S16, GlobalPtr,
306 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
307 .clampScalar(0, S32, S64)
308 .widenScalarToNextPow2(0)
309 .legalIf(isPointer(0));
311 setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
312 getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});
315 auto &FPOpActions = getActionDefinitionsBuilder(
316 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
317 .legalFor({S32, S64});
318 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
319 .customFor({S32, S64});
321 if (ST.has16BitInsts()) {
322 if (ST.hasVOP3PInsts())
323 FPOpActions.legalFor({S16, V2S16});
324 else
325 FPOpActions.legalFor({S16});
327 TrigActions.customFor({S16});
330 auto &MinNumMaxNum = getActionDefinitionsBuilder({
331 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
333 if (ST.hasVOP3PInsts()) {
334 MinNumMaxNum.customFor(FPTypesPK16)
335 .clampMaxNumElements(0, S16, 2)
336 .clampScalar(0, S16, S64)
337 .scalarize(0);
338 } else if (ST.has16BitInsts()) {
339 MinNumMaxNum.customFor(FPTypes16)
340 .clampScalar(0, S16, S64)
341 .scalarize(0);
342 } else {
343 MinNumMaxNum.customFor(FPTypesBase)
344 .clampScalar(0, S32, S64)
345 .scalarize(0);
348 if (ST.hasVOP3PInsts())
349 FPOpActions.clampMaxNumElements(0, S16, 2);
351 FPOpActions
352 .scalarize(0)
353 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
355 TrigActions
356 .scalarize(0)
357 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
359 getActionDefinitionsBuilder({G_FNEG, G_FABS})
360 .legalFor(FPTypesPK16)
361 .clampMaxNumElements(0, S16, 2)
362 .scalarize(0)
363 .clampScalar(0, S16, S64);
365 // TODO: Implement
366 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
368 if (ST.has16BitInsts()) {
369 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
370 .legalFor({S32, S64, S16})
371 .scalarize(0)
372 .clampScalar(0, S16, S64);
373 } else {
374 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
375 .legalFor({S32, S64})
376 .scalarize(0)
377 .clampScalar(0, S32, S64);
380 getActionDefinitionsBuilder(G_FPTRUNC)
381 .legalFor({{S32, S64}, {S16, S32}})
382 .scalarize(0);
384 getActionDefinitionsBuilder(G_FPEXT)
385 .legalFor({{S64, S32}, {S32, S16}})
386 .lowerFor({{S64, S16}}) // FIXME: Implement
387 .scalarize(0);
389 // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
390 getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
392 getActionDefinitionsBuilder(G_FSUB)
393 // Use actual fsub instruction
394 .legalFor({S32})
395 // Must use fadd + fneg
396 .lowerFor({S64, S16, V2S16})
397 .scalarize(0)
398 .clampScalar(0, S32, S64);
400 // Whether this is legal depends on the floating point mode for the function.
401 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
402 if (ST.hasMadF16())
403 FMad.customFor({S32, S16});
404 else
405 FMad.customFor({S32});
406 FMad.scalarize(0)
407 .lower();
409 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
410 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
411 {S32, S1}, {S64, S1}, {S16, S1},
412 {S96, S32},
413 // FIXME: Hack
414 {S64, LLT::scalar(33)},
415 {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
416 .scalarize(0);
418 // TODO: Legal for s1->s64, requires split for VALU.
419 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
420 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}})
421 .lowerFor({{S32, S64}})
422 .customFor({{S64, S64}})
423 .scalarize(0);
425 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
426 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
427 .scalarize(0);
429 getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
430 .legalFor({S32, S64})
431 .scalarize(0);
433 if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
434 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
435 .legalFor({S32, S64})
436 .clampScalar(0, S32, S64)
437 .scalarize(0);
438 } else {
439 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
440 .legalFor({S32})
441 .customFor({S64})
442 .clampScalar(0, S32, S64)
443 .scalarize(0);
446 getActionDefinitionsBuilder(G_GEP)
447 .legalForCartesianProduct(AddrSpaces64, {S64})
448 .legalForCartesianProduct(AddrSpaces32, {S32})
449 .scalarize(0);
451 getActionDefinitionsBuilder(G_PTR_MASK)
452 .scalarize(0)
453 .alwaysLegal();
455 setAction({G_BLOCK_ADDR, CodePtr}, Legal);
457 auto &CmpBuilder =
458 getActionDefinitionsBuilder(G_ICMP)
459 .legalForCartesianProduct(
460 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
461 .legalFor({{S1, S32}, {S1, S64}});
462 if (ST.has16BitInsts()) {
463 CmpBuilder.legalFor({{S1, S16}});
466 CmpBuilder
467 .widenScalarToNextPow2(1)
468 .clampScalar(1, S32, S64)
469 .scalarize(0)
470 .legalIf(all(typeIs(0, S1), isPointer(1)));
472 getActionDefinitionsBuilder(G_FCMP)
473 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
474 .widenScalarToNextPow2(1)
475 .clampScalar(1, S32, S64)
476 .scalarize(0);
478 // FIXME: fexp, flog2, and flog10 need to be custom lowered.
479 getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
480 G_FLOG, G_FLOG2, G_FLOG10})
481 .legalFor({S32})
482 .scalarize(0);
484 // The 64-bit versions produce 32-bit results, but only on the SALU.
485 getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
486 G_CTTZ, G_CTTZ_ZERO_UNDEF,
487 G_CTPOP})
488 .legalFor({{S32, S32}, {S32, S64}})
489 .clampScalar(0, S32, S32)
490 .clampScalar(1, S32, S64)
491 .scalarize(0)
492 .widenScalarToNextPow2(0, 32)
493 .widenScalarToNextPow2(1, 32);
495 // TODO: Expand for > s32
496 getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
497 .legalFor({S32})
498 .clampScalar(0, S32, S32)
499 .scalarize(0);
501 if (ST.has16BitInsts()) {
502 if (ST.hasVOP3PInsts()) {
503 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
504 .legalFor({S32, S16, V2S16})
505 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
506 .clampMaxNumElements(0, S16, 2)
507 .clampScalar(0, S16, S32)
508 .widenScalarToNextPow2(0)
509 .scalarize(0);
510 } else {
511 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
512 .legalFor({S32, S16})
513 .widenScalarToNextPow2(0)
514 .clampScalar(0, S16, S32)
515 .scalarize(0);
517 } else {
518 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
519 .legalFor({S32})
520 .clampScalar(0, S32, S32)
521 .widenScalarToNextPow2(0)
522 .scalarize(0);
525 auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
526 return [=](const LegalityQuery &Query) {
527 return Query.Types[TypeIdx0].getSizeInBits() <
528 Query.Types[TypeIdx1].getSizeInBits();
532 auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
533 return [=](const LegalityQuery &Query) {
534 return Query.Types[TypeIdx0].getSizeInBits() >
535 Query.Types[TypeIdx1].getSizeInBits();
539 getActionDefinitionsBuilder(G_INTTOPTR)
540 // List the common cases
541 .legalForCartesianProduct(AddrSpaces64, {S64})
542 .legalForCartesianProduct(AddrSpaces32, {S32})
543 .scalarize(0)
544 // Accept any address space as long as the size matches
545 .legalIf(sameSize(0, 1))
546 .widenScalarIf(smallerThan(1, 0),
547 [](const LegalityQuery &Query) {
548 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
550 .narrowScalarIf(greaterThan(1, 0),
551 [](const LegalityQuery &Query) {
552 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
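// Both mutations above resize the integer operand (type index 1) to match
// the width of the destination pointer (type index 0).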
555 getActionDefinitionsBuilder(G_PTRTOINT)
556 // List the common cases
557 .legalForCartesianProduct(AddrSpaces64, {S64})
558 .legalForCartesianProduct(AddrSpaces32, {S32})
559 .scalarize(0)
560 // Accept any address space as long as the size matches
561 .legalIf(sameSize(0, 1))
562 .widenScalarIf(smallerThan(0, 1),
563 [](const LegalityQuery &Query) {
564 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
566 .narrowScalarIf(
567 greaterThan(0, 1),
568 [](const LegalityQuery &Query) {
569 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
572 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
573 .scalarize(0)
574 .custom();
576 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
577 // handle some operations by just promoting the register during
578 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
579 auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
580 switch (AS) {
581 // FIXME: Private element size.
582 case AMDGPUAS::PRIVATE_ADDRESS:
583 return 32;
584 // FIXME: Check subtarget
585 case AMDGPUAS::LOCAL_ADDRESS:
586 return ST.useDS128() ? 128 : 64;
588 // Treat constant and global as identical. SMRD loads are sometimes usable
589 // for global loads (ideally constant address space should be eliminated)
590 // depending on the context. Legality cannot be context dependent, but
591 // RegBankSelect can split the load as necessary depending on the pointer
592 // register bank/uniformity and whether the memory is invariant or not written in
593 // a kernel.
594 case AMDGPUAS::CONSTANT_ADDRESS:
595 case AMDGPUAS::GLOBAL_ADDRESS:
596 return 512;
597 default:
598 return 128;
602 const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
603 const LLT DstTy = Query.Types[0];
605 // Split vector extloads.
606 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
607 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
608 return true;
610 const LLT PtrTy = Query.Types[1];
611 unsigned AS = PtrTy.getAddressSpace();
612 if (MemSize > maxSizeForAddrSpace(AS))
613 return true;
615 // Catch weird sized loads that don't evenly divide into the access sizes
616 // TODO: May be able to widen depending on alignment etc.
617 unsigned NumRegs = MemSize / 32;
618 if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
619 return true;
621 unsigned Align = Query.MMODescrs[0].AlignInBits;
622 if (Align < MemSize) {
623 const SITargetLowering *TLI = ST.getTargetLowering();
624 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
627 return false;
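// e.g. a 96-bit access is split on subtargets without dwordx3 load/stores,
// and any access wider than maxSizeForAddrSpace() for its address space is
// split as well.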
630 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
631 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
632 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
634 // TODO: Refine based on subtargets which support unaligned access or 128-bit
635 // LDS
636 // TODO: Unsupported flat for SI.
638 for (unsigned Op : {G_LOAD, G_STORE}) {
639 const bool IsStore = Op == G_STORE;
641 auto &Actions = getActionDefinitionsBuilder(Op);
642 // Whitelist the common cases.
643 // TODO: Pointer loads
644 // TODO: Wide constant loads
645 // TODO: Only CI+ has 3x loads
646 // TODO: Loads to s16 on gfx9
647 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
648 {V2S32, GlobalPtr, 64, GlobalAlign32},
649 {V3S32, GlobalPtr, 96, GlobalAlign32},
650 {S96, GlobalPtr, 96, GlobalAlign32},
651 {V4S32, GlobalPtr, 128, GlobalAlign32},
652 {S128, GlobalPtr, 128, GlobalAlign32},
653 {S64, GlobalPtr, 64, GlobalAlign32},
654 {V2S64, GlobalPtr, 128, GlobalAlign32},
655 {V2S16, GlobalPtr, 32, GlobalAlign32},
656 {S32, GlobalPtr, 8, GlobalAlign8},
657 {S32, GlobalPtr, 16, GlobalAlign16},
659 {S32, LocalPtr, 32, 32},
660 {S64, LocalPtr, 64, 32},
661 {V2S32, LocalPtr, 64, 32},
662 {S32, LocalPtr, 8, 8},
663 {S32, LocalPtr, 16, 16},
664 {V2S16, LocalPtr, 32, 32},
666 {S32, PrivatePtr, 32, 32},
667 {S32, PrivatePtr, 8, 8},
668 {S32, PrivatePtr, 16, 16},
669 {V2S16, PrivatePtr, 32, 32},
671 {S32, FlatPtr, 32, GlobalAlign32},
672 {S32, FlatPtr, 16, GlobalAlign16},
673 {S32, FlatPtr, 8, GlobalAlign8},
674 {V2S16, FlatPtr, 32, GlobalAlign32},
676 {S32, ConstantPtr, 32, GlobalAlign32},
677 {V2S32, ConstantPtr, 64, GlobalAlign32},
678 {V3S32, ConstantPtr, 96, GlobalAlign32},
679 {V4S32, ConstantPtr, 128, GlobalAlign32},
680 {S64, ConstantPtr, 64, GlobalAlign32},
681 {S128, ConstantPtr, 128, GlobalAlign32},
682 {V2S32, ConstantPtr, 32, GlobalAlign32}});
683 Actions
684 .customIf(typeIs(1, Constant32Ptr))
685 .narrowScalarIf(
686 [=](const LegalityQuery &Query) -> bool {
687 return !Query.Types[0].isVector() && needToSplitLoad(Query);
689 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
690 const LLT DstTy = Query.Types[0];
691 const LLT PtrTy = Query.Types[1];
693 const unsigned DstSize = DstTy.getSizeInBits();
694 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
696 // Split extloads.
697 if (DstSize > MemSize)
698 return std::make_pair(0, LLT::scalar(MemSize));
700 if (DstSize > 32 && (DstSize % 32 != 0)) {
701 // FIXME: Need a way to specify non-extload of larger size if
702 // suitably aligned.
703 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
706 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
707 if (MemSize > MaxSize)
708 return std::make_pair(0, LLT::scalar(MaxSize));
710 unsigned Align = Query.MMODescrs[0].AlignInBits;
711 return std::make_pair(0, LLT::scalar(Align));
713 .fewerElementsIf(
714 [=](const LegalityQuery &Query) -> bool {
715 return Query.Types[0].isVector() && needToSplitLoad(Query);
717 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
718 const LLT DstTy = Query.Types[0];
719 const LLT PtrTy = Query.Types[1];
721 LLT EltTy = DstTy.getElementType();
722 unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
724 // Split if it's too large for the address space.
725 if (Query.MMODescrs[0].SizeInBits > MaxSize) {
726 unsigned NumElts = DstTy.getNumElements();
727 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
729 // FIXME: Refine when odd breakdowns handled
730 // The scalars will need to be re-legalized.
731 if (NumPieces == 1 || NumPieces >= NumElts ||
732 NumElts % NumPieces != 0)
733 return std::make_pair(0, EltTy);
735 return std::make_pair(0,
736 LLT::vector(NumElts / NumPieces, EltTy));
739 // Need to split because of alignment.
740 unsigned Align = Query.MMODescrs[0].AlignInBits;
741 unsigned EltSize = EltTy.getSizeInBits();
742 if (EltSize > Align &&
743 (EltSize / Align < DstTy.getNumElements())) {
744 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
747 // May need relegalization for the scalars.
748 return std::make_pair(0, EltTy);
750 .minScalar(0, S32);
752 if (IsStore)
753 Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
755 // TODO: Need a bitcast lower option?
756 Actions
757 .legalIf([=](const LegalityQuery &Query) {
758 const LLT Ty0 = Query.Types[0];
759 unsigned Size = Ty0.getSizeInBits();
760 unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761 unsigned Align = Query.MMODescrs[0].AlignInBits;
763 // No extending vector loads.
764 if (Size > MemSize && Ty0.isVector())
765 return false;
767 // FIXME: Widening store from alignment not valid.
768 if (MemSize < Size)
769 MemSize = std::max(MemSize, Align);
771 switch (MemSize) {
772 case 8:
773 case 16:
774 return Size == 32;
775 case 32:
776 case 64:
777 case 128:
778 return true;
779 case 96:
780 return ST.hasDwordx3LoadStores();
781 case 256:
782 case 512:
783 return true;
784 default:
785 return false;
788 .widenScalarToNextPow2(0)
789 // TODO: v3s32->v4s32 with alignment
790 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
793 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
794 .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
795 {S32, GlobalPtr, 16, 2 * 8},
796 {S32, LocalPtr, 8, 8},
797 {S32, LocalPtr, 16, 16},
798 {S32, PrivatePtr, 8, 8},
799 {S32, PrivatePtr, 16, 16},
800 {S32, ConstantPtr, 8, 8},
801 {S32, ConstantPtr, 16, 2 * 8}});
802 if (ST.hasFlatAddressSpace()) {
803 ExtLoads.legalForTypesWithMemDesc(
804 {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
807 ExtLoads.clampScalar(0, S32, S32)
808 .widenScalarToNextPow2(0)
809 .unsupportedIfMemSizeNotPow2()
810 .lower();
812 auto &Atomics = getActionDefinitionsBuilder(
813 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
814 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
815 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
816 G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
817 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
818 {S64, GlobalPtr}, {S64, LocalPtr}});
819 if (ST.hasFlatAddressSpace()) {
820 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
823 getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
824 .legalFor({{S32, LocalPtr}});
826 // TODO: Pointer types, any 32-bit or 64-bit vector
827 getActionDefinitionsBuilder(G_SELECT)
828 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
829 GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
830 LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
831 .clampScalar(0, S16, S64)
832 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
833 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
834 .scalarize(1)
835 .clampMaxNumElements(0, S32, 2)
836 .clampMaxNumElements(0, LocalPtr, 2)
837 .clampMaxNumElements(0, PrivatePtr, 2)
838 .scalarize(0)
839 .widenScalarToNextPow2(0)
840 .legalIf(all(isPointer(0), typeIs(1, S1)));
842 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
843 // be more flexible with the shift amount type.
844 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
845 .legalFor({{S32, S32}, {S64, S32}});
846 if (ST.has16BitInsts()) {
847 if (ST.hasVOP3PInsts()) {
848 Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
849 .clampMaxNumElements(0, S16, 2);
850 } else
851 Shifts.legalFor({{S16, S32}, {S16, S16}});
853 Shifts.clampScalar(1, S16, S32);
854 Shifts.clampScalar(0, S16, S64);
855 Shifts.widenScalarToNextPow2(0, 16);
856 } else {
857 // Make sure we legalize the shift amount type first, as the general
858 // expansion for the shifted type will produce much worse code if it hasn't
859 // been truncated already.
860 Shifts.clampScalar(1, S32, S32);
861 Shifts.clampScalar(0, S32, S64);
862 Shifts.widenScalarToNextPow2(0, 32);
864 Shifts.scalarize(0);
866 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
867 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
868 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
869 unsigned IdxTypeIdx = 2;
871 getActionDefinitionsBuilder(Op)
872 .customIf([=](const LegalityQuery &Query) {
873 const LLT EltTy = Query.Types[EltTypeIdx];
874 const LLT VecTy = Query.Types[VecTypeIdx];
875 const LLT IdxTy = Query.Types[IdxTypeIdx];
876 return (EltTy.getSizeInBits() == 16 ||
877 EltTy.getSizeInBits() % 32 == 0) &&
878 VecTy.getSizeInBits() % 32 == 0 &&
879 VecTy.getSizeInBits() <= 512 &&
880 IdxTy.getSizeInBits() == 32;
882 .clampScalar(EltTypeIdx, S32, S64)
883 .clampScalar(VecTypeIdx, S32, S64)
884 .clampScalar(IdxTypeIdx, S32, S32);
887 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
888 .unsupportedIf([=](const LegalityQuery &Query) {
889 const LLT &EltTy = Query.Types[1].getElementType();
890 return Query.Types[0] != EltTy;
893 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
894 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
895 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
897 // FIXME: Doesn't handle extract of illegal sizes.
898 getActionDefinitionsBuilder(Op)
899 .legalIf([=](const LegalityQuery &Query) {
900 const LLT BigTy = Query.Types[BigTyIdx];
901 const LLT LitTy = Query.Types[LitTyIdx];
902 return (BigTy.getSizeInBits() % 32 == 0) &&
903 (LitTy.getSizeInBits() % 16 == 0);
905 .widenScalarIf(
906 [=](const LegalityQuery &Query) {
907 const LLT BigTy = Query.Types[BigTyIdx];
908 return (BigTy.getScalarSizeInBits() < 16);
910 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
911 .widenScalarIf(
912 [=](const LegalityQuery &Query) {
913 const LLT LitTy = Query.Types[LitTyIdx];
914 return (LitTy.getScalarSizeInBits() < 16);
916 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
917 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
918 .widenScalarToNextPow2(BigTyIdx, 32);
922 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
923 .legalForCartesianProduct(AllS32Vectors, {S32})
924 .legalForCartesianProduct(AllS64Vectors, {S64})
925 .clampNumElements(0, V16S32, V16S32)
926 .clampNumElements(0, V2S64, V8S64);
928 if (ST.hasScalarPackInsts())
929 BuildVector.legalFor({V2S16, S32});
931 BuildVector
932 .minScalarSameAs(1, 0)
933 .legalIf(isRegisterType(0))
934 .minScalarOrElt(0, S32);
936 if (ST.hasScalarPackInsts()) {
937 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
938 .legalFor({V2S16, S32})
939 .lower();
940 } else {
941 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
942 .lower();
945 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
946 .legalIf(isRegisterType(0));
948 // TODO: Don't fully scalarize v2s16 pieces
949 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
951 // Merge/Unmerge
952 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
953 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
954 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
956 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
957 const LLT &Ty = Query.Types[TypeIdx];
958 if (Ty.isVector()) {
959 const LLT &EltTy = Ty.getElementType();
960 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
961 return true;
962 if (!isPowerOf2_32(EltTy.getSizeInBits()))
963 return true;
965 return false;
968 getActionDefinitionsBuilder(Op)
969 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
970 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
971 // worth considering the multiples of 64 since 2*192 and 2*384 are not
972 // valid.
973 .clampScalar(LitTyIdx, S16, S256)
974 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
975 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
976 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
977 elementTypeIs(1, S16)),
978 changeTo(1, V2S16))
979 // Break up vectors with weird elements into scalars
980 .fewerElementsIf(
981 [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
982 scalarize(0))
983 .fewerElementsIf(
984 [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
985 scalarize(1))
986 .clampScalar(BigTyIdx, S32, S512)
987 .lowerFor({{S16, V2S16}})
988 .widenScalarIf(
989 [=](const LegalityQuery &Query) {
990 const LLT &Ty = Query.Types[BigTyIdx];
991 return !isPowerOf2_32(Ty.getSizeInBits()) &&
992 Ty.getSizeInBits() % 16 != 0;
994 [=](const LegalityQuery &Query) {
995 // Pick the next power of 2, or a multiple of 64 if the size is over 128,
996 // whichever is smaller.
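// e.g. an s88 type is widened to s128 (the next power of 2), while an s264
// type is widened to s320 (the next multiple of 64), which is smaller than
// the next power of 2 (512).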
997 const LLT &Ty = Query.Types[BigTyIdx];
998 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
999 if (NewSizeInBits >= 256) {
1000 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1001 if (RoundedTo < NewSizeInBits)
1002 NewSizeInBits = RoundedTo;
1004 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1006 .legalIf([=](const LegalityQuery &Query) {
1007 const LLT &BigTy = Query.Types[BigTyIdx];
1008 const LLT &LitTy = Query.Types[LitTyIdx];
1010 if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1011 return false;
1012 if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1013 return false;
1015 return BigTy.getSizeInBits() % 16 == 0 &&
1016 LitTy.getSizeInBits() % 16 == 0 &&
1017 BigTy.getSizeInBits() <= 512;
1019 // Any vectors left are the wrong size. Scalarize them.
1020 .scalarize(0)
1021 .scalarize(1);
1024 getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1026 computeTables();
1027 verify(*ST.getInstrInfo());
1030 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1031 MachineRegisterInfo &MRI,
1032 MachineIRBuilder &B,
1033 GISelChangeObserver &Observer) const {
1034 switch (MI.getOpcode()) {
1035 case TargetOpcode::G_ADDRSPACE_CAST:
1036 return legalizeAddrSpaceCast(MI, MRI, B);
1037 case TargetOpcode::G_FRINT:
1038 return legalizeFrint(MI, MRI, B);
1039 case TargetOpcode::G_FCEIL:
1040 return legalizeFceil(MI, MRI, B);
1041 case TargetOpcode::G_INTRINSIC_TRUNC:
1042 return legalizeIntrinsicTrunc(MI, MRI, B);
1043 case TargetOpcode::G_SITOFP:
1044 return legalizeITOFP(MI, MRI, B, true);
1045 case TargetOpcode::G_UITOFP:
1046 return legalizeITOFP(MI, MRI, B, false);
1047 case TargetOpcode::G_FMINNUM:
1048 case TargetOpcode::G_FMAXNUM:
1049 case TargetOpcode::G_FMINNUM_IEEE:
1050 case TargetOpcode::G_FMAXNUM_IEEE:
1051 return legalizeMinNumMaxNum(MI, MRI, B);
1052 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1053 return legalizeExtractVectorElt(MI, MRI, B);
1054 case TargetOpcode::G_INSERT_VECTOR_ELT:
1055 return legalizeInsertVectorElt(MI, MRI, B);
1056 case TargetOpcode::G_FSIN:
1057 case TargetOpcode::G_FCOS:
1058 return legalizeSinCos(MI, MRI, B);
1059 case TargetOpcode::G_GLOBAL_VALUE:
1060 return legalizeGlobalValue(MI, MRI, B);
1061 case TargetOpcode::G_LOAD:
1062 return legalizeLoad(MI, MRI, B, Observer);
1063 case TargetOpcode::G_FMAD:
1064 return legalizeFMad(MI, MRI, B);
1065 default:
1066 return false;
1069 llvm_unreachable("expected switch to return");
1072 Register AMDGPULegalizerInfo::getSegmentAperture(
1073 unsigned AS,
1074 MachineRegisterInfo &MRI,
1075 MachineIRBuilder &B) const {
1076 MachineFunction &MF = B.getMF();
1077 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078 const LLT S32 = LLT::scalar(32);
1080 if (ST.hasApertureRegs()) {
1081 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1082 // getreg.
1083 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1084 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1085 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1086 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1087 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1088 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1089 unsigned Encoding =
1090 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1091 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1092 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1094 Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1095 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1097 B.buildInstr(AMDGPU::S_GETREG_B32)
1098 .addDef(GetReg)
1099 .addImm(Encoding);
1100 MRI.setType(GetReg, S32);
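// The hwreg read returns the aperture base in the low bits of GetReg;
// shifting left by the field width (WidthM1 + 1) moves it into position to
// form the 32-bit aperture value.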
1102 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1103 B.buildInstr(TargetOpcode::G_SHL)
1104 .addDef(ApertureReg)
1105 .addUse(GetReg)
1106 .addUse(ShiftAmt.getReg(0));
1108 return ApertureReg;
1111 Register QueuePtr = MRI.createGenericVirtualRegister(
1112 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1114 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1115 if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1116 return Register();
1118 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1119 // private_segment_aperture_base_hi.
1120 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1122 // FIXME: Don't use undef
1123 Value *V = UndefValue::get(PointerType::get(
1124 Type::getInt8Ty(MF.getFunction().getContext()),
1125 AMDGPUAS::CONSTANT_ADDRESS));
1127 MachinePointerInfo PtrInfo(V, StructOffset);
1128 MachineMemOperand *MMO = MF.getMachineMemOperand(
1129 PtrInfo,
1130 MachineMemOperand::MOLoad |
1131 MachineMemOperand::MODereferenceable |
1132 MachineMemOperand::MOInvariant,
1133 4,
1134 MinAlign(64, StructOffset));
1136 Register LoadResult = MRI.createGenericVirtualRegister(S32);
1137 Register LoadAddr;
1139 B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1140 B.buildLoad(LoadResult, LoadAddr, *MMO);
1141 return LoadResult;
1144 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1145 MachineInstr &MI, MachineRegisterInfo &MRI,
1146 MachineIRBuilder &B) const {
1147 MachineFunction &MF = B.getMF();
1149 B.setInstr(MI);
1151 const LLT S32 = LLT::scalar(32);
1152 Register Dst = MI.getOperand(0).getReg();
1153 Register Src = MI.getOperand(1).getReg();
1155 LLT DstTy = MRI.getType(Dst);
1156 LLT SrcTy = MRI.getType(Src);
1157 unsigned DestAS = DstTy.getAddressSpace();
1158 unsigned SrcAS = SrcTy.getAddressSpace();
1160 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1161 // vector element.
1162 assert(!DstTy.isVector());
1164 const AMDGPUTargetMachine &TM
1165 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168 if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1169 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1170 return true;
1173 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1174 // Truncate.
1175 B.buildExtract(Dst, Src, 0);
1176 MI.eraseFromParent();
1177 return true;
1180 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1181 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1182 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1184 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1185 // another. Merge operands are required to be the same type, but creating an
1186 // extra ptrtoint would be kind of pointless.
1187 auto HighAddr = B.buildConstant(
1188 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1189 B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1190 MI.eraseFromParent();
1191 return true;
1194 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1195 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1196 DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1197 unsigned NullVal = TM.getNullPointerValue(DestAS);
1199 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1200 auto FlatNull = B.buildConstant(SrcTy, 0);
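// A null flat pointer must be mapped to the segment's null value rather
// than to its low 32 bits, hence the compare-and-select below.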
1202 Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1204 // Extract low 32-bits of the pointer.
1205 B.buildExtract(PtrLo32, Src, 0);
1207 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1208 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1209 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1211 MI.eraseFromParent();
1212 return true;
1215 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1216 return false;
1218 if (!ST.hasFlatAddressSpace())
1219 return false;
1221 auto SegmentNull =
1222 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1223 auto FlatNull =
1224 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1226 Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
1227 if (!ApertureReg.isValid())
1228 return false;
1230 Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1231 B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1233 Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1235 // Coerce the type of the low half of the result so we can use merge_values.
1236 Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1237 B.buildInstr(TargetOpcode::G_PTRTOINT)
1238 .addDef(SrcAsInt)
1239 .addUse(Src);
1241 // TODO: Should we allow mismatched types but matching sizes in merges to
1242 // avoid the ptrtoint?
1243 B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1244 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1246 MI.eraseFromParent();
1247 return true;
1250 bool AMDGPULegalizerInfo::legalizeFrint(
1251 MachineInstr &MI, MachineRegisterInfo &MRI,
1252 MachineIRBuilder &B) const {
1253 B.setInstr(MI);
1255 Register Src = MI.getOperand(1).getReg();
1256 LLT Ty = MRI.getType(Src);
1257 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1259 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1260 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1262 auto C1 = B.buildFConstant(Ty, C1Val);
1263 auto CopySign = B.buildFCopysign(Ty, C1, Src);
1265 // TODO: Should this propagate fast-math-flags?
1266 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1267 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1269 auto C2 = B.buildFConstant(Ty, C2Val);
1270 auto Fabs = B.buildFAbs(Ty, Src);
1272 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1273 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1274 return true;
1277 bool AMDGPULegalizerInfo::legalizeFceil(
1278 MachineInstr &MI, MachineRegisterInfo &MRI,
1279 MachineIRBuilder &B) const {
1280 B.setInstr(MI);
1282 const LLT S1 = LLT::scalar(1);
1283 const LLT S64 = LLT::scalar(64);
1285 Register Src = MI.getOperand(1).getReg();
1286 assert(MRI.getType(Src) == S64);
1288 // result = trunc(src)
1289 // if (src > 0.0 && src != result)
1290 // result += 1.0
1292 auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1294 const auto Zero = B.buildFConstant(S64, 0.0);
1295 const auto One = B.buildFConstant(S64, 1.0);
1296 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1297 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1298 auto And = B.buildAnd(S1, Lt0, NeTrunc);
1299 auto Add = B.buildSelect(S64, And, One, Zero);
1301 // TODO: Should this propagate fast-math-flags?
1302 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1303 return true;
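// Extract the unbiased exponent from the high 32 bits of an f64: the
// exponent field occupies bits [62:52] of the double, and the IEEE-754 bias
// of 1023 is subtracted from it.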
1306 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1307 MachineIRBuilder &B) {
1308 const unsigned FractBits = 52;
1309 const unsigned ExpBits = 11;
1310 LLT S32 = LLT::scalar(32);
1312 auto Const0 = B.buildConstant(S32, FractBits - 32);
1313 auto Const1 = B.buildConstant(S32, ExpBits);
1315 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
.addUse(Hi) // source: high half of the f64
1316 .addUse(Const0.getReg(0)) // bit offset
1317 .addUse(Const1.getReg(0)); // field width
1319 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1322 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1323 MachineInstr &MI, MachineRegisterInfo &MRI,
1324 MachineIRBuilder &B) const {
1325 B.setInstr(MI);
1327 const LLT S1 = LLT::scalar(1);
1328 const LLT S32 = LLT::scalar(32);
1329 const LLT S64 = LLT::scalar(64);
1331 Register Src = MI.getOperand(1).getReg();
1332 assert(MRI.getType(Src) == S64);
1334 // TODO: Should this use extract since the low half is unused?
1335 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1336 Register Hi = Unmerge.getReg(1);
1338 // Extract the upper half, since this is where we will find the sign and
1339 // exponent.
1340 auto Exp = extractF64Exponent(Hi, B);
1342 const unsigned FractBits = 52;
1344 // Extract the sign bit.
1345 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1346 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1348 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1350 const auto Zero32 = B.buildConstant(S32, 0);
1352 // Extend back to 64-bits.
1353 auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1355 auto Shr = B.buildAShr(S64, FractMask, Exp);
1356 auto Not = B.buildNot(S64, Shr);
1357 auto Tmp0 = B.buildAnd(S64, Src, Not);
1358 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1360 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1361 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1363 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1364 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1365 return true;
1368 bool AMDGPULegalizerInfo::legalizeITOFP(
1369 MachineInstr &MI, MachineRegisterInfo &MRI,
1370 MachineIRBuilder &B, bool Signed) const {
1371 B.setInstr(MI);
1373 Register Dst = MI.getOperand(0).getReg();
1374 Register Src = MI.getOperand(1).getReg();
1376 const LLT S64 = LLT::scalar(64);
1377 const LLT S32 = LLT::scalar(32);
1379 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
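// Convert the two 32-bit halves separately and combine them as
// ldexp((fp)hi, 32) + (fp)lo; the high half uses a signed or unsigned
// conversion depending on the opcode, the low half is always unsigned.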
1381 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1383 auto CvtHi = Signed ?
1384 B.buildSITOFP(S64, Unmerge.getReg(1)) :
1385 B.buildUITOFP(S64, Unmerge.getReg(1));
1387 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1389 auto ThirtyTwo = B.buildConstant(S32, 32);
1390 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1391 .addUse(CvtHi.getReg(0))
1392 .addUse(ThirtyTwo.getReg(0));
1394 // TODO: Should this propagate fast-math-flags?
1395 B.buildFAdd(Dst, LdExp, CvtLo);
1396 MI.eraseFromParent();
1397 return true;
1400 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1401 MachineInstr &MI, MachineRegisterInfo &MRI,
1402 MachineIRBuilder &B) const {
1403 MachineFunction &MF = B.getMF();
1404 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1406 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1407 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1409 // With ieee_mode disabled, the instructions already have the correct
1410 // behavior for G_FMINNUM/G_FMAXNUM.
1411 if (!MFI->getMode().IEEE)
1412 return !IsIEEEOp;
1414 if (IsIEEEOp)
1415 return true;
1417 MachineIRBuilder HelperBuilder(MI);
1418 GISelObserverWrapper DummyObserver;
1419 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1420 HelperBuilder.setInstr(MI);
1421 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1424 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1425 MachineInstr &MI, MachineRegisterInfo &MRI,
1426 MachineIRBuilder &B) const {
1427 // TODO: Should move some of this into LegalizerHelper.
1429 // TODO: Promote dynamic indexing of s16 to s32
1430 // TODO: Dynamic s64 indexing is only legal for SGPR.
1431 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1432 if (!IdxVal) // Dynamic case will be selected to register indexing.
1433 return true;
1435 Register Dst = MI.getOperand(0).getReg();
1436 Register Vec = MI.getOperand(1).getReg();
1438 LLT VecTy = MRI.getType(Vec);
1439 LLT EltTy = VecTy.getElementType();
1440 assert(EltTy == MRI.getType(Dst));
1442 B.setInstr(MI);
1444 if (IdxVal.getValue() < VecTy.getNumElements())
1445 B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1446 else
1447 B.buildUndef(Dst);
1449 MI.eraseFromParent();
1450 return true;
1453 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1454 MachineInstr &MI, MachineRegisterInfo &MRI,
1455 MachineIRBuilder &B) const {
1456 // TODO: Should move some of this into LegalizerHelper.
1458 // TODO: Promote dynamic indexing of s16 to s32
1459 // TODO: Dynamic s64 indexing is only legal for SGPR.
1460 Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1461 if (!IdxVal) // Dynamic case will be selected to register indexing.
1462 return true;
1464 Register Dst = MI.getOperand(0).getReg();
1465 Register Vec = MI.getOperand(1).getReg();
1466 Register Ins = MI.getOperand(2).getReg();
1468 LLT VecTy = MRI.getType(Vec);
1469 LLT EltTy = VecTy.getElementType();
1470 assert(EltTy == MRI.getType(Ins));
1472 B.setInstr(MI);
1474 if (IdxVal.getValue() < VecTy.getNumElements())
1475 B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1476 else
1477 B.buildUndef(Dst);
1479 MI.eraseFromParent();
1480 return true;
1483 bool AMDGPULegalizerInfo::legalizeSinCos(
1484 MachineInstr &MI, MachineRegisterInfo &MRI,
1485 MachineIRBuilder &B) const {
1486 B.setInstr(MI);
1488 Register DstReg = MI.getOperand(0).getReg();
1489 Register SrcReg = MI.getOperand(1).getReg();
1490 LLT Ty = MRI.getType(DstReg);
1491 unsigned Flags = MI.getFlags();
1493 Register TrigVal;
1494 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1495 if (ST.hasTrigReducedRange()) {
1496 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1497 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1498 .addUse(MulVal.getReg(0))
1499 .setMIFlags(Flags).getReg(0);
1500 } else
1501 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1503 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1504 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1505 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1506 .addUse(TrigVal)
1507 .setMIFlags(Flags);
1508 MI.eraseFromParent();
1509 return true;
1512 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1513 MachineInstr &MI, MachineRegisterInfo &MRI,
1514 MachineIRBuilder &B) const {
1515 Register DstReg = MI.getOperand(0).getReg();
1516 LLT Ty = MRI.getType(DstReg);
1517 unsigned AS = Ty.getAddressSpace();
1519 const GlobalValue *GV = MI.getOperand(1).getGlobal();
1520 MachineFunction &MF = B.getMF();
1521 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1523 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1524 B.setInstr(MI);
1526 if (!MFI->isEntryFunction()) {
1527 const Function &Fn = MF.getFunction();
1528 DiagnosticInfoUnsupported BadLDSDecl(
1529 Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1530 Fn.getContext().diagnose(BadLDSDecl);
1533 // TODO: We could emit code to handle the initialization somewhere.
1534 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1535 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1536 MI.eraseFromParent();
1537 return true;
1539 } else
1540 return false;
1542 const Function &Fn = MF.getFunction();
1543 DiagnosticInfoUnsupported BadInit(
1544 Fn, "unsupported initializer for address space", MI.getDebugLoc());
1545 Fn.getContext().diagnose(BadInit);
1546 return true;
1549 bool AMDGPULegalizerInfo::legalizeLoad(
1550 MachineInstr &MI, MachineRegisterInfo &MRI,
1551 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1552 B.setInstr(MI);
1553 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1554 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1555 Observer.changingInstr(MI);
1556 MI.getOperand(1).setReg(Cast.getReg(0));
1557 Observer.changedInstr(MI);
1558 return true;
1561 bool AMDGPULegalizerInfo::legalizeFMad(
1562 MachineInstr &MI, MachineRegisterInfo &MRI,
1563 MachineIRBuilder &B) const {
1564 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1565 assert(Ty.isScalar());
1567 // TODO: Always legal with future ftz flag.
1568 if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1569 return true;
1570 if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1571 return true;
1573 MachineFunction &MF = B.getMF();
1575 MachineIRBuilder HelperBuilder(MI);
1576 GISelObserverWrapper DummyObserver;
1577 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1578 HelperBuilder.setMBB(*MI.getParent());
1579 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1582 // Return the branch instruction that uses the condition, or null if the usage is invalid.
1583 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1584 MachineRegisterInfo &MRI) {
1585 Register CondDef = MI.getOperand(0).getReg();
1586 if (!MRI.hasOneNonDBGUse(CondDef))
1587 return nullptr;
1589 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1590 return UseMI.getParent() == MI.getParent() &&
1591 UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1594 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1595 Register Reg, LLT Ty) const {
1596 Register LiveIn = MRI.getLiveInVirtReg(Reg);
1597 if (LiveIn)
1598 return LiveIn;
1600 Register NewReg = MRI.createGenericVirtualRegister(Ty);
1601 MRI.addLiveIn(Reg, NewReg);
1602 return NewReg;
1605 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1606 const ArgDescriptor *Arg) const {
1607 if (!Arg->isRegister() || !Arg->getRegister().isValid())
1608 return false; // TODO: Handle these
1610 assert(Arg->getRegister().isPhysical());
1612 MachineRegisterInfo &MRI = *B.getMRI();
1614 LLT Ty = MRI.getType(DstReg);
1615 Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1617 if (Arg->isMasked()) {
1618 // TODO: Should we try to emit this once in the entry block?
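// The value is packed into the physical register; extract it as
// (LiveIn >> Shift) & (Mask >> Shift), where Shift is the number of
// trailing zeros in the mask.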
1619 const LLT S32 = LLT::scalar(32);
1620 const unsigned Mask = Arg->getMask();
1621 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1623 auto ShiftAmt = B.buildConstant(S32, Shift);
1624 auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1625 B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1626 } else
1627 B.buildCopy(DstReg, LiveIn);
1629 // Insert the argument copy if it doesn't already exist.
1630 // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1631 if (!MRI.getVRegDef(LiveIn)) {
1632 // FIXME: Should have scoped insert pt
1633 MachineBasicBlock &OrigInsBB = B.getMBB();
1634 auto OrigInsPt = B.getInsertPt();
1636 MachineBasicBlock &EntryMBB = B.getMF().front();
1637 EntryMBB.addLiveIn(Arg->getRegister());
1638 B.setInsertPt(EntryMBB, EntryMBB.begin());
1639 B.buildCopy(LiveIn, Arg->getRegister());
1641 B.setInsertPt(OrigInsBB, OrigInsPt);
1644 return true;
1647 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1648 MachineInstr &MI,
1649 MachineRegisterInfo &MRI,
1650 MachineIRBuilder &B,
1651 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1652 B.setInstr(MI);
1654 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1656 const ArgDescriptor *Arg;
1657 const TargetRegisterClass *RC;
1658 std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1659 if (!Arg) {
1660 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1661 return false;
1664 if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1665 MI.eraseFromParent();
1666 return true;
1669 return false;
1672 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1673 MachineRegisterInfo &MRI,
1674 MachineIRBuilder &B) const {
1675 B.setInstr(MI);
1676 Register Res = MI.getOperand(0).getReg();
1677 Register LHS = MI.getOperand(2).getReg();
1678 Register RHS = MI.getOperand(3).getReg();
1679 uint16_t Flags = MI.getFlags();
1681 LLT S32 = LLT::scalar(32);
1682 LLT S1 = LLT::scalar(1);
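// Scale the denominator before taking its reciprocal to avoid overflow:
// if |RHS| > 2^96 (0x6f800000), pre-multiply it by 2^-32 (0x2f800000) and
// multiply the final quotient by the same factor; otherwise the scale is 1.0.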
1684 auto Abs = B.buildFAbs(S32, RHS, Flags);
1685 const APFloat C0Val(1.0f);
1687 auto C0 = B.buildConstant(S32, 0x6f800000);
1688 auto C1 = B.buildConstant(S32, 0x2f800000);
1689 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1691 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1692 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1694 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1696 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1697 .addUse(Mul0.getReg(0))
1698 .setMIFlags(Flags);
1700 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1702 B.buildFMul(Res, Sel, Mul1, Flags);
1704 MI.eraseFromParent();
1705 return true;
1708 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1709 MachineRegisterInfo &MRI,
1710 MachineIRBuilder &B) const {
1711 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1712 if (!MFI->isEntryFunction()) {
1713 return legalizePreloadedArgIntrin(MI, MRI, B,
1714 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1717 B.setInstr(MI);
1719 uint64_t Offset =
1720 ST.getTargetLowering()->getImplicitParameterOffset(
1721 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1722 Register DstReg = MI.getOperand(0).getReg();
1723 LLT DstTy = MRI.getType(DstReg);
1724 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1726 const ArgDescriptor *Arg;
1727 const TargetRegisterClass *RC;
1728 std::tie(Arg, RC)
1729 = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1730 if (!Arg)
1731 return false;
1733 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1734 if (!loadInputValue(KernargPtrReg, B, Arg))
1735 return false;
1737 B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1738 MI.eraseFromParent();
1739 return true;
1742 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1743 MachineRegisterInfo &MRI,
1744 MachineIRBuilder &B,
1745 unsigned AddrSpace) const {
1746 B.setInstr(MI);
1747 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1748 auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1749 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1750 MI.eraseFromParent();
1751 return true;
1754 /// Handle register layout difference for f16 images for some subtargets.
1755 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1756 MachineRegisterInfo &MRI,
1757 Register Reg) const {
1758 if (!ST.hasUnpackedD16VMem())
1759 return Reg;
1761 const LLT S16 = LLT::scalar(16);
1762 const LLT S32 = LLT::scalar(32);
1763 LLT StoreVT = MRI.getType(Reg);
1764 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
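// On subtargets with unpacked D16 VMEM instructions each 16-bit element
// occupies a full 32-bit register, so widen <N x s16> to <N x s32> by
// any-extending every element.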
1766 auto Unmerge = B.buildUnmerge(S16, Reg);
1768 SmallVector<Register, 4> WideRegs;
1769 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1770 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1772 int NumElts = StoreVT.getNumElements();
1774 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1777 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1778 MachineRegisterInfo &MRI,
1779 MachineIRBuilder &B,
1780 bool IsFormat) const {
1781 // TODO: Reject f16 format on targets where unsupported.
1782 Register VData = MI.getOperand(1).getReg();
1783 LLT Ty = MRI.getType(VData);
1785 B.setInstr(MI);
1787 const LLT S32 = LLT::scalar(32);
1788 const LLT S16 = LLT::scalar(16);
1790 // Fixup illegal register types for i8 stores.
1791 if (Ty == LLT::scalar(8) || Ty == S16) {
1792 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1793 MI.getOperand(1).setReg(AnyExt);
1794 return true;
1797 if (Ty.isVector()) {
1798 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1799 if (IsFormat)
1800 MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1801 return true;
1804 return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1807 return Ty == S32;
1810 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1811 MachineRegisterInfo &MRI,
1812 MachineIRBuilder &B) const {
1813 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
1814 switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1815 case Intrinsic::amdgcn_if: {
1816 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1817 const SIRegisterInfo *TRI
1818 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1820 B.setInstr(*BrCond);
1821 Register Def = MI.getOperand(1).getReg();
1822 Register Use = MI.getOperand(3).getReg();
1823 B.buildInstr(AMDGPU::SI_IF)
1824 .addDef(Def)
1825 .addUse(Use)
1826 .addMBB(BrCond->getOperand(1).getMBB());
1828 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1829 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1830 MI.eraseFromParent();
1831 BrCond->eraseFromParent();
1832 return true;
1835 return false;
1837 case Intrinsic::amdgcn_loop: {
1838 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1839 const SIRegisterInfo *TRI
1840 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1842 B.setInstr(*BrCond);
1843 Register Reg = MI.getOperand(2).getReg();
1844 B.buildInstr(AMDGPU::SI_LOOP)
1845 .addUse(Reg)
1846 .addMBB(BrCond->getOperand(1).getMBB());
1847 MI.eraseFromParent();
1848 BrCond->eraseFromParent();
1849 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1850 return true;
1853 return false;
1855 case Intrinsic::amdgcn_kernarg_segment_ptr:
1856 return legalizePreloadedArgIntrin(
1857 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1858 case Intrinsic::amdgcn_implicitarg_ptr:
1859 return legalizeImplicitArgPtr(MI, MRI, B);
1860 case Intrinsic::amdgcn_workitem_id_x:
1861 return legalizePreloadedArgIntrin(MI, MRI, B,
1862 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1863 case Intrinsic::amdgcn_workitem_id_y:
1864 return legalizePreloadedArgIntrin(MI, MRI, B,
1865 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1866 case Intrinsic::amdgcn_workitem_id_z:
1867 return legalizePreloadedArgIntrin(MI, MRI, B,
1868 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1869 case Intrinsic::amdgcn_workgroup_id_x:
1870 return legalizePreloadedArgIntrin(MI, MRI, B,
1871 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1872 case Intrinsic::amdgcn_workgroup_id_y:
1873 return legalizePreloadedArgIntrin(MI, MRI, B,
1874 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1875 case Intrinsic::amdgcn_workgroup_id_z:
1876 return legalizePreloadedArgIntrin(MI, MRI, B,
1877 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1878 case Intrinsic::amdgcn_dispatch_ptr:
1879 return legalizePreloadedArgIntrin(MI, MRI, B,
1880 AMDGPUFunctionArgInfo::DISPATCH_PTR);
1881 case Intrinsic::amdgcn_queue_ptr:
1882 return legalizePreloadedArgIntrin(MI, MRI, B,
1883 AMDGPUFunctionArgInfo::QUEUE_PTR);
1884 case Intrinsic::amdgcn_implicit_buffer_ptr:
1885 return legalizePreloadedArgIntrin(
1886 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1887 case Intrinsic::amdgcn_dispatch_id:
1888 return legalizePreloadedArgIntrin(MI, MRI, B,
1889 AMDGPUFunctionArgInfo::DISPATCH_ID);
1890 case Intrinsic::amdgcn_fdiv_fast:
1891 return legalizeFDIVFast(MI, MRI, B);
1892 case Intrinsic::amdgcn_is_shared:
1893 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
1894 case Intrinsic::amdgcn_is_private:
1895 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
1896 case Intrinsic::amdgcn_wavefrontsize: {
1897 B.setInstr(MI);
1898 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
1899 MI.eraseFromParent();
1900 return true;
1902 case Intrinsic::amdgcn_raw_buffer_store:
1903 return legalizeRawBufferStore(MI, MRI, B, false);
1904 case Intrinsic::amdgcn_raw_buffer_store_format:
1905 return legalizeRawBufferStore(MI, MRI, B, true);
1906 default:
1907 return true;
1910 return true;