//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// registers.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Legal for s1->s64, requires split for VALU.
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

})
484 // The 64-bit versions produce 32-bit results, but only on the SALU.
485 getActionDefinitionsBuilder({G_CTLZ
, G_CTLZ_ZERO_UNDEF
,
486 G_CTTZ
, G_CTTZ_ZERO_UNDEF
,
488 .legalFor({{S32
, S32
}, {S32
, S64
}})
489 .clampScalar(0, S32
, S32
)
490 .clampScalar(1, S32
, S64
)
492 .widenScalarToNextPow2(0, 32)
493 .widenScalarToNextPow2(1, 32);
495 // TODO: Expand for > s32
496 getActionDefinitionsBuilder({G_BSWAP
, G_BITREVERSE
})
498 .clampScalar(0, S32
, S32
)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

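  // Load/store legalization: the lambdas below bound the memory access size
  // per address space and decide when a load or store has to be broken up,
  // either because it is wider than the address space supports, is oddly
  // sized, or is insufficiently aligned for the subtarget.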
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            });

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64);

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

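// Return a 32-bit value holding the high half (the "aperture") of the flat
// address range for a LOCAL or PRIVATE segment pointer. On subtargets with
// aperture registers this is read with S_GETREG_B32; otherwise it is loaded
// from the queue pointer.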
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
                               Type::getInt8Ty(MF.getFunction().getContext()),
                               AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

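// Expand G_ADDRSPACE_CAST. No-op casts become bitcasts; flat<->segment casts
// compare against the segment's null value and either truncate the 64-bit
// flat pointer or rebuild it using the segment aperture for the high half.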
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

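// Lower G_FRINT for f64: adding and then subtracting 2^52 (copysigned from
// the source) rounds to the nearest integer; sources with magnitude above the
// 2^52 threshold are already integral and are passed through by the final
// select.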
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

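// Extract the unbiased exponent of an f64 from its high 32 bits using
// amdgcn.ubfe, then subtract the exponent bias of 1023.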
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

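// Lower G_INTRINSIC_TRUNC for f64 without a native instruction: mask off the
// fraction bits below the exponent, producing signed zero when the exponent
// is negative and returning the source unchanged when it is already integral
// (exponent > 51).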
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

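// Lower 64-bit integer to f64 conversion: convert the two 32-bit halves
// separately and combine them as hi * 2^32 + lo via amdgcn.ldexp.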
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

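// G_FMINNUM/G_FMAXNUM and their IEEE variants: which form is directly usable
// depends on the function's ieee_mode, so the remaining cases are expanded
// through LegalizerHelper::lowerFMinNumMaxNum.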
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

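// Extract/insert with a known constant index are folded to G_EXTRACT/G_INSERT
// at a bit offset; dynamic indices are left for register-indexed selection.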
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

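// Expand G_FSIN/G_FCOS to the amdgcn.sin/cos intrinsics, which take an input
// scaled by 1/(2*pi); subtargets with a reduced trig range also need
// amdgcn.fract applied to the scaled value first.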
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  }

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

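// Copy a preloaded argument register into DstReg, inserting the live-in copy
// in the entry block if it does not already exist and applying the mask/shift
// for packed arguments.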
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

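// Expand amdgcn.fdiv.fast: when |RHS| exceeds 2^96 the denominator is
// pre-scaled by 2^-32 to keep the reciprocal in range, then the quotient is
// formed as LHS * rcp(scaled RHS) and rescaled by the same factor.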
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

&MI
,
1709 MachineRegisterInfo
&MRI
,
1710 MachineIRBuilder
&B
) const {
1711 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
1712 if (!MFI
->isEntryFunction()) {
1713 return legalizePreloadedArgIntrin(MI
, MRI
, B
,
1714 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
1720 ST
.getTargetLowering()->getImplicitParameterOffset(
1721 B
.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT
);
1722 Register DstReg
= MI
.getOperand(0).getReg();
1723 LLT DstTy
= MRI
.getType(DstReg
);
1724 LLT IdxTy
= LLT::scalar(DstTy
.getSizeInBits());
1726 const ArgDescriptor
*Arg
;
1727 const TargetRegisterClass
*RC
;
1729 = MFI
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1733 Register KernargPtrReg
= MRI
.createGenericVirtualRegister(DstTy
);
1734 if (!loadInputValue(KernargPtrReg
, B
, Arg
))
1737 B
.buildGEP(DstReg
, KernargPtrReg
, B
.buildConstant(IdxTy
, Offset
).getReg(0));
1738 MI
.eraseFromParent();
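// amdgcn.is.shared / amdgcn.is.private: a flat pointer is in the given
// segment iff its high 32 bits equal that segment's aperture.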
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

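// Fix up the data operand of raw buffer stores: sub-dword scalars are
// any-extended to 32 bits, and small f16 vectors are repacked for subtargets
// with unpacked d16 memory instructions.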
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);