//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}
static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;
  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);
  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));
  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);
  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));
  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);
  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);
  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);
  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));
  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          // defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);
        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);
  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }

      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, MIRBuilder, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    MIRBuilder.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = MIRBuilder.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    MIRBuilder.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
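
  // Rounding trick: adding 2^52 (with Src's sign) pushes the fractional bits
  // out of a double's 53-bit significand, and subtracting it back leaves the
  // value rounded to an integer. Inputs with |x| > C2Val (the largest double
  // below 2^52) are already integral, so the original value is selected for
  // them below.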
  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
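
// In an IEEE-754 double, the exponent occupies bits [62:52] of the value
// (bits [30:20] of the high 32-bit word) with a bias of 1023, and the low 52
// bits hold the fraction. The helper below extracts the biased exponent from
// the high word with amdgcn.ubfe and subtracts the bias.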
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
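
  // Convert the two 32-bit halves separately: the high half is converted
  // (signed or unsigned as requested) and scaled by 2^32 with amdgcn.ldexp,
  // then the low half, which is always unsigned, is added in.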
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();
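
  // The hardware sin/cos expect the argument pre-scaled by 1/(2*pi), i.e. in
  // units of full turns, so multiply first; subtargets with a reduced trig
  // input range additionally take the fractional part via amdgcn.fract.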
  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);
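
  // Scale the denominator before taking its reciprocal: 0x6f800000 is 2^96
  // and 0x2f800000 is 2^-32 as 32-bit floats. A very large denominator is
  // scaled down by 2^-32 so rcp does not flush to zero, and the quotient is
  // multiplied by the same factor at the end to compensate.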
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
&MI
,
1406 MachineRegisterInfo
&MRI
,
1407 MachineIRBuilder
&B
) const {
1408 const SIMachineFunctionInfo
*MFI
= B
.getMF().getInfo
<SIMachineFunctionInfo
>();
1409 if (!MFI
->isEntryFunction()) {
1410 return legalizePreloadedArgIntrin(MI
, MRI
, B
,
1411 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
);
1417 ST
.getTargetLowering()->getImplicitParameterOffset(
1418 B
.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT
);
1419 Register DstReg
= MI
.getOperand(0).getReg();
1420 LLT DstTy
= MRI
.getType(DstReg
);
1421 LLT IdxTy
= LLT::scalar(DstTy
.getSizeInBits());
1423 const ArgDescriptor
*Arg
;
1424 const TargetRegisterClass
*RC
;
1426 = MFI
->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR
);
1430 Register KernargPtrReg
= MRI
.createGenericVirtualRegister(DstTy
);
1431 if (!loadInputValue(KernargPtrReg
, B
, Arg
))
1434 B
.buildGEP(DstReg
, KernargPtrReg
, B
.buildConstant(IdxTy
, Offset
).getReg(0));
1435 MI
.eraseFromParent();
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);