//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"
using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}
static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
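// Worked example: a v4s32 input (128 bits) gives Pieces = (128 + 63) / 64 = 2
// and NewNumElts = (4 + 1) / 2 = 2, so the mutation requests v2s32, i.e. one
// 64-bit piece at a time.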
// Increase the number of vector elements to reach the next multiple of 32-bit
// registers.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
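// Worked example: for v3s8 (24 bits), one 32-bit slot is needed (NextMul32 = 1)
// and NewNumElts = (32 * 1 + 8 - 1) / 8 = 4, so the type is padded out to v4s8,
// which is exactly 32 bits.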
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}
static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}
// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}
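// For example, s32, s64, v2s16, v4s16, v2s32 and v4s32 all count as register
// types here, while s8, s16 and v3s16 (an odd number of 16-bit elements) do
// not.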
static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}
static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };
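  // Note: the pointer width comes from the target machine's data layout, so on
  // AMDGPU the flat, global and constant pointers below end up as 64-bit LLTs
  // while the local (LDS) and private (scratch) pointers end up as 32-bit LLTs.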
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);
  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.
  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });
  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);
  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));
  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
    .legalFor({S32})
      // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();
  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);
435 auto &IToFP
= getActionDefinitionsBuilder({G_SITOFP
, G_UITOFP
})
436 .legalFor({{S32
, S32
}, {S64
, S32
}, {S16
, S32
}, {S32
, S1
}, {S16
, S1
}, {S64
, S1
}})
437 .lowerFor({{S32
, S64
}})
438 .customFor({{S64
, S64
}});
439 if (ST
.has16BitInsts())
440 IToFP
.legalFor({{S16
, S16
}});
441 IToFP
.clampScalar(1, S32
, S64
)
  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);
  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);
  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }
  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);
  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));
  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);
  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);
  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };
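  // Example: a 96-bit (dwordx3) access reports a split on subtargets without
  // dwordx3 memory instructions, and an access whose memory size exceeds the
  // MMO's alignment defers to the target's misaligned-access hook.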
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS.
  // TODO: Unsupported flat for SI.
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            });
    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();
  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);
  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));
  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };
    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});
    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }
    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }
  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // block.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate to the low 32 bits of the pointer.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
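// Layout reminder: in an IEEE-754 double the exponent occupies bits [52, 62],
// i.e. bits [20, 30] of the high 32-bit word, so ubfe extracts 11 bits at
// offset 52 - 32 = 20 and then the bias of 1023 is subtracted to get the
// unbiased exponent.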
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}
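// In other words, the s64 -> f64 conversion is computed as
// fp(hi) * 2^32 + fp(lo): the high half keeps the signed or unsigned
// interpretation, the low half is always treated as unsigned, and ldexp
// performs the exact scale by 2^32.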
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
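// With a known constant index, the extract becomes a plain G_EXTRACT at a bit
// offset of Idx * element-size (e.g. element 2 of a v4s32 lives at bit offset
// 64), and an out-of-range constant index simply produces undef.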
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
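
// legalizePreloadedArgIntrin resolves the requested preloaded value through
// SIMachineFunctionInfo and replaces the intrinsic with a copy of that
// argument register; it fails, leaving the instruction alone, if the argument
// was never preloaded for this function.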
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
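
// A note on the constants in the fast fdiv expansion below: 0x6f800000 is
// 2^96 and 0x2f800000 is 2^-32 as IEEE-754 single-precision bit patterns.
// If |RHS| exceeds 2^96, RHS is pre-scaled by 2^-32, presumably so the
// reciprocal of a very large denominator does not become denormal and get
// flushed; multiplying the final product by the same scale factor (Sel)
// compensates, since (LHS * rcp(RHS * s)) * s == LHS / RHS.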
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
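
// For entry functions the implicit argument pointer is computed below as the
// kernarg segment pointer plus the implicit parameter offset; for non-entry
// (callable) functions it is taken from the preloaded IMPLICIT_ARG_PTR
// argument instead.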
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}
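
// amdgcn.is.shared / amdgcn.is.private are legalized below by comparing the
// high 32 bits of the flat pointer against the aperture base of the queried
// address space, as returned by getSegmentAperture.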
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
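
// On subtargets with unpacked d16 memory instructions, an illustration of the
// repacking done by handleD16VData: a <4 x s16> store value becomes a
// <4 x s32> build_vector whose elements are the any-extended s16 pieces.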
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
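
// The raw buffer store legalization below fixes up store data types the
// selector cannot handle directly: s8/s16 data is any-extended to s32, and
// small f16 vectors are repacked through handleD16VData for format stores on
// unpacked-d16 subtargets; everything else must already be 32-bit based.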
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}
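
// Structured control-flow intrinsics (amdgcn.if / amdgcn.loop) are handled
// below by locating the G_BRCOND fed by the intrinsic's condition output
// (verifyCFIntrinsic) and replacing the pair with the SI_IF / SI_LOOP
// pseudos, whose condition registers are constrained to the wave mask
// register class.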
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);