//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
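
// True when the type at TypeIdx fits in MaxSize bits and its scalar element
// size is a whole multiple of 32 bits, i.e. it maps cleanly onto 32-bit
// registers.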
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
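
// Mutation that grows the vector type at TypeIdx by a single element, keeping
// the element type.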
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}
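
// Mutation that reduces the element count so the vector splits into pieces of
// at most ~64 bits each, keeping the element type.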
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// registers.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}
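
// The legalization rules for every generic opcode the target supports are
// declared in this constructor, keyed on opcode and type indices.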
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };
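
  // Predicate used below: a load must be broken up if it is a vector extload,
  // exceeds the per-address-space limit above, needs an unsupported dwordx3
  // access, or is under-aligned for the subtarget.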
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            });

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
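
// Produce a 32-bit register holding the aperture (the high half of the 64-bit
// flat address) for the given LDS or private address space, either from the
// aperture hardware registers or by loading it from the queue pointer.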
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}
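
// G_FRINT on f64 is lowered with the usual add-and-subtract-of-2^52 trick:
// adding a copysigned 2^52 and subtracting it again rounds to an integer in
// the current rounding mode, while inputs whose magnitude is already at or
// above ~2^52 are passed through unchanged.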
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}
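
// Pull the biased exponent field out of the high 32 bits of an f64 value with
// the ubfe intrinsic and subtract the exponent bias of 1023.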
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}
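
// [SU]ITOFP from a 64-bit integer is expanded as two 32-bit conversions: the
// high half is converted and scaled by 2^32 with ldexp, then the unsigned
// conversion of the low half is added in.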
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}
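
// Hardware sin/cos operate on input scaled by 1/(2*pi), so multiply the input
// accordingly (taking a fract on subtargets with a reduced trig range) before
// emitting the amdgcn.sin / amdgcn.cos intrinsic.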
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}
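
// Copy a preloaded argument register into DstReg. If the ABI packs the value
// into part of the register, shift and mask it out first.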
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}
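// Top-level FDIV legalization: try the fast, unsafe expansion; report that
// nothing changed if it does not apply.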
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  return false;
}
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}
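// Expand the amdgcn.fdiv.fast intrinsic: when |RHS| exceeds roughly 2^96
// (0x6f800000) the denominator is pre-scaled by 2^-32 (0x2f800000) before the
// reciprocal so it cannot overflow, and the quotient is multiplied by the same
// scale factor afterwards.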
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}
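// The implicit argument pointer is a separate preloaded SGPR argument for
// callable functions; in kernels it is computed as the kernarg segment
// pointer plus the offset of the first implicit argument.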
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}
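// amdgcn.is.shared / amdgcn.is.private: a flat pointer is in the given
// segment if its high 32 bits equal that segment's aperture base.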
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
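// Legalize the data operand of a raw buffer store: widen sub-32-bit scalars
// with an anyext and, for format stores of 16-bit vectors, repack the data on
// subtargets with unpacked D16 memory instructions.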
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }
  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}
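// Custom legalization for target intrinsics that cannot be described by the
// declarative legality rules.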
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);