//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;
/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// possible.
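/// For example, breaking an s88 value into s64-sized pieces yields one s64
/// part plus one s24 leftover, i.e. the pair {1, 1} with \p LeftoverTy == s24.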
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return std::make_pair(NumParts, 0);

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return std::make_pair(-1, -1);
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}

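/// Return the IR floating-point type whose bit width matches the scalar
/// \p Ty, or nullptr if there is no matching float type.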
static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelKnownBits *KB)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

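/// Reassemble \p DstReg of type \p ResultTy from the \p PartTy pieces in
/// \p PartRegs plus, when \p LeftoverTy is valid, the remaining pieces in
/// \p LeftoverRegs.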
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs;
    for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
      AllRegs.push_back(Reg);
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}

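/// Append the scalar elements of the vector register \p Reg to \p Elts.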
void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
               MIRBuilder, MRI);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (MRI.getType(Leftover).isScalar())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to do
    // anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}

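// Merge the GCDTy-typed pieces in VRegs into NarrowTy-typed values covering
// the least common multiple of DstTy and NarrowTy, padding with the extension
// kind given by PadStrategy (G_ANYEXT/G_ZEXT/G_SEXT) when the sources do not
// cover the result. On return VRegs holds the NarrowTy pieces and the LCM
// type is returned.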
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}

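// Map a generic opcode plus scalar size onto the matching RTLIB::Libcall
// entry (integer calls exist for 32/64/128 bits, FP calls also for 80 bits).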
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  }
  llvm_unreachable("Unknown libcall function");
}

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    LostDebugLocObserver &LocObserver, MachineInstr *MI) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  if (!Name)
    return LegalizerHelper::UnableToLegalize;
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
}

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType, LostDebugLocObserver &LocObserver) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args,
                       LocObserver, &MI);
}

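// Lower a G_BZERO/G_MEMCPY/G_MEMMOVE/G_MEMSET instruction to a call to the
// corresponding runtime routine. The trailing immediate operand of the
// instruction only marks whether a tail call is allowed.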
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

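// Pick the OUTLINE_ATOMIC_* libcall variant matching the atomic opcode, the
// merged memory ordering and the access size; vector memory types have no
// outline-atomic equivalent and map to UNKNOWN_LIBCALL.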
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B)                                                           \
  { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}

static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

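// Select the FP conversion libcall (FPEXT, FPROUND, FPTO[SU]INT, [SU]INTTOFP)
// for the given opcode and IR source/destination types.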
static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType, LostDebugLocObserver &LocObserver) {
  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(
      MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0},
      {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI);
}

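// Map the FP-environment opcodes onto the fegetenv/fesetenv/fegetmode/
// fesetmode runtime entries declared in RTLIB.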
static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
  RTLIB::Libcall RTLibcall;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GET_FPENV:
    RTLibcall = RTLIB::FEGETENV;
    break;
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_RESET_FPENV:
    RTLibcall = RTLIB::FESETENV;
    break;
  case TargetOpcode::G_GET_FPMODE:
    RTLibcall = RTLIB::FEGETMODE;
    break;
  case TargetOpcode::G_SET_FPMODE:
  case TargetOpcode::G_RESET_FPMODE:
    RTLibcall = RTLIB::FESETMODE;
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }
  return RTLibcall;
}

// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
//
//     %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     BL &fegetmode
//     %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will put the read state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall` the function calls a library function
// using transient space in stack. In this case the library function reads
// content of memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create temporary, where library function will get the new state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

// The function is used to legalize operations that set default environment
// state. In the C library a call like `fesetmode(FE_DFL_MODE)` is used for
// that. On most targets supported in glibc FE_DFL_MODE is defined as
// `((const femode_t *) -1)`. That assumption is used here. If it does not hold
// for some target, the target must provide custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPOWI: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    std::initializer_list<CallLowering::ArgInfo> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    // FIXME: Support other types
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder,
        ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
        FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
        LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

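// narrowScalar: break an operation on a scalar type that is too wide for the
// target into operations on NarrowTy-sized pieces, dispatching per opcode.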
1179 LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalar(MachineInstr
&MI
,
1182 uint64_t SizeOp0
= MRI
.getType(MI
.getOperand(0).getReg()).getSizeInBits();
1183 uint64_t NarrowSize
= NarrowTy
.getSizeInBits();
1185 switch (MI
.getOpcode()) {
1187 return UnableToLegalize
;
1188 case TargetOpcode::G_IMPLICIT_DEF
: {
1189 Register DstReg
= MI
.getOperand(0).getReg();
1190 LLT DstTy
= MRI
.getType(DstReg
);
1192 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1193 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1194 // FIXME: Although this would also be legal for the general case, it causes
1195 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1196 // combines not being hit). This seems to be a problem related to the
1197 // artifact combiner.
1198 if (SizeOp0
% NarrowSize
!= 0) {
1199 LLT ImplicitTy
= NarrowTy
;
1200 if (DstTy
.isVector())
1201 ImplicitTy
= LLT::vector(DstTy
.getElementCount(), ImplicitTy
);
1203 Register ImplicitReg
= MIRBuilder
.buildUndef(ImplicitTy
).getReg(0);
1204 MIRBuilder
.buildAnyExt(DstReg
, ImplicitReg
);
1206 MI
.eraseFromParent();
1210 int NumParts
= SizeOp0
/ NarrowSize
;
1212 SmallVector
<Register
, 2> DstRegs
;
1213 for (int i
= 0; i
< NumParts
; ++i
)
1214 DstRegs
.push_back(MIRBuilder
.buildUndef(NarrowTy
).getReg(0));
1216 if (DstTy
.isVector())
1217 MIRBuilder
.buildBuildVector(DstReg
, DstRegs
);
1219 MIRBuilder
.buildMergeLikeInstr(DstReg
, DstRegs
);
1220 MI
.eraseFromParent();
1223 case TargetOpcode::G_CONSTANT
: {
1224 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
1225 const APInt
&Val
= MI
.getOperand(1).getCImm()->getValue();
1226 unsigned TotalSize
= Ty
.getSizeInBits();
1227 unsigned NarrowSize
= NarrowTy
.getSizeInBits();
1228 int NumParts
= TotalSize
/ NarrowSize
;
1230 SmallVector
<Register
, 4> PartRegs
;
1231 for (int I
= 0; I
!= NumParts
; ++I
) {
1232 unsigned Offset
= I
* NarrowSize
;
1233 auto K
= MIRBuilder
.buildConstant(NarrowTy
,
1234 Val
.lshr(Offset
).trunc(NarrowSize
));
1235 PartRegs
.push_back(K
.getReg(0));
1239 unsigned LeftoverBits
= TotalSize
- NumParts
* NarrowSize
;
1240 SmallVector
<Register
, 1> LeftoverRegs
;
1241 if (LeftoverBits
!= 0) {
1242 LeftoverTy
= LLT::scalar(LeftoverBits
);
1243 auto K
= MIRBuilder
.buildConstant(
1245 Val
.lshr(NumParts
* NarrowSize
).trunc(LeftoverBits
));
1246 LeftoverRegs
.push_back(K
.getReg(0));
1249 insertParts(MI
.getOperand(0).getReg(),
1250 Ty
, NarrowTy
, PartRegs
, LeftoverTy
, LeftoverRegs
);
1252 MI
.eraseFromParent();
1255 case TargetOpcode::G_SEXT
:
1256 case TargetOpcode::G_ZEXT
:
1257 case TargetOpcode::G_ANYEXT
:
1258 return narrowScalarExt(MI
, TypeIdx
, NarrowTy
);
1259 case TargetOpcode::G_TRUNC
: {
1261 return UnableToLegalize
;
1263 uint64_t SizeOp1
= MRI
.getType(MI
.getOperand(1).getReg()).getSizeInBits();
1264 if (NarrowTy
.getSizeInBits() * 2 != SizeOp1
) {
1265 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy
<< "\n");
1266 return UnableToLegalize
;
1269 auto Unmerge
= MIRBuilder
.buildUnmerge(NarrowTy
, MI
.getOperand(1));
1270 MIRBuilder
.buildCopy(MI
.getOperand(0), Unmerge
.getReg(0));
1271 MI
.eraseFromParent();
1275 case TargetOpcode::G_FREEZE
: {
1277 return UnableToLegalize
;
1279 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
1280 // Should widen scalar first
1281 if (Ty
.getSizeInBits() % NarrowTy
.getSizeInBits() != 0)
1282 return UnableToLegalize
;
1284 auto Unmerge
= MIRBuilder
.buildUnmerge(NarrowTy
, MI
.getOperand(1).getReg());
1285 SmallVector
<Register
, 8> Parts
;
1286 for (unsigned i
= 0; i
< Unmerge
->getNumDefs(); ++i
) {
1288 MIRBuilder
.buildFreeze(NarrowTy
, Unmerge
.getReg(i
)).getReg(0));
1291 MIRBuilder
.buildMergeLikeInstr(MI
.getOperand(0).getReg(), Parts
);
1292 MI
.eraseFromParent();
1295 case TargetOpcode::G_ADD
:
1296 case TargetOpcode::G_SUB
:
1297 case TargetOpcode::G_SADDO
:
1298 case TargetOpcode::G_SSUBO
:
1299 case TargetOpcode::G_SADDE
:
1300 case TargetOpcode::G_SSUBE
:
1301 case TargetOpcode::G_UADDO
:
1302 case TargetOpcode::G_USUBO
:
1303 case TargetOpcode::G_UADDE
:
1304 case TargetOpcode::G_USUBE
:
1305 return narrowScalarAddSub(MI
, TypeIdx
, NarrowTy
);
1306 case TargetOpcode::G_MUL
:
1307 case TargetOpcode::G_UMULH
:
1308 return narrowScalarMul(MI
, NarrowTy
);
1309 case TargetOpcode::G_EXTRACT
:
1310 return narrowScalarExtract(MI
, TypeIdx
, NarrowTy
);
1311 case TargetOpcode::G_INSERT
:
1312 return narrowScalarInsert(MI
, TypeIdx
, NarrowTy
);
1313 case TargetOpcode::G_LOAD
: {
1314 auto &LoadMI
= cast
<GLoad
>(MI
);
1315 Register DstReg
= LoadMI
.getDstReg();
1316 LLT DstTy
= MRI
.getType(DstReg
);
1317 if (DstTy
.isVector())
1318 return UnableToLegalize
;
1320 if (8 * LoadMI
.getMemSize() != DstTy
.getSizeInBits()) {
1321 Register TmpReg
= MRI
.createGenericVirtualRegister(NarrowTy
);
1322 MIRBuilder
.buildLoad(TmpReg
, LoadMI
.getPointerReg(), LoadMI
.getMMO());
1323 MIRBuilder
.buildAnyExt(DstReg
, TmpReg
);
1324 LoadMI
.eraseFromParent();
1328 return reduceLoadStoreWidth(LoadMI
, TypeIdx
, NarrowTy
);
1330 case TargetOpcode::G_ZEXTLOAD
:
1331 case TargetOpcode::G_SEXTLOAD
: {
1332 auto &LoadMI
= cast
<GExtLoad
>(MI
);
1333 Register DstReg
= LoadMI
.getDstReg();
1334 Register PtrReg
= LoadMI
.getPointerReg();
1336 Register TmpReg
= MRI
.createGenericVirtualRegister(NarrowTy
);
1337 auto &MMO
= LoadMI
.getMMO();
1338 unsigned MemSize
= MMO
.getSizeInBits();
1340 if (MemSize
== NarrowSize
) {
1341 MIRBuilder
.buildLoad(TmpReg
, PtrReg
, MMO
);
1342 } else if (MemSize
< NarrowSize
) {
1343 MIRBuilder
.buildLoadInstr(LoadMI
.getOpcode(), TmpReg
, PtrReg
, MMO
);
1344 } else if (MemSize
> NarrowSize
) {
1345 // FIXME: Need to split the load.
1346 return UnableToLegalize
;
1349 if (isa
<GZExtLoad
>(LoadMI
))
1350 MIRBuilder
.buildZExt(DstReg
, TmpReg
);
1352 MIRBuilder
.buildSExt(DstReg
, TmpReg
);
1354 LoadMI
.eraseFromParent();
1357 case TargetOpcode::G_STORE
: {
1358 auto &StoreMI
= cast
<GStore
>(MI
);
1360 Register SrcReg
= StoreMI
.getValueReg();
1361 LLT SrcTy
= MRI
.getType(SrcReg
);
1362 if (SrcTy
.isVector())
1363 return UnableToLegalize
;
1365 int NumParts
= SizeOp0
/ NarrowSize
;
1366 unsigned HandledSize
= NumParts
* NarrowTy
.getSizeInBits();
1367 unsigned LeftoverBits
= SrcTy
.getSizeInBits() - HandledSize
;
1368 if (SrcTy
.isVector() && LeftoverBits
!= 0)
1369 return UnableToLegalize
;
1371 if (8 * StoreMI
.getMemSize() != SrcTy
.getSizeInBits()) {
1372 Register TmpReg
= MRI
.createGenericVirtualRegister(NarrowTy
);
1373 MIRBuilder
.buildTrunc(TmpReg
, SrcReg
);
1374 MIRBuilder
.buildStore(TmpReg
, StoreMI
.getPointerReg(), StoreMI
.getMMO());
1375 StoreMI
.eraseFromParent();
1379 return reduceLoadStoreWidth(StoreMI
, 0, NarrowTy
);
1381 case TargetOpcode::G_SELECT
:
1382 return narrowScalarSelect(MI
, TypeIdx
, NarrowTy
);
1383 case TargetOpcode::G_AND
:
1384 case TargetOpcode::G_OR
:
1385 case TargetOpcode::G_XOR
: {
1386 // Legalize bitwise operation:
1387 // A = BinOp<Ty> B, C
1389 // B1, ..., BN = G_UNMERGE_VALUES B
1390 // C1, ..., CN = G_UNMERGE_VALUES C
1391 // A1 = BinOp<Ty/N> B1, C2
1393 // AN = BinOp<Ty/N> BN, CN
1394 // A = G_MERGE_VALUES A1, ..., AN
1395 return narrowScalarBasic(MI
, TypeIdx
, NarrowTy
);
1397 case TargetOpcode::G_SHL
:
1398 case TargetOpcode::G_LSHR
:
1399 case TargetOpcode::G_ASHR
:
1400 return narrowScalarShift(MI
, TypeIdx
, NarrowTy
);
1401 case TargetOpcode::G_CTLZ
:
1402 case TargetOpcode::G_CTLZ_ZERO_UNDEF
:
1403 case TargetOpcode::G_CTTZ
:
1404 case TargetOpcode::G_CTTZ_ZERO_UNDEF
:
1405 case TargetOpcode::G_CTPOP
:
1407 switch (MI
.getOpcode()) {
1408 case TargetOpcode::G_CTLZ
:
1409 case TargetOpcode::G_CTLZ_ZERO_UNDEF
:
1410 return narrowScalarCTLZ(MI
, TypeIdx
, NarrowTy
);
1411 case TargetOpcode::G_CTTZ
:
1412 case TargetOpcode::G_CTTZ_ZERO_UNDEF
:
1413 return narrowScalarCTTZ(MI
, TypeIdx
, NarrowTy
);
1414 case TargetOpcode::G_CTPOP
:
1415 return narrowScalarCTPOP(MI
, TypeIdx
, NarrowTy
);
1417 return UnableToLegalize
;
1420 Observer
.changingInstr(MI
);
1421 narrowScalarDst(MI
, NarrowTy
, 0, TargetOpcode::G_ZEXT
);
1422 Observer
.changedInstr(MI
);
1424 case TargetOpcode::G_INTTOPTR
:
1426 return UnableToLegalize
;
1428 Observer
.changingInstr(MI
);
1429 narrowScalarSrc(MI
, NarrowTy
, 1);
1430 Observer
.changedInstr(MI
);
1432 case TargetOpcode::G_PTRTOINT
:
1434 return UnableToLegalize
;
1436 Observer
.changingInstr(MI
);
1437 narrowScalarDst(MI
, NarrowTy
, 0, TargetOpcode::G_ZEXT
);
1438 Observer
.changedInstr(MI
);
1440 case TargetOpcode::G_PHI
: {
1441 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1443 if (SizeOp0
% NarrowSize
!= 0)
1444 return UnableToLegalize
;
1446 unsigned NumParts
= SizeOp0
/ NarrowSize
;
1447 SmallVector
<Register
, 2> DstRegs(NumParts
);
1448 SmallVector
<SmallVector
<Register
, 2>, 2> SrcRegs(MI
.getNumOperands() / 2);
1449 Observer
.changingInstr(MI
);
1450 for (unsigned i
= 1; i
< MI
.getNumOperands(); i
+= 2) {
1451 MachineBasicBlock
&OpMBB
= *MI
.getOperand(i
+ 1).getMBB();
1452 MIRBuilder
.setInsertPt(OpMBB
, OpMBB
.getFirstTerminatorForward());
1453 extractParts(MI
.getOperand(i
).getReg(), NarrowTy
, NumParts
,
1454 SrcRegs
[i
/ 2], MIRBuilder
, MRI
);
1456 MachineBasicBlock
&MBB
= *MI
.getParent();
1457 MIRBuilder
.setInsertPt(MBB
, MI
);
1458 for (unsigned i
= 0; i
< NumParts
; ++i
) {
1459 DstRegs
[i
] = MRI
.createGenericVirtualRegister(NarrowTy
);
1460 MachineInstrBuilder MIB
=
1461 MIRBuilder
.buildInstr(TargetOpcode::G_PHI
).addDef(DstRegs
[i
]);
1462 for (unsigned j
= 1; j
< MI
.getNumOperands(); j
+= 2)
1463 MIB
.addUse(SrcRegs
[j
/ 2][i
]).add(MI
.getOperand(j
+ 1));
1465 MIRBuilder
.setInsertPt(MBB
, MBB
.getFirstNonPHI());
1466 MIRBuilder
.buildMergeLikeInstr(MI
.getOperand(0), DstRegs
);
1467 Observer
.changedInstr(MI
);
1468 MI
.eraseFromParent();
1471 case TargetOpcode::G_EXTRACT_VECTOR_ELT
:
1472 case TargetOpcode::G_INSERT_VECTOR_ELT
: {
1474 return UnableToLegalize
;
1476 int OpIdx
= MI
.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT
? 2 : 3;
1477 Observer
.changingInstr(MI
);
1478 narrowScalarSrc(MI
, NarrowTy
, OpIdx
);
1479 Observer
.changedInstr(MI
);
1482 case TargetOpcode::G_ICMP
: {
1483 Register LHS
= MI
.getOperand(2).getReg();
1484 LLT SrcTy
= MRI
.getType(LHS
);
1485 uint64_t SrcSize
= SrcTy
.getSizeInBits();
1486 CmpInst::Predicate Pred
=
1487 static_cast<CmpInst::Predicate
>(MI
.getOperand(1).getPredicate());
1489 // TODO: Handle the non-equality case for weird sizes.
1490 if (NarrowSize
* 2 != SrcSize
&& !ICmpInst::isEquality(Pred
))
1491 return UnableToLegalize
;
1493 LLT LeftoverTy
; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1494 SmallVector
<Register
, 4> LHSPartRegs
, LHSLeftoverRegs
;
1495 if (!extractParts(LHS
, SrcTy
, NarrowTy
, LeftoverTy
, LHSPartRegs
,
1496 LHSLeftoverRegs
, MIRBuilder
, MRI
))
1497 return UnableToLegalize
;
1499 LLT Unused
; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1500 SmallVector
<Register
, 4> RHSPartRegs
, RHSLeftoverRegs
;
1501 if (!extractParts(MI
.getOperand(3).getReg(), SrcTy
, NarrowTy
, Unused
,
1502 RHSPartRegs
, RHSLeftoverRegs
, MIRBuilder
, MRI
))
1503 return UnableToLegalize
;
1505 // We now have the LHS and RHS of the compare split into narrow-type
1506 // registers, plus potentially some leftover type.
1507 Register Dst
= MI
.getOperand(0).getReg();
1508 LLT ResTy
= MRI
.getType(Dst
);
1509 if (ICmpInst::isEquality(Pred
)) {
1510 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1511 // them together. For each equal part, the result should be all 0s. For
1512 // each non-equal part, we'll get at least one 1.
1513 auto Zero
= MIRBuilder
.buildConstant(NarrowTy
, 0);
1514 SmallVector
<Register
, 4> Xors
;
1515 for (auto LHSAndRHS
: zip(LHSPartRegs
, RHSPartRegs
)) {
1516 auto LHS
= std::get
<0>(LHSAndRHS
);
1517 auto RHS
= std::get
<1>(LHSAndRHS
);
1518 auto Xor
= MIRBuilder
.buildXor(NarrowTy
, LHS
, RHS
).getReg(0);
1519 Xors
.push_back(Xor
);
1522 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1523 // to the desired narrow type so that we can OR them together later.
1524 SmallVector
<Register
, 4> WidenedXors
;
1525 for (auto LHSAndRHS
: zip(LHSLeftoverRegs
, RHSLeftoverRegs
)) {
1526 auto LHS
= std::get
<0>(LHSAndRHS
);
1527 auto RHS
= std::get
<1>(LHSAndRHS
);
1528 auto Xor
= MIRBuilder
.buildXor(LeftoverTy
, LHS
, RHS
).getReg(0);
1529 LLT GCDTy
= extractGCDType(WidenedXors
, NarrowTy
, LeftoverTy
, Xor
);
1530 buildLCMMergePieces(LeftoverTy
, NarrowTy
, GCDTy
, WidenedXors
,
1531 /* PadStrategy = */ TargetOpcode::G_ZEXT
);
1532 Xors
.insert(Xors
.end(), WidenedXors
.begin(), WidenedXors
.end());
1535 // Now, for each part we broke up, we know if they are equal/not equal
1536 // based off the G_XOR. We can OR these all together and compare against
1537 // 0 to get the result.
1538 assert(Xors
.size() >= 2 && "Should have gotten at least two Xors?");
1539 auto Or
= MIRBuilder
.buildOr(NarrowTy
, Xors
[0], Xors
[1]);
1540 for (unsigned I
= 2, E
= Xors
.size(); I
< E
; ++I
)
1541 Or
= MIRBuilder
.buildOr(NarrowTy
, Or
, Xors
[I
]);
1542 MIRBuilder
.buildICmp(Pred
, Dst
, Or
, Zero
);
1544 // TODO: Handle non-power-of-two types.
1545 assert(LHSPartRegs
.size() == 2 && "Expected exactly 2 LHS part regs?");
1546 assert(RHSPartRegs
.size() == 2 && "Expected exactly 2 RHS part regs?");
1547 Register LHSL
= LHSPartRegs
[0];
1548 Register LHSH
= LHSPartRegs
[1];
1549 Register RHSL
= RHSPartRegs
[0];
1550 Register RHSH
= RHSPartRegs
[1];
1551 MachineInstrBuilder CmpH
= MIRBuilder
.buildICmp(Pred
, ResTy
, LHSH
, RHSH
);
1552 MachineInstrBuilder CmpHEQ
=
1553 MIRBuilder
.buildICmp(CmpInst::Predicate::ICMP_EQ
, ResTy
, LHSH
, RHSH
);
1554 MachineInstrBuilder CmpLU
= MIRBuilder
.buildICmp(
1555 ICmpInst::getUnsignedPredicate(Pred
), ResTy
, LHSL
, RHSL
);
1556 MIRBuilder
.buildSelect(Dst
, CmpHEQ
, CmpLU
, CmpH
);
1558 MI
.eraseFromParent();
1561 case TargetOpcode::G_SEXT_INREG
: {
1563 return UnableToLegalize
;
1565 int64_t SizeInBits
= MI
.getOperand(2).getImm();
1567 // So long as the new type has more bits than the bits we're extending we
1568 // don't need to break it apart.
1569 if (NarrowTy
.getScalarSizeInBits() > SizeInBits
) {
1570 Observer
.changingInstr(MI
);
1571 // We don't lose any non-extension bits by truncating the src and
1572 // sign-extending the dst.
1573 MachineOperand
&MO1
= MI
.getOperand(1);
1574 auto TruncMIB
= MIRBuilder
.buildTrunc(NarrowTy
, MO1
);
1575 MO1
.setReg(TruncMIB
.getReg(0));
1577 MachineOperand
&MO2
= MI
.getOperand(0);
1578 Register DstExt
= MRI
.createGenericVirtualRegister(NarrowTy
);
1579 MIRBuilder
.setInsertPt(MIRBuilder
.getMBB(), ++MIRBuilder
.getInsertPt());
1580 MIRBuilder
.buildSExt(MO2
, DstExt
);
1582 Observer
.changedInstr(MI
);
1586 // Break it apart. Components below the extension point are unmodified. The
1587 // component containing the extension point becomes a narrower SEXT_INREG.
1588 // Components above it are ashr'd from the component containing the
1590 if (SizeOp0
% NarrowSize
!= 0)
1591 return UnableToLegalize
;
1592 int NumParts
= SizeOp0
/ NarrowSize
;
1594 // List the registers where the destination will be scattered.
1595 SmallVector
<Register
, 2> DstRegs
;
1596 // List the registers where the source will be split.
1597 SmallVector
<Register
, 2> SrcRegs
;
1599 // Create all the temporary registers.
1600 for (int i
= 0; i
< NumParts
; ++i
) {
1601 Register SrcReg
= MRI
.createGenericVirtualRegister(NarrowTy
);
1603 SrcRegs
.push_back(SrcReg
);
1606 // Explode the big arguments into smaller chunks.
1607 MIRBuilder
.buildUnmerge(SrcRegs
, MI
.getOperand(1));
1609 Register AshrCstReg
=
1610 MIRBuilder
.buildConstant(NarrowTy
, NarrowTy
.getScalarSizeInBits() - 1)
1612 Register FullExtensionReg
;
1613 Register PartialExtensionReg
;
1615 // Do the operation on each small part.
1616 for (int i
= 0; i
< NumParts
; ++i
) {
1617 if ((i
+ 1) * NarrowTy
.getScalarSizeInBits() <= SizeInBits
) {
1618 DstRegs
.push_back(SrcRegs
[i
]);
1619 PartialExtensionReg
= DstRegs
.back();
1620 } else if (i
* NarrowTy
.getScalarSizeInBits() >= SizeInBits
) {
1621 assert(PartialExtensionReg
&&
1622 "Expected to visit partial extension before full");
1623 if (FullExtensionReg
) {
1624 DstRegs
.push_back(FullExtensionReg
);
1628 MIRBuilder
.buildAShr(NarrowTy
, PartialExtensionReg
, AshrCstReg
)
1630 FullExtensionReg
= DstRegs
.back();
1635 TargetOpcode::G_SEXT_INREG
, {NarrowTy
},
1636 {SrcRegs
[i
], SizeInBits
% NarrowTy
.getScalarSizeInBits()})
1638 PartialExtensionReg
= DstRegs
.back();
1642 // Gather the destination registers into the final destination.
1643 Register DstReg
= MI
.getOperand(0).getReg();
1644 MIRBuilder
.buildMergeLikeInstr(DstReg
, DstRegs
);
1645 MI
.eraseFromParent();
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_BITREVERSE: {
    if (SizeOp0 % NarrowSize != 0)
      return UnableToLegalize;

    Observer.changingInstr(MI);
    SmallVector<Register, 2> SrcRegs, DstRegs;
    unsigned NumParts = SizeOp0 / NarrowSize;
    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
                 MIRBuilder, MRI);

    for (unsigned i = 0; i < NumParts; ++i) {
      auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                           {SrcRegs[NumParts - 1 - i]});
      DstRegs.push_back(DstPart.getReg(0));
    }

    MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);

    Observer.changedInstr(MI);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_PTR_ADD:
  case TargetOpcode::G_PTRMASK: {
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, NarrowTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FPTOSI:
    return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_FPEXT:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_FLDEXP:
  case TargetOpcode::G_STRICT_FLDEXP:
    return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
  }
}
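
/// Coerce \p Val to a plain integer scalar of the same total width, inserting
/// a G_PTRTOINT or G_BITCAST as needed for pointer and vector values. Returns
/// an invalid Register for pointers in non-integral address spaces.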
Register LegalizerHelper::coerceToScalar(Register Val) {
  LLT Ty = MRI.getType(Val);
  if (Ty.isScalar())
    return Val;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLT NewTy = LLT::scalar(Ty.getSizeInBits());
  if (Ty.isPointer()) {
    if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
      return Register();
    return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
  }

  Register NewVal = Val;

  assert(Ty.isVector());
  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
  return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
}
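
/// In-place operand rewriting helpers: widenScalarSrc/narrowScalarSrc extend
/// or truncate a use operand of \p MI to the requested type and rewire the
/// operand; widenScalarDst/narrowScalarDst redirect a def operand to a new
/// register of the requested type and convert it back after \p MI.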
void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
  MO.setReg(ExtB.getReg(0));
}

void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
                                     unsigned OpIdx, unsigned TruncOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
  MO.setReg(DstExt);
}

void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
                                      unsigned OpIdx, unsigned ExtOpcode) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
  MO.setReg(DstTrunc);
}

void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  Register Dst = MO.getReg();
  Register DstExt = MRI.createGenericVirtualRegister(WideTy);
  MO.setReg(DstExt);
  MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
}

void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
                                            unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  SmallVector<Register, 8> Regs;
  MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
}

void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &Op = MI.getOperand(OpIdx);
  Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
}

void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  Register CastDst = MRI.createGenericVirtualRegister(CastTy);
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
  MIRBuilder.buildBitcast(MO, CastDst);
  MO.setReg(CastDst);
}
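
/// Widen a G_MERGE_VALUES to the requested source type \p WideTy. If \p WideTy
/// covers the whole result, the sources are packed directly with shift/or;
/// otherwise the sources are re-split through a common GCD type and remerged.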
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
                                        LLT WideTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
  if (DstTy.isVector())
    return UnableToLegalize;

  LLT SrcTy = MRI.getType(Src1Reg);
  const int DstSize = DstTy.getSizeInBits();
  const int SrcSize = SrcTy.getSizeInBits();
  const int WideSize = WideTy.getSizeInBits();
  const int NumMerge = (DstSize + WideSize - 1) / WideSize;

  unsigned NumOps = MI.getNumOperands();
  unsigned NumSrc = MI.getNumOperands() - 1;
  unsigned PartSize = DstTy.getSizeInBits() / NumSrc;

  if (WideSize >= DstSize) {
    // Directly pack the bits in the target type.
    Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);

    for (unsigned I = 2; I != NumOps; ++I) {
      const unsigned Offset = (I - 1) * PartSize;

      Register SrcReg = MI.getOperand(I).getReg();
      assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));

      auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

      Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
        MRI.createGenericVirtualRegister(WideTy);

      auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
      auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
      MIRBuilder.buildOr(NextResult, ResultReg, Shl);
      ResultReg = NextResult;
    }

    if (WideSize > DstSize)
      MIRBuilder.buildTrunc(DstReg, ResultReg);
    else if (DstTy.isPointer())
      MIRBuilder.buildIntToPtr(DstReg, ResultReg);

    MI.eraseFromParent();
    return Legalized;
  }

  // Unmerge the original values to the GCD type, and recombine to the next
  // multiple greater than the original type.
  //
  // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
  // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
  // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
  // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
  // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
  // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
  // %12:_(s12) = G_MERGE_VALUES %10, %11
  //
  // Padding with undef if necessary:
  //
  // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
  // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
  // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
  // %7:_(s2) = G_IMPLICIT_DEF
  // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
  // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
  // %10:_(s12) = G_MERGE_VALUES %8, %9

  const int GCD = std::gcd(SrcSize, WideSize);
  LLT GCDTy = LLT::scalar(GCD);

  SmallVector<Register, 8> Parts;
  SmallVector<Register, 8> NewMergeRegs;
  SmallVector<Register, 8> Unmerges;
  LLT WideDstTy = LLT::scalar(NumMerge * WideSize);

  // Decompose the original operands if they don't evenly divide.
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
    Register SrcReg = MO.getReg();
    if (GCD == SrcSize) {
      Unmerges.push_back(SrcReg);
    } else {
      auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
      for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
        Unmerges.push_back(Unmerge.getReg(J));
    }
  }

  // Pad with undef to the next size that is a multiple of the requested size.
  if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
    Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
      Unmerges.push_back(UndefReg);
  }

  const int PartsPerGCD = WideSize / GCD;

  // Build merges of each piece.
  ArrayRef<Register> Slicer(Unmerges);
  for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
    auto Merge =
        MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
    NewMergeRegs.push_back(Merge.getReg(0));
  }

  // A truncate may be necessary if the requested type doesn't evenly divide the
  // original result type.
  if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
    MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
  } else {
    auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
    MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
  }

  MI.eraseFromParent();
  return Legalized;
}
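
/// Widen the destination pieces of a G_UNMERGE_VALUES to \p WideTy, either by
/// shifting and truncating pieces out of the (possibly extended) source, or by
/// unmerging a widened source and remerging through a GCD type.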
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible.
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
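
/// Widen a G_EXTRACT. For TypeIdx 0 the result is produced with a shift and
/// truncate from the (possibly any-extended) source; for TypeIdx 1 the source
/// is any-extended, rescaling the immediate offset for vector sources.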
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy.isScalar()) {
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
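
/// Widen a G_INSERT's result type (TypeIdx 0 only) by any-extending the
/// inserted-into operand and widening the destination.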
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                   LLT WideTy) {
  if (TypeIdx != 0 || WideTy.isVector())
    return UnableToLegalize;
  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
  widenScalarDst(MI, WideTy);
  Observer.changedInstr(MI);
  return Legalized;
}
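
/// Widen an overflowing/carrying add or subtract. For TypeIdx 1 only the
/// carry-out (and carry-in) booleans are widened; otherwise the arithmetic is
/// done in \p WideTy on sign/zero-extended operands and the overflow bit is
/// recomputed by comparing against the re-extended truncated result.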
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  unsigned Opcode;
  unsigned ExtOpcode;
  std::optional<Register> CarryIn;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  if (TypeIdx == 1) {
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);

    Observer.changingInstr(MI);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, 4, BoolExtOp);
    widenScalarDst(MI, WideTy, 1);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
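
/// Widen a saturating add/sub/shift by performing the operation at the top of
/// \p WideTy (operands shifted left so the saturation point is preserved) and
/// shifting the result back down.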
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
                                         LLT WideTy) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
                  MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
                 MI.getOpcode() == TargetOpcode::G_USHLSAT;

  // We can convert this to:
  //   1. Any extend iN to iM
  //   2. SHL by M-N
  //   3. [US][ADD|SUB|SHL]SAT
  //   4. L/ASHR by M-N
  //
  // It may be more efficient to lower this to a min and a max operation in
  // the higher precision arithmetic if the promoted operation isn't legal,
  // but this decision is up to the target's lowering request.
  Register DstReg = MI.getOperand(0).getReg();

  unsigned NewBits = WideTy.getScalarSizeInBits();
  unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();

  // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
  // must not left shift the RHS to preserve the shift amount.
  auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
  auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
                     : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
  auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
  auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
  auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);

  auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
                                        {ShiftL, ShiftR}, MI.getFlags());

  // Use a shift that will preserve the number of sign bits when the trunc is
  // folded away.
  auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
                         : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);

  MIRBuilder.buildTrunc(DstReg, Result);
  MI.eraseFromParent();
  return Legalized;
}
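
/// Widen a G_UMULO/G_SMULO by multiplying the extended operands in \p WideTy
/// and deriving the overflow flag from the high bits of the wide product (and,
/// if \p WideTy is not at least twice as wide as the source, from the widened
/// multiply's own overflow result).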
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;

  unsigned MulOpc =
      WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;

  MachineInstrBuilder Mulo;
  if (WideMulCanOverflow)
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
                                 {LeftOperand, RightOperand});
  else
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});

  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part. Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  if (WideMulCanOverflow) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
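
/// Widen the scalar type at index \p TypeIdx of \p MI to \p WideTy, rewriting
/// the instruction's operands with the appropriate extensions and truncations
/// and dispatching per-opcode to the helpers above.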
2310 LegalizerHelper::LegalizeResult
2311 LegalizerHelper::widenScalar(MachineInstr
&MI
, unsigned TypeIdx
, LLT WideTy
) {
2312 switch (MI
.getOpcode()) {
2314 return UnableToLegalize
;
2315 case TargetOpcode::G_ATOMICRMW_XCHG
:
2316 case TargetOpcode::G_ATOMICRMW_ADD
:
2317 case TargetOpcode::G_ATOMICRMW_SUB
:
2318 case TargetOpcode::G_ATOMICRMW_AND
:
2319 case TargetOpcode::G_ATOMICRMW_OR
:
2320 case TargetOpcode::G_ATOMICRMW_XOR
:
2321 case TargetOpcode::G_ATOMICRMW_MIN
:
2322 case TargetOpcode::G_ATOMICRMW_MAX
:
2323 case TargetOpcode::G_ATOMICRMW_UMIN
:
2324 case TargetOpcode::G_ATOMICRMW_UMAX
:
2325 assert(TypeIdx
== 0 && "atomicrmw with second scalar type");
2326 Observer
.changingInstr(MI
);
2327 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ANYEXT
);
2328 widenScalarDst(MI
, WideTy
, 0);
2329 Observer
.changedInstr(MI
);
2331 case TargetOpcode::G_ATOMIC_CMPXCHG
:
2332 assert(TypeIdx
== 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2333 Observer
.changingInstr(MI
);
2334 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ANYEXT
);
2335 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_ANYEXT
);
2336 widenScalarDst(MI
, WideTy
, 0);
2337 Observer
.changedInstr(MI
);
2339 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS
:
2341 Observer
.changingInstr(MI
);
2342 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_ANYEXT
);
2343 widenScalarSrc(MI
, WideTy
, 4, TargetOpcode::G_ANYEXT
);
2344 widenScalarDst(MI
, WideTy
, 0);
2345 Observer
.changedInstr(MI
);
2348 assert(TypeIdx
== 1 &&
2349 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2350 Observer
.changingInstr(MI
);
2351 widenScalarDst(MI
, WideTy
, 1);
2352 Observer
.changedInstr(MI
);
2354 case TargetOpcode::G_EXTRACT
:
2355 return widenScalarExtract(MI
, TypeIdx
, WideTy
);
2356 case TargetOpcode::G_INSERT
:
2357 return widenScalarInsert(MI
, TypeIdx
, WideTy
);
2358 case TargetOpcode::G_MERGE_VALUES
:
2359 return widenScalarMergeValues(MI
, TypeIdx
, WideTy
);
2360 case TargetOpcode::G_UNMERGE_VALUES
:
2361 return widenScalarUnmergeValues(MI
, TypeIdx
, WideTy
);
2362 case TargetOpcode::G_SADDO
:
2363 case TargetOpcode::G_SSUBO
:
2364 case TargetOpcode::G_UADDO
:
2365 case TargetOpcode::G_USUBO
:
2366 case TargetOpcode::G_SADDE
:
2367 case TargetOpcode::G_SSUBE
:
2368 case TargetOpcode::G_UADDE
:
2369 case TargetOpcode::G_USUBE
:
2370 return widenScalarAddSubOverflow(MI
, TypeIdx
, WideTy
);
2371 case TargetOpcode::G_UMULO
:
2372 case TargetOpcode::G_SMULO
:
2373 return widenScalarMulo(MI
, TypeIdx
, WideTy
);
2374 case TargetOpcode::G_SADDSAT
:
2375 case TargetOpcode::G_SSUBSAT
:
2376 case TargetOpcode::G_SSHLSAT
:
2377 case TargetOpcode::G_UADDSAT
:
2378 case TargetOpcode::G_USUBSAT
:
2379 case TargetOpcode::G_USHLSAT
:
2380 return widenScalarAddSubShlSat(MI
, TypeIdx
, WideTy
);
2381 case TargetOpcode::G_CTTZ
:
2382 case TargetOpcode::G_CTTZ_ZERO_UNDEF
:
2383 case TargetOpcode::G_CTLZ
:
2384 case TargetOpcode::G_CTLZ_ZERO_UNDEF
:
2385 case TargetOpcode::G_CTPOP
: {
2387 Observer
.changingInstr(MI
);
2388 widenScalarDst(MI
, WideTy
, 0);
2389 Observer
.changedInstr(MI
);
2393 Register SrcReg
= MI
.getOperand(1).getReg();
2395 // First extend the input.
2396 unsigned ExtOpc
= MI
.getOpcode() == TargetOpcode::G_CTTZ
||
2397 MI
.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2398 ? TargetOpcode::G_ANYEXT
2399 : TargetOpcode::G_ZEXT
;
2400 auto MIBSrc
= MIRBuilder
.buildInstr(ExtOpc
, {WideTy
}, {SrcReg
});
2401 LLT CurTy
= MRI
.getType(SrcReg
);
2402 unsigned NewOpc
= MI
.getOpcode();
2403 if (NewOpc
== TargetOpcode::G_CTTZ
) {
2404 // The count is the same in the larger type except if the original
2405 // value was zero. This can be handled by setting the bit just off
2406 // the top of the original type.
2408 APInt::getOneBitSet(WideTy
.getSizeInBits(), CurTy
.getSizeInBits());
2409 MIBSrc
= MIRBuilder
.buildOr(
2410 WideTy
, MIBSrc
, MIRBuilder
.buildConstant(WideTy
, TopBit
));
2411 // Now we know the operand is non-zero, use the more relaxed opcode.
2412 NewOpc
= TargetOpcode::G_CTTZ_ZERO_UNDEF
;
2415 // Perform the operation at the larger size.
2416 auto MIBNewOp
= MIRBuilder
.buildInstr(NewOpc
, {WideTy
}, {MIBSrc
});
2417 // This is already the correct result for CTPOP and CTTZs
2418 if (MI
.getOpcode() == TargetOpcode::G_CTLZ
||
2419 MI
.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF
) {
2420 // The correct result is NewOp - (Difference in widety and current ty).
2421 unsigned SizeDiff
= WideTy
.getSizeInBits() - CurTy
.getSizeInBits();
2422 MIBNewOp
= MIRBuilder
.buildSub(
2423 WideTy
, MIBNewOp
, MIRBuilder
.buildConstant(WideTy
, SizeDiff
));
2426 MIRBuilder
.buildZExtOrTrunc(MI
.getOperand(0), MIBNewOp
);
2427 MI
.eraseFromParent();
2430 case TargetOpcode::G_BSWAP
: {
2431 Observer
.changingInstr(MI
);
2432 Register DstReg
= MI
.getOperand(0).getReg();
2434 Register ShrReg
= MRI
.createGenericVirtualRegister(WideTy
);
2435 Register DstExt
= MRI
.createGenericVirtualRegister(WideTy
);
2436 Register ShiftAmtReg
= MRI
.createGenericVirtualRegister(WideTy
);
2437 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2439 MI
.getOperand(0).setReg(DstExt
);
2441 MIRBuilder
.setInsertPt(MIRBuilder
.getMBB(), ++MIRBuilder
.getInsertPt());
2443 LLT Ty
= MRI
.getType(DstReg
);
2444 unsigned DiffBits
= WideTy
.getScalarSizeInBits() - Ty
.getScalarSizeInBits();
2445 MIRBuilder
.buildConstant(ShiftAmtReg
, DiffBits
);
2446 MIRBuilder
.buildLShr(ShrReg
, DstExt
, ShiftAmtReg
);
2448 MIRBuilder
.buildTrunc(DstReg
, ShrReg
);
2449 Observer
.changedInstr(MI
);
2452 case TargetOpcode::G_BITREVERSE
: {
2453 Observer
.changingInstr(MI
);
2455 Register DstReg
= MI
.getOperand(0).getReg();
2456 LLT Ty
= MRI
.getType(DstReg
);
2457 unsigned DiffBits
= WideTy
.getScalarSizeInBits() - Ty
.getScalarSizeInBits();
2459 Register DstExt
= MRI
.createGenericVirtualRegister(WideTy
);
2460 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2461 MI
.getOperand(0).setReg(DstExt
);
2462 MIRBuilder
.setInsertPt(MIRBuilder
.getMBB(), ++MIRBuilder
.getInsertPt());
2464 auto ShiftAmt
= MIRBuilder
.buildConstant(WideTy
, DiffBits
);
2465 auto Shift
= MIRBuilder
.buildLShr(WideTy
, DstExt
, ShiftAmt
);
2466 MIRBuilder
.buildTrunc(DstReg
, Shift
);
2467 Observer
.changedInstr(MI
);
2470 case TargetOpcode::G_FREEZE
:
2471 Observer
.changingInstr(MI
);
2472 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2473 widenScalarDst(MI
, WideTy
);
2474 Observer
.changedInstr(MI
);
2477 case TargetOpcode::G_ABS
:
2478 Observer
.changingInstr(MI
);
2479 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_SEXT
);
2480 widenScalarDst(MI
, WideTy
);
2481 Observer
.changedInstr(MI
);
2484 case TargetOpcode::G_ADD
:
2485 case TargetOpcode::G_AND
:
2486 case TargetOpcode::G_MUL
:
2487 case TargetOpcode::G_OR
:
2488 case TargetOpcode::G_XOR
:
2489 case TargetOpcode::G_SUB
:
2490 // Perform operation at larger width (any extension is fines here, high bits
2491 // don't affect the result) and then truncate the result back to the
2493 Observer
.changingInstr(MI
);
2494 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2495 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ANYEXT
);
2496 widenScalarDst(MI
, WideTy
);
2497 Observer
.changedInstr(MI
);
2500 case TargetOpcode::G_SBFX
:
2501 case TargetOpcode::G_UBFX
:
2502 Observer
.changingInstr(MI
);
2505 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2506 widenScalarDst(MI
, WideTy
);
2508 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2509 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_ZEXT
);
2512 Observer
.changedInstr(MI
);
2515 case TargetOpcode::G_SHL
:
2516 Observer
.changingInstr(MI
);
2519 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2520 widenScalarDst(MI
, WideTy
);
2522 assert(TypeIdx
== 1);
2523 // The "number of bits to shift" operand must preserve its value as an
2524 // unsigned integer:
2525 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2528 Observer
.changedInstr(MI
);
2531 case TargetOpcode::G_ROTR
:
2532 case TargetOpcode::G_ROTL
:
2534 return UnableToLegalize
;
2536 Observer
.changingInstr(MI
);
2537 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2538 Observer
.changedInstr(MI
);
2541 case TargetOpcode::G_SDIV
:
2542 case TargetOpcode::G_SREM
:
2543 case TargetOpcode::G_SMIN
:
2544 case TargetOpcode::G_SMAX
:
2545 Observer
.changingInstr(MI
);
2546 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_SEXT
);
2547 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_SEXT
);
2548 widenScalarDst(MI
, WideTy
);
2549 Observer
.changedInstr(MI
);
2552 case TargetOpcode::G_SDIVREM
:
2553 Observer
.changingInstr(MI
);
2554 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_SEXT
);
2555 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_SEXT
);
2556 widenScalarDst(MI
, WideTy
);
2557 widenScalarDst(MI
, WideTy
, 1);
2558 Observer
.changedInstr(MI
);
2561 case TargetOpcode::G_ASHR
:
2562 case TargetOpcode::G_LSHR
:
2563 Observer
.changingInstr(MI
);
2566 unsigned CvtOp
= MI
.getOpcode() == TargetOpcode::G_ASHR
?
2567 TargetOpcode::G_SEXT
: TargetOpcode::G_ZEXT
;
2569 widenScalarSrc(MI
, WideTy
, 1, CvtOp
);
2570 widenScalarDst(MI
, WideTy
);
2572 assert(TypeIdx
== 1);
2573 // The "number of bits to shift" operand must preserve its value as an
2574 // unsigned integer:
2575 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2578 Observer
.changedInstr(MI
);
2580 case TargetOpcode::G_UDIV
:
2581 case TargetOpcode::G_UREM
:
2582 case TargetOpcode::G_UMIN
:
2583 case TargetOpcode::G_UMAX
:
2584 Observer
.changingInstr(MI
);
2585 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ZEXT
);
2586 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2587 widenScalarDst(MI
, WideTy
);
2588 Observer
.changedInstr(MI
);
2591 case TargetOpcode::G_UDIVREM
:
2592 Observer
.changingInstr(MI
);
2593 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2594 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_ZEXT
);
2595 widenScalarDst(MI
, WideTy
);
2596 widenScalarDst(MI
, WideTy
, 1);
2597 Observer
.changedInstr(MI
);
2600 case TargetOpcode::G_SELECT
:
2601 Observer
.changingInstr(MI
);
2603 // Perform operation at larger width (any extension is fine here, high
2604 // bits don't affect the result) and then truncate the result back to the
2606 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ANYEXT
);
2607 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_ANYEXT
);
2608 widenScalarDst(MI
, WideTy
);
2610 bool IsVec
= MRI
.getType(MI
.getOperand(1).getReg()).isVector();
2611 // Explicit extension is required here since high bits affect the result.
2612 widenScalarSrc(MI
, WideTy
, 1, MIRBuilder
.getBoolExtOp(IsVec
, false));
2614 Observer
.changedInstr(MI
);
2617 case TargetOpcode::G_FPTOSI
:
2618 case TargetOpcode::G_FPTOUI
:
2619 case TargetOpcode::G_IS_FPCLASS
:
2620 Observer
.changingInstr(MI
);
2623 widenScalarDst(MI
, WideTy
);
2625 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_FPEXT
);
2627 Observer
.changedInstr(MI
);
2629 case TargetOpcode::G_SITOFP
:
2630 Observer
.changingInstr(MI
);
2633 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2635 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_SEXT
);
2637 Observer
.changedInstr(MI
);
2639 case TargetOpcode::G_UITOFP
:
2640 Observer
.changingInstr(MI
);
2643 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2645 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ZEXT
);
2647 Observer
.changedInstr(MI
);
2649 case TargetOpcode::G_LOAD
:
2650 case TargetOpcode::G_SEXTLOAD
:
2651 case TargetOpcode::G_ZEXTLOAD
:
2652 Observer
.changingInstr(MI
);
2653 widenScalarDst(MI
, WideTy
);
2654 Observer
.changedInstr(MI
);
2657 case TargetOpcode::G_STORE
: {
2659 return UnableToLegalize
;
2661 LLT Ty
= MRI
.getType(MI
.getOperand(0).getReg());
2663 return UnableToLegalize
;
2665 Observer
.changingInstr(MI
);
2667 unsigned ExtType
= Ty
.getScalarSizeInBits() == 1 ?
2668 TargetOpcode::G_ZEXT
: TargetOpcode::G_ANYEXT
;
2669 widenScalarSrc(MI
, WideTy
, 0, ExtType
);
2671 Observer
.changedInstr(MI
);
2674 case TargetOpcode::G_CONSTANT
: {
2675 MachineOperand
&SrcMO
= MI
.getOperand(1);
2676 LLVMContext
&Ctx
= MIRBuilder
.getMF().getFunction().getContext();
2677 unsigned ExtOpc
= LI
.getExtOpcodeForWideningConstant(
2678 MRI
.getType(MI
.getOperand(0).getReg()));
2679 assert((ExtOpc
== TargetOpcode::G_ZEXT
|| ExtOpc
== TargetOpcode::G_SEXT
||
2680 ExtOpc
== TargetOpcode::G_ANYEXT
) &&
2682 const APInt
&SrcVal
= SrcMO
.getCImm()->getValue();
2683 const APInt
&Val
= (ExtOpc
== TargetOpcode::G_SEXT
)
2684 ? SrcVal
.sext(WideTy
.getSizeInBits())
2685 : SrcVal
.zext(WideTy
.getSizeInBits());
2686 Observer
.changingInstr(MI
);
2687 SrcMO
.setCImm(ConstantInt::get(Ctx
, Val
));
2689 widenScalarDst(MI
, WideTy
);
2690 Observer
.changedInstr(MI
);
2693 case TargetOpcode::G_FCONSTANT
: {
2694 // To avoid changing the bits of the constant due to extension to a larger
2695 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
2696 MachineOperand
&SrcMO
= MI
.getOperand(1);
2697 APInt Val
= SrcMO
.getFPImm()->getValueAPF().bitcastToAPInt();
2698 MIRBuilder
.setInstrAndDebugLoc(MI
);
2699 auto IntCst
= MIRBuilder
.buildConstant(MI
.getOperand(0).getReg(), Val
);
2700 widenScalarDst(*IntCst
, WideTy
, 0, TargetOpcode::G_TRUNC
);
2701 MI
.eraseFromParent();
2704 case TargetOpcode::G_IMPLICIT_DEF
: {
2705 Observer
.changingInstr(MI
);
2706 widenScalarDst(MI
, WideTy
);
2707 Observer
.changedInstr(MI
);
2710 case TargetOpcode::G_BRCOND
:
2711 Observer
.changingInstr(MI
);
2712 widenScalarSrc(MI
, WideTy
, 0, MIRBuilder
.getBoolExtOp(false, false));
2713 Observer
.changedInstr(MI
);
2716 case TargetOpcode::G_FCMP
:
2717 Observer
.changingInstr(MI
);
2719 widenScalarDst(MI
, WideTy
);
2721 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_FPEXT
);
2722 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_FPEXT
);
2724 Observer
.changedInstr(MI
);
2727 case TargetOpcode::G_ICMP
:
2728 Observer
.changingInstr(MI
);
2730 widenScalarDst(MI
, WideTy
);
2732 unsigned ExtOpcode
= CmpInst::isSigned(static_cast<CmpInst::Predicate
>(
2733 MI
.getOperand(1).getPredicate()))
2734 ? TargetOpcode::G_SEXT
2735 : TargetOpcode::G_ZEXT
;
2736 widenScalarSrc(MI
, WideTy
, 2, ExtOpcode
);
2737 widenScalarSrc(MI
, WideTy
, 3, ExtOpcode
);
2739 Observer
.changedInstr(MI
);
2742 case TargetOpcode::G_PTR_ADD
:
2743 assert(TypeIdx
== 1 && "unable to legalize pointer of G_PTR_ADD");
2744 Observer
.changingInstr(MI
);
2745 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_SEXT
);
2746 Observer
.changedInstr(MI
);
2749 case TargetOpcode::G_PHI
: {
2750 assert(TypeIdx
== 0 && "Expecting only Idx 0");
2752 Observer
.changingInstr(MI
);
2753 for (unsigned I
= 1; I
< MI
.getNumOperands(); I
+= 2) {
2754 MachineBasicBlock
&OpMBB
= *MI
.getOperand(I
+ 1).getMBB();
2755 MIRBuilder
.setInsertPt(OpMBB
, OpMBB
.getFirstTerminatorForward());
2756 widenScalarSrc(MI
, WideTy
, I
, TargetOpcode::G_ANYEXT
);
2759 MachineBasicBlock
&MBB
= *MI
.getParent();
2760 MIRBuilder
.setInsertPt(MBB
, --MBB
.getFirstNonPHI());
2761 widenScalarDst(MI
, WideTy
);
2762 Observer
.changedInstr(MI
);
2765 case TargetOpcode::G_EXTRACT_VECTOR_ELT
: {
2767 Register VecReg
= MI
.getOperand(1).getReg();
2768 LLT VecTy
= MRI
.getType(VecReg
);
2769 Observer
.changingInstr(MI
);
2772 MI
, LLT::vector(VecTy
.getElementCount(), WideTy
.getSizeInBits()), 1,
2773 TargetOpcode::G_ANYEXT
);
2775 widenScalarDst(MI
, WideTy
, 0);
2776 Observer
.changedInstr(MI
);
2781 return UnableToLegalize
;
2782 Observer
.changingInstr(MI
);
2783 // TODO: Probably should be zext
2784 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_SEXT
);
2785 Observer
.changedInstr(MI
);
2788 case TargetOpcode::G_INSERT_VECTOR_ELT
: {
2790 Observer
.changingInstr(MI
);
2791 const LLT WideEltTy
= WideTy
.getElementType();
2793 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2794 widenScalarSrc(MI
, WideEltTy
, 2, TargetOpcode::G_ANYEXT
);
2795 widenScalarDst(MI
, WideTy
, 0);
2796 Observer
.changedInstr(MI
);
2801 Observer
.changingInstr(MI
);
2803 Register VecReg
= MI
.getOperand(1).getReg();
2804 LLT VecTy
= MRI
.getType(VecReg
);
2805 LLT WideVecTy
= LLT::vector(VecTy
.getElementCount(), WideTy
);
2807 widenScalarSrc(MI
, WideVecTy
, 1, TargetOpcode::G_ANYEXT
);
2808 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ANYEXT
);
2809 widenScalarDst(MI
, WideVecTy
, 0);
2810 Observer
.changedInstr(MI
);
2815 Observer
.changingInstr(MI
);
2816 // TODO: Probably should be zext
2817 widenScalarSrc(MI
, WideTy
, 3, TargetOpcode::G_SEXT
);
2818 Observer
.changedInstr(MI
);
2822 return UnableToLegalize
;
2824 case TargetOpcode::G_FADD
:
2825 case TargetOpcode::G_FMUL
:
2826 case TargetOpcode::G_FSUB
:
2827 case TargetOpcode::G_FMA
:
2828 case TargetOpcode::G_FMAD
:
2829 case TargetOpcode::G_FNEG
:
2830 case TargetOpcode::G_FABS
:
2831 case TargetOpcode::G_FCANONICALIZE
:
2832 case TargetOpcode::G_FMINNUM
:
2833 case TargetOpcode::G_FMAXNUM
:
2834 case TargetOpcode::G_FMINNUM_IEEE
:
2835 case TargetOpcode::G_FMAXNUM_IEEE
:
2836 case TargetOpcode::G_FMINIMUM
:
2837 case TargetOpcode::G_FMAXIMUM
:
2838 case TargetOpcode::G_FDIV
:
2839 case TargetOpcode::G_FREM
:
2840 case TargetOpcode::G_FCEIL
:
2841 case TargetOpcode::G_FFLOOR
:
2842 case TargetOpcode::G_FCOS
:
2843 case TargetOpcode::G_FSIN
:
2844 case TargetOpcode::G_FLOG10
:
2845 case TargetOpcode::G_FLOG
:
2846 case TargetOpcode::G_FLOG2
:
2847 case TargetOpcode::G_FRINT
:
2848 case TargetOpcode::G_FNEARBYINT
:
2849 case TargetOpcode::G_FSQRT
:
2850 case TargetOpcode::G_FEXP
:
2851 case TargetOpcode::G_FEXP2
:
2852 case TargetOpcode::G_FEXP10
:
2853 case TargetOpcode::G_FPOW
:
2854 case TargetOpcode::G_INTRINSIC_TRUNC
:
2855 case TargetOpcode::G_INTRINSIC_ROUND
:
2856 case TargetOpcode::G_INTRINSIC_ROUNDEVEN
:
2857 assert(TypeIdx
== 0);
2858 Observer
.changingInstr(MI
);
2860 for (unsigned I
= 1, E
= MI
.getNumOperands(); I
!= E
; ++I
)
2861 widenScalarSrc(MI
, WideTy
, I
, TargetOpcode::G_FPEXT
);
2863 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2864 Observer
.changedInstr(MI
);
2866 case TargetOpcode::G_FPOWI
:
2867 case TargetOpcode::G_FLDEXP
:
2868 case TargetOpcode::G_STRICT_FLDEXP
: {
2870 if (MI
.getOpcode() == TargetOpcode::G_STRICT_FLDEXP
)
2871 return UnableToLegalize
;
2873 Observer
.changingInstr(MI
);
2874 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_FPEXT
);
2875 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2876 Observer
.changedInstr(MI
);
2881 // For some reason SelectionDAG tries to promote to a libcall without
2882 // actually changing the integer type for promotion.
2883 Observer
.changingInstr(MI
);
2884 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_SEXT
);
2885 Observer
.changedInstr(MI
);
2889 return UnableToLegalize
;
2891 case TargetOpcode::G_FFREXP
: {
2892 Observer
.changingInstr(MI
);
2895 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_FPEXT
);
2896 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2898 widenScalarDst(MI
, WideTy
, 1);
2901 Observer
.changedInstr(MI
);
2904 case TargetOpcode::G_INTTOPTR
:
2906 return UnableToLegalize
;
2908 Observer
.changingInstr(MI
);
2909 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ZEXT
);
2910 Observer
.changedInstr(MI
);
2912 case TargetOpcode::G_PTRTOINT
:
2914 return UnableToLegalize
;
2916 Observer
.changingInstr(MI
);
2917 widenScalarDst(MI
, WideTy
, 0);
2918 Observer
.changedInstr(MI
);
2920 case TargetOpcode::G_BUILD_VECTOR
: {
2921 Observer
.changingInstr(MI
);
2923 const LLT WideEltTy
= TypeIdx
== 1 ? WideTy
: WideTy
.getElementType();
2924 for (int I
= 1, E
= MI
.getNumOperands(); I
!= E
; ++I
)
2925 widenScalarSrc(MI
, WideEltTy
, I
, TargetOpcode::G_ANYEXT
);
2927 // Avoid changing the result vector type if the source element type was
2930 MI
.setDesc(MIRBuilder
.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC
));
2932 widenScalarDst(MI
, WideTy
, 0);
2935 Observer
.changedInstr(MI
);
2938 case TargetOpcode::G_SEXT_INREG
:
2940 return UnableToLegalize
;
2942 Observer
.changingInstr(MI
);
2943 widenScalarSrc(MI
, WideTy
, 1, TargetOpcode::G_ANYEXT
);
2944 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_TRUNC
);
2945 Observer
.changedInstr(MI
);
2947 case TargetOpcode::G_PTRMASK
: {
2949 return UnableToLegalize
;
2950 Observer
.changingInstr(MI
);
2951 widenScalarSrc(MI
, WideTy
, 2, TargetOpcode::G_ZEXT
);
2952 Observer
.changedInstr(MI
);
2955 case TargetOpcode::G_VECREDUCE_FADD
:
2956 case TargetOpcode::G_VECREDUCE_FMUL
:
2957 case TargetOpcode::G_VECREDUCE_FMIN
:
2958 case TargetOpcode::G_VECREDUCE_FMAX
:
2959 case TargetOpcode::G_VECREDUCE_FMINIMUM
:
2960 case TargetOpcode::G_VECREDUCE_FMAXIMUM
:
2962 return UnableToLegalize
;
2963 Observer
.changingInstr(MI
);
2964 Register VecReg
= MI
.getOperand(1).getReg();
2965 LLT VecTy
= MRI
.getType(VecReg
);
2966 LLT WideVecTy
= VecTy
.isVector()
2967 ? LLT::vector(VecTy
.getElementCount(), WideTy
)
2969 widenScalarSrc(MI
, WideVecTy
, 1, TargetOpcode::G_FPEXT
);
2970 widenScalarDst(MI
, WideTy
, 0, TargetOpcode::G_FPTRUNC
);
2971 Observer
.changedInstr(MI
);
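
/// Unmerge \p Src into pieces of type \p Ty and append them to \p Pieces.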
static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
                             MachineIRBuilder &B, Register Src, LLT Ty) {
  auto Unmerge = B.buildUnmerge(Ty, Src);
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    Pieces.push_back(Unmerge.getReg(I));
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFConstant(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();

  MachineFunction &MF = MIRBuilder.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();

  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
  Align Alignment = Align(DL.getABITypeAlign(
      getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));

  auto Addr = MIRBuilder.buildConstantPool(
      AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
                     MI.getOperand(1).getFPImm(), Alignment));

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
      MRI.getType(Dst), Alignment);

  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
  MI.eraseFromParent();

  return Legalized;
}
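
/// Lower a G_BITCAST with a vector source or destination by unmerging the
/// input into pieces (with intermediate casts when the element counts differ)
/// and remerging them as the destination type.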
3009 LegalizerHelper::LegalizeResult
3010 LegalizerHelper::lowerBitcast(MachineInstr
&MI
) {
3011 auto [Dst
, DstTy
, Src
, SrcTy
] = MI
.getFirst2RegLLTs();
3012 if (SrcTy
.isVector()) {
3013 LLT SrcEltTy
= SrcTy
.getElementType();
3014 SmallVector
<Register
, 8> SrcRegs
;
3016 if (DstTy
.isVector()) {
3017 int NumDstElt
= DstTy
.getNumElements();
3018 int NumSrcElt
= SrcTy
.getNumElements();
3020 LLT DstEltTy
= DstTy
.getElementType();
3021 LLT DstCastTy
= DstEltTy
; // Intermediate bitcast result type
3022 LLT SrcPartTy
= SrcEltTy
; // Original unmerge result type.
3024 // If there's an element size mismatch, insert intermediate casts to match
3025 // the result element type.
3026 if (NumSrcElt
< NumDstElt
) { // Source element type is larger.
3027 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3031 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3032 // %3:_(<2 x s8>) = G_BITCAST %2
3033 // %4:_(<2 x s8>) = G_BITCAST %3
3034 // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
3035 DstCastTy
= LLT::fixed_vector(NumDstElt
/ NumSrcElt
, DstEltTy
);
3036 SrcPartTy
= SrcEltTy
;
3037 } else if (NumSrcElt
> NumDstElt
) { // Source element type is smaller.
3039 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3043 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3044 // %3:_(s16) = G_BITCAST %2
3045 // %4:_(s16) = G_BITCAST %3
3046 // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
3047 SrcPartTy
= LLT::fixed_vector(NumSrcElt
/ NumDstElt
, SrcEltTy
);
3048 DstCastTy
= DstEltTy
;
3051 getUnmergePieces(SrcRegs
, MIRBuilder
, Src
, SrcPartTy
);
3052 for (Register
&SrcReg
: SrcRegs
)
3053 SrcReg
= MIRBuilder
.buildBitcast(DstCastTy
, SrcReg
).getReg(0);
3055 getUnmergePieces(SrcRegs
, MIRBuilder
, Src
, SrcEltTy
);
3057 MIRBuilder
.buildMergeLikeInstr(Dst
, SrcRegs
);
3058 MI
.eraseFromParent();
3062 if (DstTy
.isVector()) {
3063 SmallVector
<Register
, 8> SrcRegs
;
3064 getUnmergePieces(SrcRegs
, MIRBuilder
, Src
, DstTy
.getElementType());
3065 MIRBuilder
.buildMergeLikeInstr(Dst
, SrcRegs
);
3066 MI
.eraseFromParent();
3070 return UnableToLegalize
;
/// Figure out the bit offset into a register when coercing a vector index for
/// the wide element type. This is only for the case when promoting vector to
/// one with larger elements.
///
///
/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
                                                   Register Idx,
                                                   unsigned NewEltSize,
                                                   unsigned OldEltSize) {
  const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
  LLT IdxTy = B.getMRI()->getType(Idx);

  // Now figure out the amount we need to shift to get the target bits.
  auto OffsetMask = B.buildConstant(
      IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
  auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
  return B.buildShl(IdxTy, OffsetIdx,
                    B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
}
3095 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3096 /// is casting to a vector with a smaller element size, perform multiple element
3097 /// extracts and merge the results. If this is coercing to a vector with larger
3098 /// elements, index the bitcasted vector and extract the target element with bit
3099 /// operations. This is intended to force the indexing in the native register
3100 /// size for architectures that can dynamically index the register file.
3101 LegalizerHelper::LegalizeResult
3102 LegalizerHelper::bitcastExtractVectorElt(MachineInstr
&MI
, unsigned TypeIdx
,
3105 return UnableToLegalize
;
3107 auto [Dst
, DstTy
, SrcVec
, SrcVecTy
, Idx
, IdxTy
] = MI
.getFirst3RegLLTs();
3109 LLT SrcEltTy
= SrcVecTy
.getElementType();
3110 unsigned NewNumElts
= CastTy
.isVector() ? CastTy
.getNumElements() : 1;
3111 unsigned OldNumElts
= SrcVecTy
.getNumElements();
3113 LLT NewEltTy
= CastTy
.isVector() ? CastTy
.getElementType() : CastTy
;
3114 Register CastVec
= MIRBuilder
.buildBitcast(CastTy
, SrcVec
).getReg(0);
3116 const unsigned NewEltSize
= NewEltTy
.getSizeInBits();
3117 const unsigned OldEltSize
= SrcEltTy
.getSizeInBits();
3118 if (NewNumElts
> OldNumElts
) {
3119 // Decreasing the vector element size
3121 // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3123 // v4i32:castx = bitcast x:v2i64
3126 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3127 // (i32 (extract_vector_elt castx, (2 * y + 1)))
3129 if (NewNumElts
% OldNumElts
!= 0)
3130 return UnableToLegalize
;
3132 // Type of the intermediate result vector.
3133 const unsigned NewEltsPerOldElt
= NewNumElts
/ OldNumElts
;
3135 LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt
), NewEltTy
);
3137 auto NewEltsPerOldEltK
= MIRBuilder
.buildConstant(IdxTy
, NewEltsPerOldElt
);
3139 SmallVector
<Register
, 8> NewOps(NewEltsPerOldElt
);
3140 auto NewBaseIdx
= MIRBuilder
.buildMul(IdxTy
, Idx
, NewEltsPerOldEltK
);
3142 for (unsigned I
= 0; I
< NewEltsPerOldElt
; ++I
) {
3143 auto IdxOffset
= MIRBuilder
.buildConstant(IdxTy
, I
);
3144 auto TmpIdx
= MIRBuilder
.buildAdd(IdxTy
, NewBaseIdx
, IdxOffset
);
3145 auto Elt
= MIRBuilder
.buildExtractVectorElement(NewEltTy
, CastVec
, TmpIdx
);
3146 NewOps
[I
] = Elt
.getReg(0);
3149 auto NewVec
= MIRBuilder
.buildBuildVector(MidTy
, NewOps
);
3150 MIRBuilder
.buildBitcast(Dst
, NewVec
);
3151 MI
.eraseFromParent();
3155 if (NewNumElts
< OldNumElts
) {
3156 if (NewEltSize
% OldEltSize
!= 0)
3157 return UnableToLegalize
;
3159 // This only depends on powers of 2 because we use bit tricks to figure out
3160 // the bit offset we need to shift to get the target element. A general
3161 // expansion could emit division/multiply.
3162 if (!isPowerOf2_32(NewEltSize
/ OldEltSize
))
3163 return UnableToLegalize
;
3165 // Increasing the vector element size.
3166 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3170 // %cast = G_BITCAST %vec
3171 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3172 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3173 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3174 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3175 // %elt_bits = G_LSHR %wide_elt, %offset_bits
3176 // %elt = G_TRUNC %elt_bits
3178 const unsigned Log2EltRatio
= Log2_32(NewEltSize
/ OldEltSize
);
3179 auto Log2Ratio
= MIRBuilder
.buildConstant(IdxTy
, Log2EltRatio
);
3181 // Divide to get the index in the wider element type.
3182 auto ScaledIdx
= MIRBuilder
.buildLShr(IdxTy
, Idx
, Log2Ratio
);
3184 Register WideElt
= CastVec
;
3185 if (CastTy
.isVector()) {
3186 WideElt
= MIRBuilder
.buildExtractVectorElement(NewEltTy
, CastVec
,
3187 ScaledIdx
).getReg(0);
3190 // Compute the bit offset into the register of the target element.
3191 Register OffsetBits
= getBitcastWiderVectorElementOffset(
3192 MIRBuilder
, Idx
, NewEltSize
, OldEltSize
);
3194 // Shift the wide element to get the target element.
3195 auto ExtractedBits
= MIRBuilder
.buildLShr(NewEltTy
, WideElt
, OffsetBits
);
3196 MIRBuilder
.buildTrunc(Dst
, ExtractedBits
);
3197 MI
.eraseFromParent();
3201 return UnableToLegalize
;
/// Emit code to insert \p InsertReg into \p TargetReg at \p OffsetBits in \p
/// TargetReg, while preserving other bits in \p TargetReg.
///
/// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
static Register buildBitFieldInsert(MachineIRBuilder &B,
                                    Register TargetReg, Register InsertReg,
                                    Register OffsetBits) {
  LLT TargetTy = B.getMRI()->getType(TargetReg);
  LLT InsertTy = B.getMRI()->getType(InsertReg);
  auto ZextVal = B.buildZExt(TargetTy, InsertReg);
  auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);

  // Produce a bitmask of the value to insert
  auto EltMask = B.buildConstant(
      TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
                                     InsertTy.getSizeInBits()));
  // Shift it into position
  auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
  auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);

  // Clear out the bits in the wide element
  auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);

  // The value to insert has all zeros already, so stick it into the masked
  // wide element.
  return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
}
3232 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3233 /// is increasing the element size, perform the indexing in the target element
3234 /// type, and use bit operations to insert at the element position. This is
3235 /// intended for architectures that can dynamically index the register file and
3236 /// want to force indexing in the native register size.
3237 LegalizerHelper::LegalizeResult
3238 LegalizerHelper::bitcastInsertVectorElt(MachineInstr
&MI
, unsigned TypeIdx
,
3241 return UnableToLegalize
;
3243 auto [Dst
, DstTy
, SrcVec
, SrcVecTy
, Val
, ValTy
, Idx
, IdxTy
] =
3244 MI
.getFirst4RegLLTs();
3247 LLT VecEltTy
= VecTy
.getElementType();
3248 LLT NewEltTy
= CastTy
.isVector() ? CastTy
.getElementType() : CastTy
;
3249 const unsigned NewEltSize
= NewEltTy
.getSizeInBits();
3250 const unsigned OldEltSize
= VecEltTy
.getSizeInBits();
3252 unsigned NewNumElts
= CastTy
.isVector() ? CastTy
.getNumElements() : 1;
3253 unsigned OldNumElts
= VecTy
.getNumElements();
3255 Register CastVec
= MIRBuilder
.buildBitcast(CastTy
, SrcVec
).getReg(0);
3256 if (NewNumElts
< OldNumElts
) {
3257 if (NewEltSize
% OldEltSize
!= 0)
3258 return UnableToLegalize
;
3260 // This only depends on powers of 2 because we use bit tricks to figure out
3261 // the bit offset we need to shift to get the target element. A general
3262 // expansion could emit division/multiply.
3263 if (!isPowerOf2_32(NewEltSize
/ OldEltSize
))
3264 return UnableToLegalize
;
3266 const unsigned Log2EltRatio
= Log2_32(NewEltSize
/ OldEltSize
);
3267 auto Log2Ratio
= MIRBuilder
.buildConstant(IdxTy
, Log2EltRatio
);
3269 // Divide to get the index in the wider element type.
3270 auto ScaledIdx
= MIRBuilder
.buildLShr(IdxTy
, Idx
, Log2Ratio
);
3272 Register ExtractedElt
= CastVec
;
3273 if (CastTy
.isVector()) {
3274 ExtractedElt
= MIRBuilder
.buildExtractVectorElement(NewEltTy
, CastVec
,
3275 ScaledIdx
).getReg(0);
3278 // Compute the bit offset into the register of the target element.
3279 Register OffsetBits
= getBitcastWiderVectorElementOffset(
3280 MIRBuilder
, Idx
, NewEltSize
, OldEltSize
);
3282 Register InsertedElt
= buildBitFieldInsert(MIRBuilder
, ExtractedElt
,
3284 if (CastTy
.isVector()) {
3285 InsertedElt
= MIRBuilder
.buildInsertVectorElement(
3286 CastTy
, CastVec
, InsertedElt
, ScaledIdx
).getReg(0);
3289 MIRBuilder
.buildBitcast(Dst
, InsertedElt
);
3290 MI
.eraseFromParent();
3294 return UnableToLegalize
;
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the
    // source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way. A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
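// Illustrative sketch of the corresponding split performed in lowerStore()
// below for a little-endian s24 store (assumed register numbers, G_CONSTANT
// operands shown inline for brevity, truncation expressed via the MMO size):
//   %ext:_(s32) = G_ANYEXT %val(s24)
//   %hi:_(s32)  = G_LSHR %ext, 16
//   %q:_(p0)    = G_PTR_ADD %p, 2
//   G_STORE %ext:_(s32), %p(p0) :: (store (s16))
//   G_STORE %hi:_(s32), %q(p0)  :: (store (s8))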
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, we generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  if (StoreWidth != StoreSizeInBits) {
    if (SrcTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes. For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector trunc stores
    if (MemTy != SrcTy)
      return UnableToLegalize;

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
      LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
      MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
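// A small illustration of the bitcast legalization action handled below
// (types chosen arbitrarily, not from a specific target): a target without
// <16 x s8> G_AND can request CastTy = s128, which rewrites
//   %c:_(<16 x s8>) = G_AND %x, %y
// into roughly
//   %a:_(s128) = G_BITCAST %x(<16 x s8>)
//   %b:_(s128) = G_BITCAST %y(<16 x s8>)
//   %t:_(s128) = G_AND %a, %b
//   %c:_(<16 x s8>) = G_BITCAST %t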
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 2);
    bitcastSrc(MI, CastTy, 3);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 1);
    bitcastSrc(MI, CastTy, 2);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}
// Legalize an instruction by changing the opcode in place.
void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
  Observer.changingInstr(MI);
  MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
  Observer.changedInstr(MI);
}
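// As a small example of what the lowering below produces, a G_FSUB that the
// target cannot select directly is rewritten in terms of simpler operations
// (assumed virtual register numbers):
//   %d:_(s32) = G_FSUB %a, %b
// becomes
//   %n:_(s32) = G_FNEG %b
//   %d:_(s32) = G_FADD %a, %n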
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
  using namespace TargetOpcode;

  switch(MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_FCONSTANT:
    return lowerFConstant(MI);
  case TargetOpcode::G_BITCAST:
    return lowerBitcast(MI);
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto Quot =
        MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
                              {MI.getOperand(1), MI.getOperand(2)});

    auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
    MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SSUBO:
    return lowerSADDO_SSUBO(MI);
  case TargetOpcode::G_UMULH:
  case TargetOpcode::G_SMULH:
    return lowerSMULH_UMULH(MI);
  case TargetOpcode::G_SMULO:
  case TargetOpcode::G_UMULO: {
    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for
    // the result.
    auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
    LLT Ty = MRI.getType(Res);

    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
                          ? TargetOpcode::G_SMULH
                          : TargetOpcode::G_UMULH;

    Observer.changingInstr(MI);
    const auto &TII = MIRBuilder.getTII();
    MI.setDesc(TII.get(TargetOpcode::G_MUL));
    MI.removeOperand(1);
    Observer.changedInstr(MI);

    auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
    auto Zero = MIRBuilder.buildConstant(Ty, 0);

    // Move insert point forward so we can use the Res register if needed.
    MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());

    // For *signed* multiply, overflow is detected by checking:
    // (hi != (lo >> bitwidth-1))
    if (Opcode == TargetOpcode::G_SMULH) {
      auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
      auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
    } else {
      MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
    }
    return Legalized;
  }
  case TargetOpcode::G_FNEG: {
    auto [Res, SubByReg] = MI.getFirst2Regs();
    LLT Ty = MRI.getType(Res);

    // TODO: Handle vector types once we are able to
    // represent them.
    if (Ty.isVector())
      return UnableToLegalize;
    auto SignMask =
        MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
    MIRBuilder.buildXor(Res, SubByReg, SignMask);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_STRICT_FSUB: {
    auto [Res, LHS, RHS] = MI.getFirst3Regs();
    LLT Ty = MRI.getType(Res);

    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
    auto Neg = MIRBuilder.buildFNeg(Ty, RHS);

    if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
      MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
    else
      MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FMAD:
    return lowerFMad(MI);
  case TargetOpcode::G_FFLOOR:
    return lowerFFloor(MI);
  case TargetOpcode::G_INTRINSIC_ROUND:
    return lowerIntrinsicRound(MI);
  case TargetOpcode::G_FRINT: {
    // Since round even is the assumed rounding mode for unconstrained FP
    // operations, rint and roundeven are the same operation.
    changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
    return Legalized;
  }
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
    MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
                                  **MI.memoperands_begin());
    MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return lowerLoad(cast<GAnyLoad>(MI));
  case TargetOpcode::G_STORE:
    return lowerStore(cast<GStore>(MI));
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_CTTZ_ZERO_UNDEF:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTPOP:
    return lowerBitCount(MI);
  case TargetOpcode::G_UADDO: {
    auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();

    MIRBuilder.buildAdd(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_UADDE: {
    auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(CarryOut);
    const LLT Ty = MRI.getType(Res);

    // Initial add of the two operands.
    auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);

    // Initial check for carry.
    auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);

    // Add the sum and the carry.
    auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
    MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);

    // Second check for carry. We can only carry if the initial sum is all 1s
    // and the carry is set, resulting in a new sum of 0.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
    auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
    MIRBuilder.buildOr(CarryOut, Carry, Carry2);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_USUBO: {
    auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();

    MIRBuilder.buildSub(Res, LHS, RHS);
    MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);

    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_USUBE: {
    auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
    const LLT CondTy = MRI.getType(BorrowOut);
    const LLT Ty = MRI.getType(Res);

    // Initial subtract of the two operands.
    auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);

    // Initial check for borrow.
    auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);

    // Subtract the borrow from the first subtract.
    auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
    MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);

    // Second check for borrow. We can only borrow if the initial difference is
    // 0 and the borrow is set, resulting in a new difference of all 1s.
    auto Zero = MIRBuilder.buildConstant(Ty, 0);
    auto TmpResEqZero =
        MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
    auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
    MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);

    MI.eraseFromParent();
    return Legalized;
  }
  case G_UITOFP:
    return lowerUITOFP(MI);
  case G_SITOFP:
    return lowerSITOFP(MI);
  case G_FPTOUI:
    return lowerFPTOUI(MI);
  case G_FPTOSI:
    return lowerFPTOSI(MI);
  case G_FPTRUNC:
    return lowerFPTRUNC(MI);
  case G_FPOWI:
    return lowerFPOWI(MI);
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
    return lowerMinMax(MI);
  case G_FCOPYSIGN:
    return lowerFCopySign(MI);
  case G_FMINNUM:
  case G_FMAXNUM:
    return lowerFMinNumMaxNum(MI);
  case G_MERGE_VALUES:
    return lowerMergeValues(MI);
  case G_UNMERGE_VALUES:
    return lowerUnmergeValues(MI);
  case TargetOpcode::G_SEXT_INREG: {
    assert(MI.getOperand(2).isImm() && "Expected immediate");
    int64_t SizeInBits = MI.getOperand(2).getImm();

    auto [DstReg, SrcReg] = MI.getFirst2Regs();
    LLT DstTy = MRI.getType(DstReg);
    Register TmpRes = MRI.createGenericVirtualRegister(DstTy);

    auto MIBSz = MIRBuilder.buildConstant(
        DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
    MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
    MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
    MI.eraseFromParent();
    return Legalized;
  }
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return lowerExtractInsertVectorElt(MI);
  case G_SHUFFLE_VECTOR:
    return lowerShuffleVector(MI);
  case G_DYN_STACKALLOC:
    return lowerDynStackAlloc(MI);
  case G_STACKSAVE:
    return lowerStackSave(MI);
  case G_STACKRESTORE:
    return lowerStackRestore(MI);
  case G_EXTRACT:
    return lowerExtract(MI);
  case G_INSERT:
    return lowerInsert(MI);
  case G_BSWAP:
    return lowerBswap(MI);
  case G_BITREVERSE:
    return lowerBitreverse(MI);
  case G_READ_REGISTER:
  case G_WRITE_REGISTER:
    return lowerReadWriteRegister(MI);
  case G_UADDSAT:
  case G_USUBSAT: {
    // Try to make a reasonable guess about which lowering strategy to use. The
    // target can override this with custom lowering and calling the
    // implementation functions.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    if (LI.isLegalOrCustom({G_UMIN, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SADDSAT:
  case G_SSUBSAT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());

    // FIXME: It would probably make more sense to see if G_SADDO is preferred,
    // since it's a shorter expansion. However, we would need to figure out the
    // preferred boolean type for the carry out for the query.
    if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
      return lowerAddSubSatToMinMax(MI);
    return lowerAddSubSatToAddoSubo(MI);
  }
  case G_SSHLSAT:
  case G_USHLSAT:
    return lowerShlSat(MI);
  case G_ABS:
    return lowerAbsToAddXor(MI);
  case G_SELECT:
    return lowerSelect(MI);
  case G_IS_FPCLASS:
    return lowerISFPCLASS(MI);
  case G_SDIVREM:
  case G_UDIVREM:
    return lowerDIVREM(MI);
  case G_FSHL:
  case G_FSHR:
    return lowerFunnelShift(MI);
  case G_ROTL:
  case G_ROTR:
    return lowerRotate(MI);
  case G_MEMCPY:
  case G_MEMMOVE:
  case G_MEMSET:
    return lowerMemCpyFamily(MI);
  case G_MEMCPY_INLINE:
    return lowerMemcpyInline(MI);
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
    return lowerEXT(MI);
  case G_TRUNC:
    return lowerTRUNC(MI);
  GISEL_VECREDUCE_CASES_NONSEQ
    return lowerVectorReduction(MI);
  case G_VAARG:
    return lowerVAArg(MI);
  }
}
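// For example, an s64 stack temporary gets at least 8-byte alignment from the
// helper below (PowerOf2Ceil of its byte size), raised further if the caller
// passes a larger MinAlign.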
Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
                                                  Align MinAlign) const {
  // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
  // datalayout for the preferred alignment. Also there should be a target hook
  // for this to allow targets to reduce the alignment and ignore the
  // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless
  // of the type size.
  return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
}
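// Hedged usage sketch (caller-side names assumed, not a verbatim call site):
//   MachinePointerInfo PtrInfo;
//   auto Addr = createStackTemporary(TypeSize::getFixed(Ty.getSizeInBytes()),
//                                    getStackTemporaryAlignment(Ty), PtrInfo);
//   MIRBuilder.buildStore(Val, Addr, PtrInfo, getStackTemporaryAlignment(Ty));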
MachineInstrBuilder
LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
                                      MachinePointerInfo &PtrInfo) {
  MachineFunction &MF = MIRBuilder.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);

  unsigned AddrSpace = DL.getAllocaAddrSpace();
  LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));

  PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
  return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
}
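// For example (illustrative only): with a 4-element vector the dynamic index
// below is clamped with a mask (Idx & 3), while a 6-element vector uses
// umin(Idx, 5) instead, since 6 is not a power of two.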
static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
                                        LLT VecTy) {
  int64_t IdxVal;
  if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
    return IdxReg;

  LLT IdxTy = B.getMRI()->getType(IdxReg);
  unsigned NElts = VecTy.getNumElements();
  if (isPowerOf2_32(NElts)) {
    APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
    return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
  }

  return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
      .getReg(0);
}

Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
                                                  Register Index) {
  LLT EltTy = VecTy.getElementType();

  // Calculate the element offset and add it to the pointer.
  unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
  assert(EltSize * 8 == EltTy.getSizeInBits() &&
         "Converting bits to bytes lost precision");

  Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);

  LLT IdxTy = MRI.getType(Index);
  auto Mul = MIRBuilder.buildMul(IdxTy, Index,
                                 MIRBuilder.buildConstant(IdxTy, EltSize));

  LLT PtrTy = MRI.getType(VecPtr);
  return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
}
/// Check that all vector operands have the same number of elements. Other
/// operands should be listed in NonVecOpIndices.
static bool hasSameNumEltsOnAllVectorOperands(
    GenericMachineInstr &MI, MachineRegisterInfo &MRI,
    std::initializer_list<unsigned> NonVecOpIndices) {
  if (MI.getNumMemOperands() != 0)
    return false;

  LLT VecTy = MRI.getType(MI.getReg(0));
  if (!VecTy.isVector())
    return false;
  unsigned NumElts = VecTy.getNumElements();

  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
    MachineOperand &Op = MI.getOperand(OpIdx);
    if (!Op.isReg()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    LLT Ty = MRI.getType(Op.getReg());
    if (!Ty.isVector()) {
      if (!is_contained(NonVecOpIndices, OpIdx))
        return false;
      continue;
    }

    if (Ty.getNumElements() != NumElts)
      return false;
  }

  return true;
}

/// Fill \p DstOps with DstOps that have same number of elements combined as
/// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
/// vectors with \p NumElts elements. When Ty.getNumElements() is not a
/// multiple of \p NumElts, the last DstOp (leftover) has fewer than
/// \p NumElts elements.
static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
                       unsigned NumElts) {
  LLT LeftoverTy;
  assert(Ty.isVector() && "Expected vector type");
  LLT EltTy = Ty.getElementType();
  LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
  int NumParts, NumLeftover;
  std::tie(NumParts, NumLeftover) =
      getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);

  assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
  for (int i = 0; i < NumParts; ++i) {
    DstOps.push_back(NarrowTy);
  }

  if (LeftoverTy.isValid()) {
    assert(NumLeftover == 1 && "expected exactly one leftover");
    DstOps.push_back(LeftoverTy);
  }
}

/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N
/// SrcOps made from \p Op depending on operand type.
static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
                           MachineOperand &Op) {
  for (unsigned i = 0; i < N; ++i) {
    if (Op.isReg())
      Ops.push_back(Op.getReg());
    else if (Op.isImm())
      Ops.push_back(Op.getImm());
    else if (Op.isPredicate())
      Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
    else
      llvm_unreachable("Unsupported type");
  }
}
4062 // Handle splitting vector operations which need to have the same number of
4063 // elements in each type index, but each type index may have a different element
4066 // e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4067 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4068 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4070 // Also handles some irregular breakdown cases, e.g.
4071 // e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4072 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4073 // s64 = G_SHL s64, s32
4074 LegalizerHelper::LegalizeResult
4075 LegalizerHelper::fewerElementsVectorMultiEltType(
4076 GenericMachineInstr
&MI
, unsigned NumElts
,
4077 std::initializer_list
<unsigned> NonVecOpIndices
) {
4078 assert(hasSameNumEltsOnAllVectorOperands(MI
, MRI
, NonVecOpIndices
) &&
4079 "Non-compatible opcode or not specified non-vector operands");
4080 unsigned OrigNumElts
= MRI
.getType(MI
.getReg(0)).getNumElements();
4082 unsigned NumInputs
= MI
.getNumOperands() - MI
.getNumDefs();
4083 unsigned NumDefs
= MI
.getNumDefs();
4085 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
4086 // Build instructions with DstOps to use instruction found by CSE directly.
4087 // CSE copies found instruction into given vreg when building with vreg dest.
4088 SmallVector
<SmallVector
<DstOp
, 8>, 2> OutputOpsPieces(NumDefs
);
4089 // Output registers will be taken from created instructions.
4090 SmallVector
<SmallVector
<Register
, 8>, 2> OutputRegs(NumDefs
);
4091 for (unsigned i
= 0; i
< NumDefs
; ++i
) {
4092 makeDstOps(OutputOpsPieces
[i
], MRI
.getType(MI
.getReg(i
)), NumElts
);
4095 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4096 // Operands listed in NonVecOpIndices will be used as is without splitting;
4097 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4098 // scalar condition (op 1), immediate in sext_inreg (op 2).
4099 SmallVector
<SmallVector
<SrcOp
, 8>, 3> InputOpsPieces(NumInputs
);
4100 for (unsigned UseIdx
= NumDefs
, UseNo
= 0; UseIdx
< MI
.getNumOperands();
4101 ++UseIdx
, ++UseNo
) {
4102 if (is_contained(NonVecOpIndices
, UseIdx
)) {
4103 broadcastSrcOp(InputOpsPieces
[UseNo
], OutputOpsPieces
[0].size(),
4104 MI
.getOperand(UseIdx
));
4106 SmallVector
<Register
, 8> SplitPieces
;
4107 extractVectorParts(MI
.getReg(UseIdx
), NumElts
, SplitPieces
, MIRBuilder
,
4109 for (auto Reg
: SplitPieces
)
4110 InputOpsPieces
[UseNo
].push_back(Reg
);
4114 unsigned NumLeftovers
= OrigNumElts
% NumElts
? 1 : 0;
4116 // Take i-th piece of each input operand split and build sub-vector/scalar
4117 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
4118 for (unsigned i
= 0; i
< OrigNumElts
/ NumElts
+ NumLeftovers
; ++i
) {
4119 SmallVector
<DstOp
, 2> Defs
;
4120 for (unsigned DstNo
= 0; DstNo
< NumDefs
; ++DstNo
)
4121 Defs
.push_back(OutputOpsPieces
[DstNo
][i
]);
4123 SmallVector
<SrcOp
, 3> Uses
;
4124 for (unsigned InputNo
= 0; InputNo
< NumInputs
; ++InputNo
)
4125 Uses
.push_back(InputOpsPieces
[InputNo
][i
]);
4127 auto I
= MIRBuilder
.buildInstr(MI
.getOpcode(), Defs
, Uses
, MI
.getFlags());
4128 for (unsigned DstNo
= 0; DstNo
< NumDefs
; ++DstNo
)
4129 OutputRegs
[DstNo
].push_back(I
.getReg(DstNo
));
4132 // Merge small outputs into MI's output for each def operand.
4134 for (unsigned i
= 0; i
< NumDefs
; ++i
)
4135 mergeMixedSubvectors(MI
.getReg(i
), OutputRegs
[i
]);
4137 for (unsigned i
= 0; i
< NumDefs
; ++i
)
4138 MIRBuilder
.buildMergeLikeInstr(MI
.getReg(i
), OutputRegs
[i
]);
4141 MI
.eraseFromParent();
4145 LegalizerHelper::LegalizeResult
4146 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr
&MI
,
4148 unsigned OrigNumElts
= MRI
.getType(MI
.getReg(0)).getNumElements();
4150 unsigned NumInputs
= MI
.getNumOperands() - MI
.getNumDefs();
4151 unsigned NumDefs
= MI
.getNumDefs();
4153 SmallVector
<DstOp
, 8> OutputOpsPieces
;
4154 SmallVector
<Register
, 8> OutputRegs
;
4155 makeDstOps(OutputOpsPieces
, MRI
.getType(MI
.getReg(0)), NumElts
);
4157 // Instructions that perform register split will be inserted in basic block
4158 // where register is defined (basic block is in the next operand).
4159 SmallVector
<SmallVector
<Register
, 8>, 3> InputOpsPieces(NumInputs
/ 2);
4160 for (unsigned UseIdx
= NumDefs
, UseNo
= 0; UseIdx
< MI
.getNumOperands();
4161 UseIdx
+= 2, ++UseNo
) {
4162 MachineBasicBlock
&OpMBB
= *MI
.getOperand(UseIdx
+ 1).getMBB();
4163 MIRBuilder
.setInsertPt(OpMBB
, OpMBB
.getFirstTerminatorForward());
4164 extractVectorParts(MI
.getReg(UseIdx
), NumElts
, InputOpsPieces
[UseNo
],
4168 // Build PHIs with fewer elements.
4169 unsigned NumLeftovers
= OrigNumElts
% NumElts
? 1 : 0;
4170 MIRBuilder
.setInsertPt(*MI
.getParent(), MI
);
4171 for (unsigned i
= 0; i
< OrigNumElts
/ NumElts
+ NumLeftovers
; ++i
) {
4172 auto Phi
= MIRBuilder
.buildInstr(TargetOpcode::G_PHI
);
4174 MRI
.createGenericVirtualRegister(OutputOpsPieces
[i
].getLLTTy(MRI
)));
4175 OutputRegs
.push_back(Phi
.getReg(0));
4177 for (unsigned j
= 0; j
< NumInputs
/ 2; ++j
) {
4178 Phi
.addUse(InputOpsPieces
[j
][i
]);
4179 Phi
.add(MI
.getOperand(1 + j
* 2 + 1));
4183 // Merge small outputs into MI's def.
4185 mergeMixedSubvectors(MI
.getReg(0), OutputRegs
);
4187 MIRBuilder
.buildMergeLikeInstr(MI
.getReg(0), OutputRegs
);
4190 MI
.eraseFromParent();
4194 LegalizerHelper::LegalizeResult
4195 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr
&MI
,
4198 const int NumDst
= MI
.getNumOperands() - 1;
4199 const Register SrcReg
= MI
.getOperand(NumDst
).getReg();
4200 LLT DstTy
= MRI
.getType(MI
.getOperand(0).getReg());
4201 LLT SrcTy
= MRI
.getType(SrcReg
);
4203 if (TypeIdx
!= 1 || NarrowTy
== DstTy
)
4204 return UnableToLegalize
;
4206 // Requires compatible types. Otherwise SrcReg should have been defined by
4207 // merge-like instruction that would get artifact combined. Most likely
4208 // instruction that defines SrcReg has to perform more/fewer elements
4209 // legalization compatible with NarrowTy.
4210 assert(SrcTy
.isVector() && NarrowTy
.isVector() && "Expected vector types");
4211 assert((SrcTy
.getScalarType() == NarrowTy
.getScalarType()) && "bad type");
4213 if ((SrcTy
.getSizeInBits() % NarrowTy
.getSizeInBits() != 0) ||
4214 (NarrowTy
.getSizeInBits() % DstTy
.getSizeInBits() != 0))
4215 return UnableToLegalize
;
4217 // This is most likely DstTy (smaller then register size) packed in SrcTy
4218 // (larger then register size) and since unmerge was not combined it will be
4219 // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
4220 // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
4222 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
4224 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
4225 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
4226 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
4227 auto Unmerge
= MIRBuilder
.buildUnmerge(NarrowTy
, SrcReg
);
4228 const int NumUnmerge
= Unmerge
->getNumOperands() - 1;
4229 const int PartsPerUnmerge
= NumDst
/ NumUnmerge
;
4231 for (int I
= 0; I
!= NumUnmerge
; ++I
) {
4232 auto MIB
= MIRBuilder
.buildInstr(TargetOpcode::G_UNMERGE_VALUES
);
4234 for (int J
= 0; J
!= PartsPerUnmerge
; ++J
)
4235 MIB
.addDef(MI
.getOperand(I
* PartsPerUnmerge
+ J
).getReg());
4236 MIB
.addUse(Unmerge
.getReg(I
));
4239 MI
.eraseFromParent();
4243 LegalizerHelper::LegalizeResult
4244 LegalizerHelper::fewerElementsVectorMerge(MachineInstr
&MI
, unsigned TypeIdx
,
4246 auto [DstReg
, DstTy
, SrcReg
, SrcTy
] = MI
.getFirst2RegLLTs();
4247 // Requires compatible types. Otherwise user of DstReg did not perform unmerge
4248 // that should have been artifact combined. Most likely instruction that uses
4249 // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
4250 assert(DstTy
.isVector() && NarrowTy
.isVector() && "Expected vector types");
4251 assert((DstTy
.getScalarType() == NarrowTy
.getScalarType()) && "bad type");
4252 if (NarrowTy
== SrcTy
)
4253 return UnableToLegalize
;
4255 // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
4256 // is for old mir tests. Since the changes to more/fewer elements it should no
4257 // longer be possible to generate MIR like this when starting from llvm-ir
4258 // because LCMTy approach was replaced with merge/unmerge to vector elements.
4260 assert(SrcTy
.isVector() && "Expected vector types");
4261 assert((SrcTy
.getScalarType() == NarrowTy
.getScalarType()) && "bad type");
4262 if ((DstTy
.getSizeInBits() % NarrowTy
.getSizeInBits() != 0) ||
4263 (NarrowTy
.getNumElements() >= SrcTy
.getNumElements()))
4264 return UnableToLegalize
;
4265 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
4267 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
4268 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
4269 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
4270 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
4271 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
4272 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
4274 SmallVector
<Register
, 8> Elts
;
4275 LLT EltTy
= MRI
.getType(MI
.getOperand(1).getReg()).getScalarType();
4276 for (unsigned i
= 1; i
< MI
.getNumOperands(); ++i
) {
4277 auto Unmerge
= MIRBuilder
.buildUnmerge(EltTy
, MI
.getOperand(i
).getReg());
4278 for (unsigned j
= 0; j
< Unmerge
->getNumDefs(); ++j
)
4279 Elts
.push_back(Unmerge
.getReg(j
));
4282 SmallVector
<Register
, 8> NarrowTyElts
;
4283 unsigned NumNarrowTyElts
= NarrowTy
.getNumElements();
4284 unsigned NumNarrowTyPieces
= DstTy
.getNumElements() / NumNarrowTyElts
;
4285 for (unsigned i
= 0, Offset
= 0; i
< NumNarrowTyPieces
;
4286 ++i
, Offset
+= NumNarrowTyElts
) {
4287 ArrayRef
<Register
> Pieces(&Elts
[Offset
], NumNarrowTyElts
);
4288 NarrowTyElts
.push_back(
4289 MIRBuilder
.buildMergeLikeInstr(NarrowTy
, Pieces
).getReg(0));
4292 MIRBuilder
.buildMergeLikeInstr(DstReg
, NarrowTyElts
);
4293 MI
.eraseFromParent();
4297 assert(TypeIdx
== 0 && "Bad type index");
4298 if ((NarrowTy
.getSizeInBits() % SrcTy
.getSizeInBits() != 0) ||
4299 (DstTy
.getSizeInBits() % NarrowTy
.getSizeInBits() != 0))
4300 return UnableToLegalize
;
4302 // This is most likely SrcTy (smaller then register size) packed in DstTy
4303 // (larger then register size) and since merge was not combined it will be
4304 // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
4305 // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.
4307 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
4309 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
4310 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
4311 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
4312 SmallVector
<Register
, 8> NarrowTyElts
;
4313 unsigned NumParts
= DstTy
.getNumElements() / NarrowTy
.getNumElements();
4314 unsigned NumSrcElts
= SrcTy
.isVector() ? SrcTy
.getNumElements() : 1;
4315 unsigned NumElts
= NarrowTy
.getNumElements() / NumSrcElts
;
4316 for (unsigned i
= 0; i
< NumParts
; ++i
) {
4317 SmallVector
<Register
, 8> Sources
;
4318 for (unsigned j
= 0; j
< NumElts
; ++j
)
4319 Sources
.push_back(MI
.getOperand(1 + i
* NumElts
+ j
).getReg());
4320 NarrowTyElts
.push_back(
4321 MIRBuilder
.buildMergeLikeInstr(NarrowTy
, Sources
).getReg(0));
4324 MIRBuilder
.buildMergeLikeInstr(DstReg
, NarrowTyElts
);
4325 MI
.eraseFromParent();
4329 LegalizerHelper::LegalizeResult
4330 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr
&MI
,
4333 auto [DstReg
, SrcVec
] = MI
.getFirst2Regs();
4335 bool IsInsert
= MI
.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT
;
4337 assert((IsInsert
? TypeIdx
== 0 : TypeIdx
== 1) && "not a vector type index");
4339 InsertVal
= MI
.getOperand(2).getReg();
4341 Register Idx
= MI
.getOperand(MI
.getNumOperands() - 1).getReg();
4343 // TODO: Handle total scalarization case.
4344 if (!NarrowVecTy
.isVector())
4345 return UnableToLegalize
;
4347 LLT VecTy
= MRI
.getType(SrcVec
);
4349 // If the index is a constant, we can really break this down as you would
4350 // expect, and index into the target size pieces.
4352 auto MaybeCst
= getIConstantVRegValWithLookThrough(Idx
, MRI
);
4354 IdxVal
= MaybeCst
->Value
.getSExtValue();
4355 // Avoid out of bounds indexing the pieces.
4356 if (IdxVal
>= VecTy
.getNumElements()) {
4357 MIRBuilder
.buildUndef(DstReg
);
4358 MI
.eraseFromParent();
4362 SmallVector
<Register
, 8> VecParts
;
4363 LLT GCDTy
= extractGCDType(VecParts
, VecTy
, NarrowVecTy
, SrcVec
);
4365 // Build a sequence of NarrowTy pieces in VecParts for this operand.
4366 LLT LCMTy
= buildLCMMergePieces(VecTy
, NarrowVecTy
, GCDTy
, VecParts
,
4367 TargetOpcode::G_ANYEXT
);
4369 unsigned NewNumElts
= NarrowVecTy
.getNumElements();
4371 LLT IdxTy
= MRI
.getType(Idx
);
4372 int64_t PartIdx
= IdxVal
/ NewNumElts
;
4374 MIRBuilder
.buildConstant(IdxTy
, IdxVal
- NewNumElts
* PartIdx
);
4377 LLT PartTy
= MRI
.getType(VecParts
[PartIdx
]);
4379 // Use the adjusted index to insert into one of the subvectors.
4380 auto InsertPart
= MIRBuilder
.buildInsertVectorElement(
4381 PartTy
, VecParts
[PartIdx
], InsertVal
, NewIdx
);
4382 VecParts
[PartIdx
] = InsertPart
.getReg(0);
4384 // Recombine the inserted subvector with the others to reform the result
4386 buildWidenedRemergeToDst(DstReg
, LCMTy
, VecParts
);
4388 MIRBuilder
.buildExtractVectorElement(DstReg
, VecParts
[PartIdx
], NewIdx
);
4391 MI
.eraseFromParent();
4395 // With a variable index, we can't perform the operation in a smaller type, so
4396 // we're forced to expand this.
4398 // TODO: We could emit a chain of compare/select to figure out which piece to
4400 return lowerExtractInsertVectorElt(MI
);
4403 LegalizerHelper::LegalizeResult
4404 LegalizerHelper::reduceLoadStoreWidth(GLoadStore
&LdStMI
, unsigned TypeIdx
,
4406 // FIXME: Don't know how to handle secondary types yet.
4408 return UnableToLegalize
;
4410 // This implementation doesn't work for atomics. Give up instead of doing
4411 // something invalid.
4412 if (LdStMI
.isAtomic())
4413 return UnableToLegalize
;
4415 bool IsLoad
= isa
<GLoad
>(LdStMI
);
4416 Register ValReg
= LdStMI
.getReg(0);
4417 Register AddrReg
= LdStMI
.getPointerReg();
4418 LLT ValTy
= MRI
.getType(ValReg
);
4420 // FIXME: Do we need a distinct NarrowMemory legalize action?
4421 if (ValTy
.getSizeInBits() != 8 * LdStMI
.getMemSize()) {
4422 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4423 return UnableToLegalize
;
4427 int NumLeftover
= -1;
4429 SmallVector
<Register
, 8> NarrowRegs
, NarrowLeftoverRegs
;
4431 std::tie(NumParts
, NumLeftover
) = getNarrowTypeBreakDown(ValTy
, NarrowTy
, LeftoverTy
);
4433 if (extractParts(ValReg
, ValTy
, NarrowTy
, LeftoverTy
, NarrowRegs
,
4434 NarrowLeftoverRegs
, MIRBuilder
, MRI
)) {
4435 NumParts
= NarrowRegs
.size();
4436 NumLeftover
= NarrowLeftoverRegs
.size();
4441 return UnableToLegalize
;
4443 LLT PtrTy
= MRI
.getType(AddrReg
);
4444 const LLT OffsetTy
= LLT::scalar(PtrTy
.getSizeInBits());
4446 unsigned TotalSize
= ValTy
.getSizeInBits();
4448 // Split the load/store into PartTy sized pieces starting at Offset. If this
4449 // is a load, return the new registers in ValRegs. For a store, each elements
4450 // of ValRegs should be PartTy. Returns the next offset that needs to be
4452 bool isBigEndian
= MIRBuilder
.getDataLayout().isBigEndian();
4453 auto MMO
= LdStMI
.getMMO();
4454 auto splitTypePieces
= [=](LLT PartTy
, SmallVectorImpl
<Register
> &ValRegs
,
4455 unsigned NumParts
, unsigned Offset
) -> unsigned {
4456 MachineFunction
&MF
= MIRBuilder
.getMF();
4457 unsigned PartSize
= PartTy
.getSizeInBits();
4458 for (unsigned Idx
= 0, E
= NumParts
; Idx
!= E
&& Offset
< TotalSize
;
4460 unsigned ByteOffset
= Offset
/ 8;
4461 Register NewAddrReg
;
4463 MIRBuilder
.materializePtrAdd(NewAddrReg
, AddrReg
, OffsetTy
, ByteOffset
);
4465 MachineMemOperand
*NewMMO
=
4466 MF
.getMachineMemOperand(&MMO
, ByteOffset
, PartTy
);
4469 Register Dst
= MRI
.createGenericVirtualRegister(PartTy
);
4470 ValRegs
.push_back(Dst
);
4471 MIRBuilder
.buildLoad(Dst
, NewAddrReg
, *NewMMO
);
4473 MIRBuilder
.buildStore(ValRegs
[Idx
], NewAddrReg
, *NewMMO
);
4475 Offset
= isBigEndian
? Offset
- PartSize
: Offset
+ PartSize
;
4481 unsigned Offset
= isBigEndian
? TotalSize
- NarrowTy
.getSizeInBits() : 0;
4482 unsigned HandledOffset
=
4483 splitTypePieces(NarrowTy
, NarrowRegs
, NumParts
, Offset
);
4485 // Handle the rest of the register if this isn't an even type breakdown.
4486 if (LeftoverTy
.isValid())
4487 splitTypePieces(LeftoverTy
, NarrowLeftoverRegs
, NumLeftover
, HandledOffset
);
4490 insertParts(ValReg
, ValTy
, NarrowTy
, NarrowRegs
,
4491 LeftoverTy
, NarrowLeftoverRegs
);
4494 LdStMI
.eraseFromParent();
4498 LegalizerHelper::LegalizeResult
4499 LegalizerHelper::fewerElementsVector(MachineInstr
&MI
, unsigned TypeIdx
,
4501 using namespace TargetOpcode
;
4502 GenericMachineInstr
&GMI
= cast
<GenericMachineInstr
>(MI
);
4503 unsigned NumElts
= NarrowTy
.isVector() ? NarrowTy
.getNumElements() : 1;
4505 switch (MI
.getOpcode()) {
4506 case G_IMPLICIT_DEF
:
4522 case G_FCANONICALIZE
:
4539 case G_INTRINSIC_ROUND
:
4540 case G_INTRINSIC_ROUNDEVEN
:
4541 case G_INTRINSIC_TRUNC
:
4560 case G_FMINNUM_IEEE
:
4561 case G_FMAXNUM_IEEE
:
4581 case G_CTLZ_ZERO_UNDEF
:
4583 case G_CTTZ_ZERO_UNDEF
:
4597 case G_ADDRSPACE_CAST
:
4610 case G_STRICT_FLDEXP
:
4612 return fewerElementsVectorMultiEltType(GMI
, NumElts
);
4615 return fewerElementsVectorMultiEltType(GMI
, NumElts
, {1 /*cpm predicate*/});
4617 return fewerElementsVectorMultiEltType(GMI
, NumElts
, {2, 3 /*mask,fpsem*/});
4619 if (MRI
.getType(MI
.getOperand(1).getReg()).isVector())
4620 return fewerElementsVectorMultiEltType(GMI
, NumElts
);
4621 return fewerElementsVectorMultiEltType(GMI
, NumElts
, {1 /*scalar cond*/});
4623 return fewerElementsVectorPhi(GMI
, NumElts
);
4624 case G_UNMERGE_VALUES
:
4625 return fewerElementsVectorUnmergeValues(MI
, TypeIdx
, NarrowTy
);
4626 case G_BUILD_VECTOR
:
4627 assert(TypeIdx
== 0 && "not a vector type index");
4628 return fewerElementsVectorMerge(MI
, TypeIdx
, NarrowTy
);
4629 case G_CONCAT_VECTORS
:
4630 if (TypeIdx
!= 1) // TODO: This probably does work as expected already.
4631 return UnableToLegalize
;
4632 return fewerElementsVectorMerge(MI
, TypeIdx
, NarrowTy
);
4633 case G_EXTRACT_VECTOR_ELT
:
4634 case G_INSERT_VECTOR_ELT
:
4635 return fewerElementsVectorExtractInsertVectorElt(MI
, TypeIdx
, NarrowTy
);
4638 return reduceLoadStoreWidth(cast
<GLoadStore
>(MI
), TypeIdx
, NarrowTy
);
4640 return fewerElementsVectorMultiEltType(GMI
, NumElts
, {2 /*imm*/});
4641 GISEL_VECREDUCE_CASES_NONSEQ
4642 return fewerElementsVectorReductions(MI
, TypeIdx
, NarrowTy
);
4643 case TargetOpcode::G_VECREDUCE_SEQ_FADD
:
4644 case TargetOpcode::G_VECREDUCE_SEQ_FMUL
:
4645 return fewerElementsVectorSeqReductions(MI
, TypeIdx
, NarrowTy
);
4646 case G_SHUFFLE_VECTOR
:
4647 return fewerElementsVectorShuffle(MI
, TypeIdx
, NarrowTy
);
4649 return fewerElementsVectorMultiEltType(GMI
, NumElts
, {2 /*pow*/});
4651 return UnableToLegalize
;
4655 LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorShuffle(
4656 MachineInstr
&MI
, unsigned int TypeIdx
, LLT NarrowTy
) {
4657 assert(MI
.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR
);
4659 return UnableToLegalize
;
4661 auto [DstReg
, DstTy
, Src1Reg
, Src1Ty
, Src2Reg
, Src2Ty
] =
4662 MI
.getFirst3RegLLTs();
4663 ArrayRef
<int> Mask
= MI
.getOperand(3).getShuffleMask();
4664 // The shuffle should be canonicalized by now.
4665 if (DstTy
!= Src1Ty
)
4666 return UnableToLegalize
;
4667 if (DstTy
!= Src2Ty
)
4668 return UnableToLegalize
;
4670 if (!isPowerOf2_32(DstTy
.getNumElements()))
4671 return UnableToLegalize
;
4673 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4674 // Further legalization attempts will be needed to do split further.
4676 DstTy
.changeElementCount(DstTy
.getElementCount().divideCoefficientBy(2));
4677 unsigned NewElts
= NarrowTy
.getNumElements();
4679 SmallVector
<Register
> SplitSrc1Regs
, SplitSrc2Regs
;
4680 extractParts(Src1Reg
, NarrowTy
, 2, SplitSrc1Regs
, MIRBuilder
, MRI
);
4681 extractParts(Src2Reg
, NarrowTy
, 2, SplitSrc2Regs
, MIRBuilder
, MRI
);
4682 Register Inputs
[4] = {SplitSrc1Regs
[0], SplitSrc1Regs
[1], SplitSrc2Regs
[0],
4687 // If Lo or Hi uses elements from at most two of the four input vectors, then
4688 // express it as a vector shuffle of those two inputs. Otherwise extract the
4689 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4690 SmallVector
<int, 16> Ops
;
4691 for (unsigned High
= 0; High
< 2; ++High
) {
4692 Register
&Output
= High
? Hi
: Lo
;
4694 // Build a shuffle mask for the output, discovering on the fly which
4695 // input vectors to use as shuffle operands (recorded in InputUsed).
4696 // If building a suitable shuffle vector proves too hard, then bail
4697 // out with useBuildVector set.
4698 unsigned InputUsed
[2] = {-1U, -1U}; // Not yet discovered.
4699 unsigned FirstMaskIdx
= High
* NewElts
;
4700 bool UseBuildVector
= false;
4701 for (unsigned MaskOffset
= 0; MaskOffset
< NewElts
; ++MaskOffset
) {
4702 // The mask element. This indexes into the input.
4703 int Idx
= Mask
[FirstMaskIdx
+ MaskOffset
];
4705 // The input vector this mask element indexes into.
4706 unsigned Input
= (unsigned)Idx
/ NewElts
;
4708 if (Input
>= std::size(Inputs
)) {
4709 // The mask element does not index into any input vector.
4714 // Turn the index into an offset from the start of the input vector.
4715 Idx
-= Input
* NewElts
;
4717 // Find or create a shuffle vector operand to hold this input.
4719 for (OpNo
= 0; OpNo
< std::size(InputUsed
); ++OpNo
) {
4720 if (InputUsed
[OpNo
] == Input
) {
4721 // This input vector is already an operand.
4723 } else if (InputUsed
[OpNo
] == -1U) {
4724 // Create a new operand for this input vector.
4725 InputUsed
[OpNo
] = Input
;
4730 if (OpNo
>= std::size(InputUsed
)) {
4731 // More than two input vectors used! Give up on trying to create a
4732 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
4733 UseBuildVector
= true;
4737 // Add the mask index for the new shuffle vector.
4738 Ops
.push_back(Idx
+ OpNo
* NewElts
);
4741 if (UseBuildVector
) {
4742 LLT EltTy
= NarrowTy
.getElementType();
4743 SmallVector
<Register
, 16> SVOps
;
4745 // Extract the input elements by hand.
4746 for (unsigned MaskOffset
= 0; MaskOffset
< NewElts
; ++MaskOffset
) {
4747 // The mask element. This indexes into the input.
4748 int Idx
= Mask
[FirstMaskIdx
+ MaskOffset
];
4750 // The input vector this mask element indexes into.
4751 unsigned Input
= (unsigned)Idx
/ NewElts
;
4753 if (Input
>= std::size(Inputs
)) {
4754 // The mask element is "undef" or indexes off the end of the input.
4755 SVOps
.push_back(MIRBuilder
.buildUndef(EltTy
).getReg(0));
4759 // Turn the index into an offset from the start of the input vector.
4760 Idx
-= Input
* NewElts
;
4762 // Extract the vector element by hand.
4763 SVOps
.push_back(MIRBuilder
4764 .buildExtractVectorElement(
4765 EltTy
, Inputs
[Input
],
4766 MIRBuilder
.buildConstant(LLT::scalar(32), Idx
))
4770 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4771 Output
= MIRBuilder
.buildBuildVector(NarrowTy
, SVOps
).getReg(0);
4772 } else if (InputUsed
[0] == -1U) {
4773 // No input vectors were used! The result is undefined.
4774 Output
= MIRBuilder
.buildUndef(NarrowTy
).getReg(0);
4776 Register Op0
= Inputs
[InputUsed
[0]];
4777 // If only one input was used, use an undefined vector for the other.
4778 Register Op1
= InputUsed
[1] == -1U
4779 ? MIRBuilder
.buildUndef(NarrowTy
).getReg(0)
4780 : Inputs
[InputUsed
[1]];
4781 // At least one input vector was used. Create a new shuffle vector.
4782 Output
= MIRBuilder
.buildShuffleVector(NarrowTy
, Op0
, Op1
, Ops
).getReg(0);
4788 MIRBuilder
.buildConcatVectors(DstReg
, {Lo
, Hi
});
4789 MI
.eraseFromParent();
4793 LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorReductions(
4794 MachineInstr
&MI
, unsigned int TypeIdx
, LLT NarrowTy
) {
4795 auto &RdxMI
= cast
<GVecReduce
>(MI
);
4798 return UnableToLegalize
;
4800 // The semantics of the normal non-sequential reductions allow us to freely
4801 // re-associate the operation.
4802 auto [DstReg
, DstTy
, SrcReg
, SrcTy
] = RdxMI
.getFirst2RegLLTs();
4804 if (NarrowTy
.isVector() &&
4805 (SrcTy
.getNumElements() % NarrowTy
.getNumElements() != 0))
4806 return UnableToLegalize
;
4808 unsigned ScalarOpc
= RdxMI
.getScalarOpcForReduction();
4809 SmallVector
<Register
> SplitSrcs
;
4810 // If NarrowTy is a scalar then we're being asked to scalarize.
4811 const unsigned NumParts
=
4812 NarrowTy
.isVector() ? SrcTy
.getNumElements() / NarrowTy
.getNumElements()
4813 : SrcTy
.getNumElements();
4815 extractParts(SrcReg
, NarrowTy
, NumParts
, SplitSrcs
, MIRBuilder
, MRI
);
4816 if (NarrowTy
.isScalar()) {
4817 if (DstTy
!= NarrowTy
)
4818 return UnableToLegalize
; // FIXME: handle implicit extensions.
4820 if (isPowerOf2_32(NumParts
)) {
4821 // Generate a tree of scalar operations to reduce the critical path.
4822 SmallVector
<Register
> PartialResults
;
4823 unsigned NumPartsLeft
= NumParts
;
4824 while (NumPartsLeft
> 1) {
4825 for (unsigned Idx
= 0; Idx
< NumPartsLeft
- 1; Idx
+= 2) {
4826 PartialResults
.emplace_back(
4828 .buildInstr(ScalarOpc
, {NarrowTy
},
4829 {SplitSrcs
[Idx
], SplitSrcs
[Idx
+ 1]})
4832 SplitSrcs
= PartialResults
;
4833 PartialResults
.clear();
4834 NumPartsLeft
= SplitSrcs
.size();
4836 assert(SplitSrcs
.size() == 1);
4837 MIRBuilder
.buildCopy(DstReg
, SplitSrcs
[0]);
4838 MI
.eraseFromParent();
4841 // If we can't generate a tree, then just do sequential operations.
4842 Register Acc
= SplitSrcs
[0];
4843 for (unsigned Idx
= 1; Idx
< NumParts
; ++Idx
)
4844 Acc
= MIRBuilder
.buildInstr(ScalarOpc
, {NarrowTy
}, {Acc
, SplitSrcs
[Idx
]})
4846 MIRBuilder
.buildCopy(DstReg
, Acc
);
4847 MI
.eraseFromParent();
4850 SmallVector
<Register
> PartialReductions
;
4851 for (unsigned Part
= 0; Part
< NumParts
; ++Part
) {
4852 PartialReductions
.push_back(
4853 MIRBuilder
.buildInstr(RdxMI
.getOpcode(), {DstTy
}, {SplitSrcs
[Part
]})
4857 // If the types involved are powers of 2, we can generate intermediate vector
4858 // ops, before generating a final reduction operation.
4859 if (isPowerOf2_32(SrcTy
.getNumElements()) &&
4860 isPowerOf2_32(NarrowTy
.getNumElements())) {
4861 return tryNarrowPow2Reduction(MI
, SrcReg
, SrcTy
, NarrowTy
, ScalarOpc
);
4864 Register Acc
= PartialReductions
[0];
4865 for (unsigned Part
= 1; Part
< NumParts
; ++Part
) {
4866 if (Part
== NumParts
- 1) {
4867 MIRBuilder
.buildInstr(ScalarOpc
, {DstReg
},
4868 {Acc
, PartialReductions
[Part
]});
4871 .buildInstr(ScalarOpc
, {DstTy
}, {Acc
, PartialReductions
[Part
]})
4875 MI
.eraseFromParent();
4879 LegalizerHelper::LegalizeResult
4880 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr
&MI
,
4881 unsigned int TypeIdx
,
4883 auto [DstReg
, DstTy
, ScalarReg
, ScalarTy
, SrcReg
, SrcTy
] =
4884 MI
.getFirst3RegLLTs();
4885 if (!NarrowTy
.isScalar() || TypeIdx
!= 2 || DstTy
!= ScalarTy
||
4887 return UnableToLegalize
;
4889 assert((MI
.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
||
4890 MI
.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL
) &&
4891 "Unexpected vecreduce opcode");
4892 unsigned ScalarOpc
= MI
.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
4893 ? TargetOpcode::G_FADD
4894 : TargetOpcode::G_FMUL
;
4896 SmallVector
<Register
> SplitSrcs
;
4897 unsigned NumParts
= SrcTy
.getNumElements();
4898 extractParts(SrcReg
, NarrowTy
, NumParts
, SplitSrcs
, MIRBuilder
, MRI
);
4899 Register Acc
= ScalarReg
;
4900 for (unsigned i
= 0; i
< NumParts
; i
++)
4901 Acc
= MIRBuilder
.buildInstr(ScalarOpc
, {NarrowTy
}, {Acc
, SplitSrcs
[i
]})
4904 MIRBuilder
.buildCopy(DstReg
, Acc
);
4905 MI
.eraseFromParent();
4909 LegalizerHelper::LegalizeResult
4910 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr
&MI
, Register SrcReg
,
4911 LLT SrcTy
, LLT NarrowTy
,
4912 unsigned ScalarOpc
) {
4913 SmallVector
<Register
> SplitSrcs
;
4914 // Split the sources into NarrowTy size pieces.
4915 extractParts(SrcReg
, NarrowTy
,
4916 SrcTy
.getNumElements() / NarrowTy
.getNumElements(), SplitSrcs
,
4918 // We're going to do a tree reduction using vector operations until we have
4919 // one NarrowTy size value left.
4920 while (SplitSrcs
.size() > 1) {
4921 SmallVector
<Register
> PartialRdxs
;
4922 for (unsigned Idx
= 0; Idx
< SplitSrcs
.size()-1; Idx
+= 2) {
4923 Register LHS
= SplitSrcs
[Idx
];
4924 Register RHS
= SplitSrcs
[Idx
+ 1];
4925 // Create the intermediate vector op.
4927 MIRBuilder
.buildInstr(ScalarOpc
, {NarrowTy
}, {LHS
, RHS
}).getReg(0);
4928 PartialRdxs
.push_back(Res
);
4930 SplitSrcs
= std::move(PartialRdxs
);
4932 // Finally generate the requested NarrowTy based reduction.
4933 Observer
.changingInstr(MI
);
4934 MI
.getOperand(1).setReg(SplitSrcs
[0]);
4935 Observer
.changedInstr(MI
);
4939 LegalizerHelper::LegalizeResult
4940 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr
&MI
, const APInt
&Amt
,
4941 const LLT HalfTy
, const LLT AmtTy
) {
4943 Register InL
= MRI
.createGenericVirtualRegister(HalfTy
);
4944 Register InH
= MRI
.createGenericVirtualRegister(HalfTy
);
4945 MIRBuilder
.buildUnmerge({InL
, InH
}, MI
.getOperand(1));
4948 MIRBuilder
.buildMergeLikeInstr(MI
.getOperand(0), {InL
, InH
});
4949 MI
.eraseFromParent();
4954 unsigned NVTBits
= HalfTy
.getSizeInBits();
4955 unsigned VTBits
= 2 * NVTBits
;
4957 SrcOp
Lo(Register(0)), Hi(Register(0));
4958 if (MI
.getOpcode() == TargetOpcode::G_SHL
) {
4959 if (Amt
.ugt(VTBits
)) {
4960 Lo
= Hi
= MIRBuilder
.buildConstant(NVT
, 0);
4961 } else if (Amt
.ugt(NVTBits
)) {
4962 Lo
= MIRBuilder
.buildConstant(NVT
, 0);
4963 Hi
= MIRBuilder
.buildShl(NVT
, InL
,
4964 MIRBuilder
.buildConstant(AmtTy
, Amt
- NVTBits
));
4965 } else if (Amt
== NVTBits
) {
4966 Lo
= MIRBuilder
.buildConstant(NVT
, 0);
4969 Lo
= MIRBuilder
.buildShl(NVT
, InL
, MIRBuilder
.buildConstant(AmtTy
, Amt
));
4971 MIRBuilder
.buildShl(NVT
, InH
, MIRBuilder
.buildConstant(AmtTy
, Amt
));
4972 auto OrRHS
= MIRBuilder
.buildLShr(
4973 NVT
, InL
, MIRBuilder
.buildConstant(AmtTy
, -Amt
+ NVTBits
));
4974 Hi
= MIRBuilder
.buildOr(NVT
, OrLHS
, OrRHS
);
4976 } else if (MI
.getOpcode() == TargetOpcode::G_LSHR
) {
4977 if (Amt
.ugt(VTBits
)) {
4978 Lo
= Hi
= MIRBuilder
.buildConstant(NVT
, 0);
4979 } else if (Amt
.ugt(NVTBits
)) {
4980 Lo
= MIRBuilder
.buildLShr(NVT
, InH
,
4981 MIRBuilder
.buildConstant(AmtTy
, Amt
- NVTBits
));
4982 Hi
= MIRBuilder
.buildConstant(NVT
, 0);
4983 } else if (Amt
== NVTBits
) {
4985 Hi
= MIRBuilder
.buildConstant(NVT
, 0);
4987 auto ShiftAmtConst
= MIRBuilder
.buildConstant(AmtTy
, Amt
);
4989 auto OrLHS
= MIRBuilder
.buildLShr(NVT
, InL
, ShiftAmtConst
);
4990 auto OrRHS
= MIRBuilder
.buildShl(
4991 NVT
, InH
, MIRBuilder
.buildConstant(AmtTy
, -Amt
+ NVTBits
));
4993 Lo
= MIRBuilder
.buildOr(NVT
, OrLHS
, OrRHS
);
4994 Hi
= MIRBuilder
.buildLShr(NVT
, InH
, ShiftAmtConst
);
4997 if (Amt
.ugt(VTBits
)) {
4998 Hi
= Lo
= MIRBuilder
.buildAShr(
4999 NVT
, InH
, MIRBuilder
.buildConstant(AmtTy
, NVTBits
- 1));
5000 } else if (Amt
.ugt(NVTBits
)) {
5001 Lo
= MIRBuilder
.buildAShr(NVT
, InH
,
5002 MIRBuilder
.buildConstant(AmtTy
, Amt
- NVTBits
));
5003 Hi
= MIRBuilder
.buildAShr(NVT
, InH
,
5004 MIRBuilder
.buildConstant(AmtTy
, NVTBits
- 1));
5005 } else if (Amt
== NVTBits
) {
5007 Hi
= MIRBuilder
.buildAShr(NVT
, InH
,
5008 MIRBuilder
.buildConstant(AmtTy
, NVTBits
- 1));
5010 auto ShiftAmtConst
= MIRBuilder
.buildConstant(AmtTy
, Amt
);
5012 auto OrLHS
= MIRBuilder
.buildLShr(NVT
, InL
, ShiftAmtConst
);
5013 auto OrRHS
= MIRBuilder
.buildShl(
5014 NVT
, InH
, MIRBuilder
.buildConstant(AmtTy
, -Amt
+ NVTBits
));
5016 Lo
= MIRBuilder
.buildOr(NVT
, OrLHS
, OrRHS
);
5017 Hi
= MIRBuilder
.buildAShr(NVT
, InH
, ShiftAmtConst
);
5021 MIRBuilder
.buildMergeLikeInstr(MI
.getOperand(0), {Lo
, Hi
});
5022 MI
.eraseFromParent();
// TODO: Optimize if constant shift amount.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});     // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
                                       LLT MoreTy) {
  assert(TypeIdx == 0 && "Expecting only Idx 0");

  Observer.changingInstr(MI);
  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
    MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
    moreElementsVectorSrc(MI, MoreTy, I);
  }

  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
  moreElementsVectorDst(MI, MoreTy, 0);
  Observer.changedInstr(MI);
  return Legalized;
}
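
// moreElementsVector widens a vector operation to a larger, legal number of
// elements: source operands are padded (typically with undef lanes) and the
// extra lanes of the result are dropped again, so they never carry meaningful
// values.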
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                    LLT MoreTy) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_IMPLICIT_DEF:
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_UADDSAT:
  case TargetOpcode::G_USUBSAT:
  case TargetOpcode::G_SADDSAT:
  case TargetOpcode::G_SSUBSAT:
  case TargetOpcode::G_SMIN:
  case TargetOpcode::G_SMAX:
  case TargetOpcode::G_UMIN:
  case TargetOpcode::G_UMAX:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_FMINIMUM:
  case TargetOpcode::G_FMAXIMUM:
  case TargetOpcode::G_STRICT_FADD:
  case TargetOpcode::G_STRICT_FSUB:
  case TargetOpcode::G_STRICT_FMUL:
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_STRICT_FMA:
  case TargetOpcode::G_FSHR:
  case TargetOpcode::G_FSHL: {
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_EXTRACT:
    if (TypeIdx != 1)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_INSERT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FREEZE:
  case TargetOpcode::G_FNEG:
  case TargetOpcode::G_FABS:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_BSWAP:
  case TargetOpcode::G_FCANONICALIZE:
  case TargetOpcode::G_SEXT_INREG:
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  case TargetOpcode::G_SELECT: {
    auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
    if (TypeIdx == 1) {
      if (!CondTy.isScalar() ||
          DstTy.getElementCount() != MoreTy.getElementCount())
        return UnableToLegalize;

      // This is turning a scalar select of vectors into a vector
      // select. Broadcast the select condition.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
      Observer.changingInstr(MI);
      MI.getOperand(1).setReg(ShufSplat.getReg(0));
      Observer.changedInstr(MI);
      return Legalized;
    }

    if (CondTy.isVector())
      return UnableToLegalize;

    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_UNMERGE_VALUES:
    return UnableToLegalize;
  case TargetOpcode::G_PHI:
    return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
  case TargetOpcode::G_BUILD_VECTOR: {
    SmallVector<SrcOp, 8> Elts;
    for (auto Op : MI.uses()) {
      Elts.push_back(Op.getReg());
    }

    for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
      Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
    }

    MIRBuilder.buildDeleteTrailingVectorElements(
        MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_TRUNC:
  case TargetOpcode::G_FPTRUNC:
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    Observer.changingInstr(MI);
    LLT SrcTy = LLT::fixed_vector(
        MoreTy.getNumElements(),
        MRI.getType(MI.getOperand(1).getReg()).getElementType());
    moreElementsVectorSrc(MI, SrcTy, 1);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_ICMP: {
    // TODO: the symmetric MoreTy works for targets like, e.g. NEON.
    // For targets, like e.g. MVE, the result is a predicated vector (i1).
    // This will need some refactoring.
    Observer.changingInstr(MI);
    moreElementsVectorSrc(MI, MoreTy, 2);
    moreElementsVectorSrc(MI, MoreTy, 3);
    moreElementsVectorDst(MI, MoreTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  default:
    return UnableToLegalize;
  }
}
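
// equalizeVectorShuffleLengths handles a G_SHUFFLE_VECTOR whose result and
// source element counts differ. A mask shorter than the sources is padded
// with -1 (undef) lanes; a longer mask is handled by concatenating each source
// with undef vectors up to a multiple of its length and remapping the indices
// that referred to the second source.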
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(Mask);
    for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
      NewMask.push_back(-1);

    moreElementsVectorDst(MI, SrcTy, 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                  MI.getOperand(1).getReg(),
                                  MI.getOperand(2).getReg(), NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
  MOps1[0] = MI.getOperand(1).getReg();
  MOps2[0] = MI.getOperand(2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);

  // Readjust mask for new input vector length.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);

    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
              .getReg(0);
    }
    MIRBuilder.buildBuildVector(DstReg, Elts);
  } else {
    MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  if (DstTy.isVector() && Src1Ty.isVector() &&
      DstTy.getNumElements() != Src1Ty.getNumElements()) {
    return equalizeVectorShuffleLengths(MI);
  }

  if (TypeIdx != 0)
    return UnableToLegalize;

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length.
  SmallVector<int, 16> NewMask;
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask.push_back(Idx);
    else
      NewMask.push_back(Idx - NumElts + WidenNumElts);
  }
  for (unsigned I = NumElts; I != WidenNumElts; ++I)
    NewMask.push_back(-1);
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}
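
// multiplyRegisters builds a schoolbook multiply over NarrowTy-sized limbs:
// each destination limb sums the low halves of the relevant cross products
// (G_MUL), the high halves from the previous column (G_UMULH), and the carry
// accumulated so far (G_UADDO / G_ZEXT / G_ADD). For two limbs, for example:
//   Dst[0] = lo(Src1[0] * Src2[0])
//   Dst[1] = lo(Src1[1] * Src2[0]) + lo(Src1[0] * Src2[1])
//            + hi(Src1[0] * Src2[0])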
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  unsigned CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
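
// narrowScalarAddSub splits a wide add/sub (with or without carries) into a
// chain of NarrowTy pieces: the lowest piece uses the overflow form (G_UADDO /
// G_USUBO), the middle pieces thread the carry through G_UADDE / G_USUBE, and
// the final piece switches to the signed variant when the original opcode was
// a signed-overflow operation so the top carry-out keeps its meaning.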
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
               MIRBuilder, MRI);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
               MRI);

  int NarrowParts = Src1Regs.size();
  for (int I = 0, E = Src1Left.size(); I != E; ++I) {
    Src1Regs.push_back(Src1Left[I]);
    Src2Regs.push_back(Src2Left[I]);
  }
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;

    if (!CarryIn) {
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              ArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
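
// narrowScalarMul reuses multiplyRegisters: a plain G_MUL keeps the low
// NumParts limbs, while G_UMULH computes twice as many limbs and keeps only
// the high half.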
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
  auto [DstReg, Src1, Src2] = MI.getFirst3Regs();

  LLT Ty = MRI.getType(DstReg);
  if (Ty.isVector())
    return UnableToLegalize;

  unsigned Size = Ty.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  if (Size % NarrowSize != 0)
    return UnableToLegalize;

  unsigned NumParts = Size / NarrowSize;
  bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
  unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);

  SmallVector<Register, 2> Src1Parts, Src2Parts;
  SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
  extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
  extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
  multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);

  // Take only high half of registers if this is high mul.
  ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
  MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;

  Register Src = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(Src);

  // If all finite floats fit into the narrowed integer type, we can just swap
  // out the result type. This is practically only useful for conversions from
  // half to at least 16-bits, so just handle the one case.
  if (SrcTy.getScalarType() != LLT::scalar(16) ||
      NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
    return UnableToLegalize;

  Observer.changingInstr(MI);
  narrowScalarDst(MI, NarrowTy, 0,
                  IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
  Observer.changedInstr(MI);
  return Legalized;
}
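
// narrowScalarExtract splits the source of a G_EXTRACT into NarrowTy pieces
// and rebuilds the result from the pieces that overlap the extracted range,
// emitting a smaller G_EXTRACT only where a piece is partially covered.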
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
               MIRBuilder, MRI);

  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  SmallVector<uint64_t, 2> Indexes;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs, MIRBuilder, MRI);

  for (Register Reg : LeftoverRegs)
    SrcRegs.push_back(Reg);

  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
          std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
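
// narrowScalarBasic handles simple two-operand, single-result operations by
// applying the same opcode to each NarrowTy part (plus one leftover part of a
// smaller type if the width does not divide evenly) and re-merging the
// results.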
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  assert(MI.getNumOperands() == 3 && TypeIdx == 0);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
                                      {Src0Regs[I], Src1Regs[I]});
    DstRegs.push_back(Inst.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Inst = MIRBuilder.buildInstr(
        MI.getOpcode(),
        {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
    DstLeftoverRegs.push_back(Inst.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
                                 LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, SrcReg] = MI.getFirst2Regs();

  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  SmallVector<Register, 8> Parts;
  LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
  LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
  buildWidenedRemergeToDst(DstReg, LCMTy, Parts);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
        LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
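
// narrowScalarCTLZ handles a source of exactly twice the narrow width using
// ctlz(Hi:Lo) = Hi == 0 ? NarrowSize + ctlz(Lo) : ctlz(Hi); e.g. counting
// leading zeros of an s64 value with s32 pieces adds 32 when the high half is
// zero.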
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(1), C_0);
    auto LoCTLZ = IsUndef ?
      B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
      B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
    auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
    B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
                                   LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));

    auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
    auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
    MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
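
// narrowScalarFLDEXP narrows only the integer exponent operand of G_FLDEXP:
// the exponent is first clamped to the signed range of the narrow type with
// G_SMAX/G_SMIN and then truncated, so the truncation cannot wrap around.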
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  MachineIRBuilder &B = MIRBuilder;
  Register ExpReg = MI.getOperand(2).getReg();
  LLT ExpTy = MRI.getType(ExpReg);

  unsigned ClampSize = NarrowTy.getScalarSizeInBits();

  // Clamp the exponent to the range of the target type.
  auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
  auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
  auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
  auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);

  auto Trunc = B.buildTrunc(NarrowTy, Clamp);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(Trunc.getReg(0));
  Observer.changedInstr(MI);
  return Legalized;
}
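
// lowerBitCount expands G_CTLZ/G_CTTZ/G_CTPOP (and their ZERO_UNDEF forms) in
// terms of whichever related operations the target reports as Legal, Libcall,
// or Custom, falling back to the pure shift/mask sequences from Hacker's
// Delight when none of them are usable.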
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }
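  // A worked example of the block-wise CTPOP expansion below for an s8 value
  // 0xDA (0b11011010, five bits set):
  //   B2Count = 0xDA - ((0xDA >> 1) & 0x55) = 0x95  (pair counts 2,1,1,1)
  //   B4Count = (0x95 & 0x33) + ((0x95 >> 2) & 0x33) = 0x32  (nibble counts 3,2)
  //   B8Count = (0x32 + (0x32 >> 4)) & 0x0F = 0x05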
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
    auto ResTmp = B.buildMul(Ty, B8Count, MulMask);

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
    B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);

    MI.eraseFromParent();
    return Legalized;
  }
  }
}
// Check that (every element of) Reg is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
                                        Register Reg, unsigned BW) {
  return matchUnaryPredicate(
      MRI, Reg,
      [=](const Constant *C) {
        // Null constant here means an undef.
        const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
        return !CI || CI->getValue().urem(BW) != 0;
      },
      /*AllowUndefs*/ true);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  unsigned BW = Ty.getScalarSizeInBits();

  if (!isPowerOf2_32(BW))
    return UnableToLegalize;

  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl X, Y, Z -> fshr X, Y, -Z
    // fshr X, Y, Z -> fshl X, Y, -Z
    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
    Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
  } else {
    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
    } else {
      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
    }

    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
  }

  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  MIRBuilder.buildOr(Dst, ShX, ShY);
  MI.eraseFromParent();
  return Legalized;
}
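
// lowerFunnelShift picks between the two expansions above: if the target would
// anyway Lower the opposite funnel shift, go straight to the plain shift/or
// sequence; otherwise try the inverse-opcode rewrite first (it needs a
// power-of-2 bit width) and fall back to shifts if that is not possible.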
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
  // These operations approximately do the following (while avoiding undefined
  // shifts by BW):
  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());

  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;

  // TODO: Use smarter heuristic that accounts for vector legalization.
  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
    return lowerFunnelShiftAsShifts(MI);

  // This only works for powers of 2, fallback to shifts if it fails.
  LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
  if (Result == UnableToLegalize)
    return lowerFunnelShiftAsShifts(MI);
  return Result;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
      !isPowerOf2_32(SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
    // Unmerge the vector
    LLT EltTy = MidTy.changeElementCount(
        MidTy.getElementCount().divideCoefficientBy(2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);

    // ZExt the vectors
    LLT ZExtResTy = DstTy.changeElementCount(
        DstTy.getElementCount().divideCoefficientBy(2));
    auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(1)});

    // Merge the ending vectors
    MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  // MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectiondDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);

  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
        SrcTy.getElementCount().divideCoefficientBy(2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements.
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
    for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
      SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
    }

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);

    // Truncate the new vector to the final result type
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
    else
      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
  MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  if (SrcTy == LLT::scalar(1)) {
    auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != LLT::scalar(64))
    return UnableToLegalize;

  if (DstTy == LLT::scalar(32)) {
    // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonble depending on the available instructions. If a target
    // has sitofp, does not have CTLZ, or can efficiently use f64 as an
    // intermediate type, this is probably worse.
    return lowerU64ToF32BitOps(MI);
  }

  return UnableToLegalize;
}
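
// For the s64 -> s32 case below, the signed conversion is folded onto the
// unsigned path: with s = l >> 63 (all ones for negative l), (l + s) ^ s is
// the two's complement absolute value (modulo the INT64_MIN wraparound), so
// the result is cul2f(|l|) with the sign reapplied by the final select.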
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  if (SrcTy == S1) {
    auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
    auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
    MIRBuilder.buildSelect(Dst, Src, True, False);
    MI.eraseFromParent();
    return Legalized;
  }

  if (SrcTy != S64)
    return UnableToLegalize;

  if (DstTy == S32) {
    // signed cl2f(long l) {
    //   long s = l >> 63;
    //   float r = cul2f((l + s) ^ s);
    //   return s ? -r : r;
    // }
    Register L = Src;
    auto SignBit = MIRBuilder.buildConstant(S64, 63);
    auto S = MIRBuilder.buildAShr(S64, L, SignBit);

    auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
    auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
    auto R = MIRBuilder.buildUITOFP(S32, Xor);

    auto RNeg = MIRBuilder.buildFNeg(S32, R);
    auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
                                            MIRBuilder.buildConstant(S64, 0));
    MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}
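// The f32 -> i64 expansion below decodes the float manually: it isolates the
// exponent and mantissa fields, restores the implicit leading one bit, shifts
// the mantissa left or right depending on whether the unbiased exponent is
// above or below the mantissa width (23), and applies the sign with the
// xor/subtract idiom. Negative exponents produce 0.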
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
    MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
  auto [DstTy, SrcTy] = MI.getFirst2LLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S16 = LLT::scalar(16);

  if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
    return lowerFPTRUNC_F64_TO_F16(MI);

  return UnableToLegalize;
}
// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

  auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
  MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}
6894 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMinMax(MachineInstr
&MI
) {
6895 auto [Dst
, Src0
, Src1
] = MI
.getFirst3Regs();
6897 const CmpInst::Predicate Pred
= minMaxToCompare(MI
.getOpcode());
6898 LLT CmpType
= MRI
.getType(Dst
).changeElementSize(1);
6900 auto Cmp
= MIRBuilder
.buildICmp(Pred
, CmpType
, Src0
, Src1
);
6901 MIRBuilder
.buildSelect(Dst
, Cmp
, Src0
, Src1
);
6903 MI
.eraseFromParent();
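// G_FCOPYSIGN is lowered with integer bit operations: the magnitude bits of
// Src0 are combined with the sign bit of Src1. When the operands have
// different widths, Src1's sign bit is first moved into Src0's sign position
// with a zext/shl or lshr/trunc before the final or.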
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  auto SignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getSignMask(Src0Size));

  auto NotSignBitMask = MIRBuilder.buildConstant(
      Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();
  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
    TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;

  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // behavior.
    //
    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-snan instruction as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}
6974 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMad(MachineInstr
&MI
) {
6975 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6976 Register DstReg
= MI
.getOperand(0).getReg();
6977 LLT Ty
= MRI
.getType(DstReg
);
6978 unsigned Flags
= MI
.getFlags();
6980 auto Mul
= MIRBuilder
.buildFMul(Ty
, MI
.getOperand(1), MI
.getOperand(2),
6982 MIRBuilder
.buildFAdd(DstReg
, Mul
, MI
.getOperand(3), Flags
);
6983 MI
.eraseFromParent();
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  auto [DstReg, X] = MI.getFirst2Regs();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
  //  return t + o;

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);

  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  auto Cmp =
      MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);

  // Could emit G_UITOFP instead
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
  auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);

  MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);

  MI.eraseFromParent();
  return Legalized;
}
7021 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFFloor(MachineInstr
&MI
) {
7022 auto [DstReg
, SrcReg
] = MI
.getFirst2Regs();
7023 unsigned Flags
= MI
.getFlags();
7024 LLT Ty
= MRI
.getType(DstReg
);
7025 const LLT CondTy
= Ty
.changeElementSize(1);
7027 // result = trunc(src);
7028 // if (src < 0.0 && src != result)
7031 auto Trunc
= MIRBuilder
.buildIntrinsicTrunc(Ty
, SrcReg
, Flags
);
7032 auto Zero
= MIRBuilder
.buildFConstant(Ty
, 0.0);
7034 auto Lt0
= MIRBuilder
.buildFCmp(CmpInst::FCMP_OLT
, CondTy
,
7035 SrcReg
, Zero
, Flags
);
7036 auto NeTrunc
= MIRBuilder
.buildFCmp(CmpInst::FCMP_ONE
, CondTy
,
7037 SrcReg
, Trunc
, Flags
);
7038 auto And
= MIRBuilder
.buildAnd(CondTy
, Lt0
, NeTrunc
);
7039 auto AddVal
= MIRBuilder
.buildSITOFP(Ty
, And
);
7041 MIRBuilder
.buildFAdd(DstReg
, Trunc
, AddVal
, Flags
);
7042 MI
.eraseFromParent();
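// G_MERGE_VALUES of scalars is lowered to zext/shl/or into a wide integer.
// For example, merging two s16 pieces into an s32 produces
// zext(Src0) | (zext(Src1) << 16).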
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
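// The inverse of the merge lowering above: each destination piece is a
// truncate of the scalarized source shifted right by I * DstSize bits, so
// unmerging an s32 into two s16 pieces yields trunc(Src) and trunc(Src >> 16).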
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
///  =>
///  %stack_temp = G_FRAME_INDEX
///  G_STORE %vec, %stack_temp
///  %idx = clamp(%idx, %vec.getNumElements())
///  %element_ptr = G_PTR_ADD %stack_temp, %idx
///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  unsigned NumElts = VecTy.getNumElements();

  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
    SmallVector<Register, 8> SrcRegs;
    extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);

    if (InsertVal) {
      SrcRegs[IdxVal] = MI.getOperand(2).getReg();
      MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
    } else {
      MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(
      TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  if (DstTy.isScalar())
    MIRBuilder.buildCopy(DstReg, BuildVec[0]);
  else
    MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}
Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
                                                    Register AllocSize,
                                                    Align Alignment,
                                                    LLT PtrTy) {
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();

  Register SPTmp =
      getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);

  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerStackSave(MachineInstr &MI) {
  Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
  if (!StackPtr)
    return UnableToLegalize;

  MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
  Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
  if (!StackPtr)
    return UnableToLegalize;

  MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
  MI.eraseFromParent();
  return Legalized;
}
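// G_EXTRACT is lowered either by unmerging whole source elements and
// re-merging the requested range, or, for scalar results, as a bitfield
// extract: an optional bitcast to an integer, a logical shift right by the
// bit offset, and a truncate to the destination width.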
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(DstReg, SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(DstReg, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
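// G_INSERT on scalars becomes a mask/shift/or bitfield insert. For example,
// inserting an s8 at bit offset 8 of an s32 computes
// (Src & 0xFFFF00FF) | (zext(InsertSrc) << 8).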
7358 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerInsert(MachineInstr
&MI
) {
7359 auto [Dst
, Src
, InsertSrc
] = MI
.getFirst3Regs();
7360 uint64_t Offset
= MI
.getOperand(3).getImm();
7362 LLT DstTy
= MRI
.getType(Src
);
7363 LLT InsertTy
= MRI
.getType(InsertSrc
);
7365 // Insert sub-vector or one element
7366 if (DstTy
.isVector() && !InsertTy
.isPointer()) {
7367 LLT EltTy
= DstTy
.getElementType();
7368 unsigned EltSize
= EltTy
.getSizeInBits();
7369 unsigned InsertSize
= InsertTy
.getSizeInBits();
7371 if ((Offset
% EltSize
== 0) && (InsertSize
% EltSize
== 0) &&
7372 (Offset
+ InsertSize
<= DstTy
.getSizeInBits())) {
7373 auto UnmergeSrc
= MIRBuilder
.buildUnmerge(EltTy
, Src
);
7374 SmallVector
<Register
, 8> DstElts
;
7376 // Elements from Src before insert start Offset
7377 for (; Idx
< Offset
/ EltSize
; ++Idx
) {
7378 DstElts
.push_back(UnmergeSrc
.getReg(Idx
));
7381 // Replace elements in Src with elements from InsertSrc
7382 if (InsertTy
.getSizeInBits() > EltSize
) {
7383 auto UnmergeInsertSrc
= MIRBuilder
.buildUnmerge(EltTy
, InsertSrc
);
7384 for (unsigned i
= 0; Idx
< (Offset
+ InsertSize
) / EltSize
;
7386 DstElts
.push_back(UnmergeInsertSrc
.getReg(i
));
7389 DstElts
.push_back(InsertSrc
);
7393 // Remaining elements from Src after insert
7394 for (; Idx
< DstTy
.getNumElements(); ++Idx
) {
7395 DstElts
.push_back(UnmergeSrc
.getReg(Idx
));
7398 MIRBuilder
.buildMergeLikeInstr(Dst
, DstElts
);
7399 MI
.eraseFromParent();
7404 if (InsertTy
.isVector() ||
7405 (DstTy
.isVector() && DstTy
.getElementType() != InsertTy
))
7406 return UnableToLegalize
;
7408 const DataLayout
&DL
= MIRBuilder
.getDataLayout();
7409 if ((DstTy
.isPointer() &&
7410 DL
.isNonIntegralAddressSpace(DstTy
.getAddressSpace())) ||
7411 (InsertTy
.isPointer() &&
7412 DL
.isNonIntegralAddressSpace(InsertTy
.getAddressSpace()))) {
7413 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
7414 return UnableToLegalize
;
7417 LLT IntDstTy
= DstTy
;
7419 if (!DstTy
.isScalar()) {
7420 IntDstTy
= LLT::scalar(DstTy
.getSizeInBits());
7421 Src
= MIRBuilder
.buildCast(IntDstTy
, Src
).getReg(0);
7424 if (!InsertTy
.isScalar()) {
7425 const LLT IntInsertTy
= LLT::scalar(InsertTy
.getSizeInBits());
7426 InsertSrc
= MIRBuilder
.buildPtrToInt(IntInsertTy
, InsertSrc
).getReg(0);
7429 Register ExtInsSrc
= MIRBuilder
.buildZExt(IntDstTy
, InsertSrc
).getReg(0);
7431 auto ShiftAmt
= MIRBuilder
.buildConstant(IntDstTy
, Offset
);
7432 ExtInsSrc
= MIRBuilder
.buildShl(IntDstTy
, ExtInsSrc
, ShiftAmt
).getReg(0);
7435 APInt MaskVal
= APInt::getBitsSetWithWrap(
7436 DstTy
.getSizeInBits(), Offset
+ InsertTy
.getSizeInBits(), Offset
);
7438 auto Mask
= MIRBuilder
.buildConstant(IntDstTy
, MaskVal
);
7439 auto MaskedSrc
= MIRBuilder
.buildAnd(IntDstTy
, Src
, Mask
);
7440 auto Or
= MIRBuilder
.buildOr(IntDstTy
, MaskedSrc
, ExtInsSrc
);
7442 MIRBuilder
.buildCast(Dst
, Or
);
7443 MI
.eraseFromParent();
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
  auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
      MI.getFirst4RegLLTs();
  const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;

  LLT Ty = Dst0Ty;
  LLT BoolTy = Dst1Ty;

  if (IsAdd)
    MIRBuilder.buildAdd(Dst0, LHS, RHS);
  else
    MIRBuilder.buildSub(Dst0, LHS, RHS);

  // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.

  auto Zero = MIRBuilder.buildConstant(Ty, 0);

  // For an addition, the result should be less than one of the operands (LHS)
  // if and only if the other operand (RHS) is negative, otherwise there will
  // be overflow.
  // For a subtraction, the result should be less than one of the operands
  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
  // otherwise there will be overflow.
  auto ResultLowerThanLHS =
      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
  auto ConditionRHS = MIRBuilder.buildICmp(
      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);

  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}
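// The byte swap below first exchanges the most and least significant bytes,
// then fixes up the interior bytes pairwise; for an s32 input 0xAABBCCDD the
// emitted shifts and masks produce 0xDDCCBBAA.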
7654 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBswap(MachineInstr
&MI
) {
7655 auto [Dst
, Src
] = MI
.getFirst2Regs();
7656 const LLT Ty
= MRI
.getType(Src
);
7657 unsigned SizeInBytes
= (Ty
.getScalarSizeInBits() + 7) / 8;
7658 unsigned BaseShiftAmt
= (SizeInBytes
- 1) * 8;
7660 // Swap most and least significant byte, set remaining bytes in Res to zero.
7661 auto ShiftAmt
= MIRBuilder
.buildConstant(Ty
, BaseShiftAmt
);
7662 auto LSByteShiftedLeft
= MIRBuilder
.buildShl(Ty
, Src
, ShiftAmt
);
7663 auto MSByteShiftedRight
= MIRBuilder
.buildLShr(Ty
, Src
, ShiftAmt
);
7664 auto Res
= MIRBuilder
.buildOr(Ty
, MSByteShiftedRight
, LSByteShiftedLeft
);
7666 // Set i-th high/low byte in Res to i-th low/high byte from Src.
7667 for (unsigned i
= 1; i
< SizeInBytes
/ 2; ++i
) {
7668 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7669 APInt
APMask(SizeInBytes
* 8, 0xFF << (i
* 8));
7670 auto Mask
= MIRBuilder
.buildConstant(Ty
, APMask
);
7671 auto ShiftAmt
= MIRBuilder
.buildConstant(Ty
, BaseShiftAmt
- 16 * i
);
7672 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7673 auto LoByte
= MIRBuilder
.buildAnd(Ty
, Src
, Mask
);
7674 auto LoShiftedLeft
= MIRBuilder
.buildShl(Ty
, LoByte
, ShiftAmt
);
7675 Res
= MIRBuilder
.buildOr(Ty
, Res
, LoShiftedLeft
);
7676 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7677 auto SrcShiftedRight
= MIRBuilder
.buildLShr(Ty
, Src
, ShiftAmt
);
7678 auto HiShiftedRight
= MIRBuilder
.buildAnd(Ty
, SrcShiftedRight
, Mask
);
7679 Res
= MIRBuilder
.buildOr(Ty
, Res
, HiShiftedRight
);
7681 Res
.getInstr()->getOperand(0).setReg(Dst
);
7683 MI
.eraseFromParent();
//{ (Src & Mask) >> N } | { (Src << N) & Mask }
static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
                                 MachineInstrBuilder Src, APInt Mask) {
  const LLT Ty = Dst.getLLTTy(*B.getMRI());
  MachineInstrBuilder C_N = B.buildConstant(Ty, N);
  MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
  auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
  auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
  return B.buildOr(Dst, LHS, RHS);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getSizeInBits();

  MachineInstrBuilder BSWAP =
      MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

  // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
  //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
  // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
  MachineInstrBuilder Swap4 =
      SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

  // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
  //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
  // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
  MachineInstrBuilder Swap2 =
      SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

  // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
  //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
  // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
  SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));

  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  const MDString *RegStr = cast<MDString>(
    cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg.isValid())
    return UnableToLegalize;

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}
7755 LegalizerHelper::lowerSMULH_UMULH(MachineInstr
&MI
) {
7756 bool IsSigned
= MI
.getOpcode() == TargetOpcode::G_SMULH
;
7757 unsigned ExtOp
= IsSigned
? TargetOpcode::G_SEXT
: TargetOpcode::G_ZEXT
;
7758 Register Result
= MI
.getOperand(0).getReg();
7759 LLT OrigTy
= MRI
.getType(Result
);
7760 auto SizeInBits
= OrigTy
.getScalarSizeInBits();
7761 LLT WideTy
= OrigTy
.changeElementSize(SizeInBits
* 2);
7763 auto LHS
= MIRBuilder
.buildInstr(ExtOp
, {WideTy
}, {MI
.getOperand(1)});
7764 auto RHS
= MIRBuilder
.buildInstr(ExtOp
, {WideTy
}, {MI
.getOperand(2)});
7765 auto Mul
= MIRBuilder
.buildMul(WideTy
, LHS
, RHS
);
7766 unsigned ShiftOp
= IsSigned
? TargetOpcode::G_ASHR
: TargetOpcode::G_LSHR
;
7768 auto ShiftAmt
= MIRBuilder
.buildConstant(WideTy
, SizeInBits
);
7769 auto Shifted
= MIRBuilder
.buildInstr(ShiftOp
, {WideTy
}, {Mul
, ShiftAmt
});
7770 MIRBuilder
.buildTrunc(Result
, Shifted
);
7772 MI
.eraseFromParent();
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());

  if (Mask == fcNone) {
    MIRBuilder.buildConstant(DstReg, 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(DstReg, 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version.

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());

  LLT IntTy = LLT::scalar(BitSize);
  if (SrcTy.isVector())
    IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
  auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);

  APInt SignBit = APInt::getSignMask(BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
  auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);

  auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);

  auto Res = MIRBuilder.buildConstant(DstTy, 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                     ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
                                     ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                    ExpMaskC);
    auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       ExpBits, ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(IntTy, 1);
    auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
                             MIRBuilder.buildConstant(IntTy, AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
                                       InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
                                            Abs, InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                             MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
    else if (PartialCheck == fcPosNormal) {
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(DstReg, Res);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::lowerSelect(MachineInstr
&MI
) {
7951 // Implement vector G_SELECT in terms of XOR, AND, OR.
7952 auto [DstReg
, DstTy
, MaskReg
, MaskTy
, Op1Reg
, Op1Ty
, Op2Reg
, Op2Ty
] =
7953 MI
.getFirst4RegLLTs();
7954 if (!DstTy
.isVector())
7955 return UnableToLegalize
;
7957 bool IsEltPtr
= DstTy
.getElementType().isPointer();
7959 LLT ScalarPtrTy
= LLT::scalar(DstTy
.getScalarSizeInBits());
7960 LLT NewTy
= DstTy
.changeElementType(ScalarPtrTy
);
7961 Op1Reg
= MIRBuilder
.buildPtrToInt(NewTy
, Op1Reg
).getReg(0);
7962 Op2Reg
= MIRBuilder
.buildPtrToInt(NewTy
, Op2Reg
).getReg(0);
7966 if (MaskTy
.isScalar()) {
7967 // Turn the scalar condition into a vector condition mask.
7969 Register MaskElt
= MaskReg
;
7971 // The condition was potentially zero extended before, but we want a sign
7972 // extended boolean.
7973 if (MaskTy
!= LLT::scalar(1))
7974 MaskElt
= MIRBuilder
.buildSExtInReg(MaskTy
, MaskElt
, 1).getReg(0);
7976 // Continue the sign extension (or truncate) to match the data type.
7977 MaskElt
= MIRBuilder
.buildSExtOrTrunc(DstTy
.getElementType(),
7980 // Generate a vector splat idiom.
7981 auto ShufSplat
= MIRBuilder
.buildShuffleSplat(DstTy
, MaskElt
);
7982 MaskReg
= ShufSplat
.getReg(0);
7986 if (MaskTy
.getSizeInBits() != DstTy
.getSizeInBits()) {
7987 return UnableToLegalize
;
7990 auto NotMask
= MIRBuilder
.buildNot(MaskTy
, MaskReg
);
7991 auto NewOp1
= MIRBuilder
.buildAnd(MaskTy
, Op1Reg
, MaskReg
);
7992 auto NewOp2
= MIRBuilder
.buildAnd(MaskTy
, Op2Reg
, NotMask
);
7994 auto Or
= MIRBuilder
.buildOr(DstTy
, NewOp1
, NewOp2
);
7995 MIRBuilder
.buildIntToPtr(DstReg
, Or
);
7997 MIRBuilder
.buildOr(DstReg
, NewOp1
, NewOp2
);
7999 MI
.eraseFromParent();
8003 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDIVREM(MachineInstr
&MI
) {
8004 // Split DIVREM into individual instructions.
8005 unsigned Opcode
= MI
.getOpcode();
8007 MIRBuilder
.buildInstr(
8008 Opcode
== TargetOpcode::G_SDIVREM
? TargetOpcode::G_SDIV
8009 : TargetOpcode::G_UDIV
,
8010 {MI
.getOperand(0).getReg()}, {MI
.getOperand(2), MI
.getOperand(3)});
8011 MIRBuilder
.buildInstr(
8012 Opcode
== TargetOpcode::G_SDIVREM
? TargetOpcode::G_SREM
8013 : TargetOpcode::G_UREM
,
8014 {MI
.getOperand(1).getReg()}, {MI
.getOperand(2), MI
.getOperand(3)});
8015 MI
.eraseFromParent();
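// Worked example of the ashr/add/xor expansion below for s32 with %a = -5:
// %v1 = -5 >>s 31 = -1 (all ones), %v2 = -5 + -1 = -6, %res = -6 ^ -1 = 5.
// For non-negative inputs %v1 is 0, so the add and xor leave %a unchanged.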
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_ASHR %a, scalar_size-1
  // %v2 = G_ADD %a, %v1
  // %res = G_XOR %v2, %v1
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  Register OpReg = MI.getOperand(1).getReg();
  auto ShiftAmt =
      MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
  auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
  auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
  MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_CONSTANT 0
  // %v2 = G_SUB %v1, %a
  // %res = G_SMAX %a, %v2
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(SrcReg);
  auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
  MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(SrcReg);

  // The source could be a scalar if the IR type was <1 x sN>.
  if (SrcTy.isScalar()) {
    if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
      return UnableToLegalize; // FIXME: handle extension.
    // This can be just a plain copy.
    Observer.changingInstr(MI);
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
    Observer.changedInstr(MI);
    return Legalized;
  }
  return UnableToLegalize;
}
static Type *getTypeForLLT(LLT Ty, LLVMContext &C);
8072 LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVAArg(MachineInstr
&MI
) {
8073 MachineFunction
&MF
= *MI
.getMF();
8074 const DataLayout
&DL
= MIRBuilder
.getDataLayout();
8075 LLVMContext
&Ctx
= MF
.getFunction().getContext();
8076 Register ListPtr
= MI
.getOperand(1).getReg();
8077 LLT PtrTy
= MRI
.getType(ListPtr
);
8079 // LstPtr is a pointer to the head of the list. Get the address
8080 // of the head of the list.
8081 Align PtrAlignment
= DL
.getABITypeAlign(getTypeForLLT(PtrTy
, Ctx
));
8082 MachineMemOperand
*PtrLoadMMO
= MF
.getMachineMemOperand(
8083 MachinePointerInfo(), MachineMemOperand::MOLoad
, PtrTy
, PtrAlignment
);
8084 auto VAList
= MIRBuilder
.buildLoad(PtrTy
, ListPtr
, *PtrLoadMMO
).getReg(0);
8086 const Align
A(MI
.getOperand(2).getImm());
8087 LLT PtrTyAsScalarTy
= LLT::scalar(PtrTy
.getSizeInBits());
8088 if (A
> TLI
.getMinStackArgumentAlignment()) {
8090 MIRBuilder
.buildConstant(PtrTyAsScalarTy
, A
.value() - 1).getReg(0);
8091 auto AddDst
= MIRBuilder
.buildPtrAdd(PtrTy
, VAList
, AlignAmt
);
8092 auto AndDst
= MIRBuilder
.buildMaskLowPtrBits(PtrTy
, AddDst
, Log2(A
));
8093 VAList
= AndDst
.getReg(0);
8096 // Increment the pointer, VAList, to the next vaarg
8097 // The list should be bumped by the size of element in the current head of
8099 Register Dst
= MI
.getOperand(0).getReg();
8100 LLT LLTTy
= MRI
.getType(Dst
);
8101 Type
*Ty
= getTypeForLLT(LLTTy
, Ctx
);
8103 MIRBuilder
.buildConstant(PtrTyAsScalarTy
, DL
.getTypeAllocSize(Ty
));
8104 auto Succ
= MIRBuilder
.buildPtrAdd(PtrTy
, VAList
, IncAmt
);
8106 // Store the increment VAList to the legalized pointer
8107 MachineMemOperand
*StoreMMO
= MF
.getMachineMemOperand(
8108 MachinePointerInfo(), MachineMemOperand::MOStore
, PtrTy
, PtrAlignment
);
8109 MIRBuilder
.buildStore(Succ
, ListPtr
, *StoreMMO
);
8110 // Load the actual argument out of the pointer VAList
8111 Align EltAlignment
= DL
.getABITypeAlign(Ty
);
8112 MachineMemOperand
*EltLoadMMO
= MF
.getMachineMemOperand(
8113 MachinePointerInfo(), MachineMemOperand::MOLoad
, LLTTy
, EltAlignment
);
8114 MIRBuilder
.buildLoad(Dst
, VAList
, *EltLoadMMO
);
8116 MI
.eraseFromParent();
static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
  if (Ty.isVector())
    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
                                Ty.getNumElements());
  return IntegerType::get(C, Ty.getSizeInBits());
}
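// For a non-constant (or vector) fill value the splat below is built by
// arithmetic: the byte is zero-extended and multiplied by the 0x0101...
// pattern, so for example an s32 memset value of 0xAB materializes
// 0xAB * 0x01010101 = 0xABABABAB.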
// Get a vectorized representation of the memset value operand, GISel edition.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  if (!Ty.isVector() && ValVRegAndVal) {
    APInt Scalar = ValVRegAndVal->Value.trunc(8);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Ty, 0).getReg(0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  // For vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    Val = MIB.buildSplatVector(Ty, Val).getReg(0);

  return Val;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // largest store type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
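      // Worked example (illustrative only): a 9-byte memset lowered as two
      // s64 stores writes bytes [0,8) and then, after this adjustment,
      // bytes [1,9), overlapping the first store by 7 bytes.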
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
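    // For example (illustrative only): if the splat was materialised as an
    // s64 0xABAB...AB and the trailing piece is an s32 store, a target where
    // the s64 -> s32 truncate is free reuses the splat via G_TRUNC; otherwise
    // the narrower pattern is rebuilt with getMemsetValue.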
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  auto [Dst, Src, Len] = MI.getFirst3Regs();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
                                   uint64_t KnownLen, Align DstAlign,
                                   Align SrcAlign, bool IsVolatile) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
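  // A memcpy marked inline must always be expanded, so pass an effectively
  // unbounded store count instead of the target's getMaxStoresPerMemcpy limit.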
  return lowerMemcpy(MI, Dst, Src, KnownLen,
                     std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
                     IsVolatile);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign.previous();

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest
  // buffer of that value loaded. This can result in a sequence of loads and
  // stores of mixed types, depending on what the target specifies as good
  // types to use.
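  // Worked example (illustrative only): a 12-byte copy on a target that
  // prefers s64 typically yields MemOps = {s64, s32}, i.e. an 8-byte
  // load/store at offset 0 followed by a 4-byte load/store at offset 8.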
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
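  // Overlap is disabled below by passing /*IsVolatile*/ true to MemOp::Copy,
  // which makes the resulting MemOp report that overlapping accesses are not
  // allowed.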
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
        NewAlign = NewAlign.previous();

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the
  // stores. Apart from that, this loop is pretty much doing the same thing as
  // the memcpy codegen function.
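  // For example (illustrative only): copying bytes [0,16) to [8,24) within
  // the same buffer would clobber not-yet-read source bytes if each store
  // were issued immediately after its load, so all loads are materialised
  // first.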
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      auto Offset =
          MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      auto Offset =
          MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
  const unsigned Opc = MI.getOpcode();
  // This combine is fairly complex so it's not written with a separate
  // matcher function.
  assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
          Opc == TargetOpcode::G_MEMSET) &&
         "Expected memcpy like instruction");

  auto MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;

  Align DstAlign = MemOp->getBaseAlign();
  Align SrcAlign;
  auto [Dst, Src, Len] = MI.getFirst3Regs();

  if (Opc != TargetOpcode::G_MEMSET) {
    assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
    MemOp = *(++MMOIt);
    SrcAlign = MemOp->getBaseAlign();
  }

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  if (!LenVRegAndVal)
    return UnableToLegalize;
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();

  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  bool IsVolatile = MemOp->isVolatile();
  if (Opc == TargetOpcode::G_MEMCPY_INLINE)
    return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                             IsVolatile);

  // Don't try to optimize volatile.
  if (IsVolatile)
    return UnableToLegalize;

  if (MaxLen && KnownLen > MaxLen)
    return UnableToLegalize;

  if (Opc == TargetOpcode::G_MEMCPY) {
    auto &MF = *MI.getParent()->getParent();
    const auto &TLI = *MF.getSubtarget().getTargetLowering();
    bool OptSize = shouldLowerMemFuncForSize(MF);
    uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
    return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
                       IsVolatile);
  }
  if (Opc == TargetOpcode::G_MEMMOVE)
    return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
  if (Opc == TargetOpcode::G_MEMSET)
    return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
  return UnableToLegalize;