//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file implements the LegalizerHelper class to legalize
/// individual instructions and the LegalizeMachineIR wrapper pass for the
/// primary legalization.
///
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LowLevelTypeUtils.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcallUtil.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <numeric>
#include <optional>

#define DEBUG_TYPE "legalizer"

using namespace llvm;
using namespace LegalizeActions;
using namespace MIPatternMatch;
/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
///
/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
/// with any leftover piece as type \p LeftoverTy.
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
static std::pair<int, int>
getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
  assert(!LeftoverTy.isValid() && "this is an out argument");

  unsigned Size = OrigTy.getSizeInBits();
  unsigned NarrowSize = NarrowTy.getSizeInBits();
  unsigned NumParts = Size / NarrowSize;
  unsigned LeftoverSize = Size - NumParts * NarrowSize;
  assert(Size > NarrowSize);

  if (LeftoverSize == 0)
    return {NumParts, 0};

  if (NarrowTy.isVector()) {
    unsigned EltSize = OrigTy.getScalarSizeInBits();
    if (LeftoverSize % EltSize != 0)
      return {-1, -1};
    LeftoverTy =
        LLT::scalarOrVector(ElementCount::getFixed(LeftoverSize / EltSize),
                            OrigTy.getElementType());
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
}
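
// Worked illustration of the arithmetic above (values follow directly from
// the code, not extra behavior): breaking an s100 value into s32 pieces
// yields {NumParts = 3, NumLeftover = 1} with LeftoverTy = s4, while breaking
// a v5s16 into v2s16 pieces yields {2, 1} with LeftoverTy = s16.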

static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
  if (!Ty.isScalar())
    return nullptr;

  switch (Ty.getSizeInBits()) {
  case 16:
    return Type::getHalfTy(Ctx);
  case 32:
    return Type::getFloatTy(Ctx);
  case 64:
    return Type::getDoubleTy(Ctx);
  case 80:
    return Type::getX86_FP80Ty(Ctx);
  case 128:
    return Type::getFP128Ty(Ctx);
  default:
    return nullptr;
  }
}

LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}

LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelKnownBits *KB)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}

LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}

void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs.begin(), PartRegs.end());
    AllRegs.append(LeftoverRegs.begin(), LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
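
// Worked illustration of the scalar GCD path above (not an extra code path):
// re-assembling an s100 destination from three s32 parts plus an s4 leftover
// computes GCDTy = s4, unmerges every part into s4 pieces, and then lets
// buildLCMMergePieces/buildWidenedRemergeToDst rebuild the wide value.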

void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
                                       Register Reg) {
  LLT Ty = MRI.getType(Reg);
  SmallVector<Register, 8> RegElts;
  extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
               MIRBuilder, MRI);
  Elts.append(RegElts);
}

/// Merge \p PartRegs with different types into \p DstReg.
void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
                                           ArrayRef<Register> PartRegs) {
  SmallVector<Register, 8> AllElts;
  for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
    appendVectorElts(AllElts, PartRegs[i]);

  Register Leftover = PartRegs[PartRegs.size() - 1];
  if (!MRI.getType(Leftover).isVector())
    AllElts.push_back(Leftover);
  else
    appendVectorElts(AllElts, Leftover);

  MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
}

/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
                              const MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);

  const int StartIdx = Regs.size();
  const int NumResults = MI.getNumOperands() - 1;
  Regs.resize(Regs.size() + NumResults);
  for (int I = 0; I != NumResults; ++I)
    Regs[StartIdx + I] = MI.getOperand(I).getReg();
}

void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
                                     LLT GCDTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy == GCDTy) {
    // If the source already evenly divides the result type, we don't need to
    // do anything.
    Parts.push_back(SrcReg);
  } else {
    // Need to split into common type sized pieces.
    auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
    getUnmergeResults(Parts, *Unmerge);
  }
}

LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
                                    LLT NarrowTy, Register SrcReg) {
  LLT SrcTy = MRI.getType(SrcReg);
  LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
  extractGCDType(Parts, GCDTy, SrcReg);
  return GCDTy;
}
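
// Illustration (assuming getGCDType reduces scalar sizes by their gcd): with
// DstTy = s64, NarrowTy = s48 and an s64 source, the common type is s16
// (gcd(64, 48) = 16), so the source is unmerged into four s16 pieces that
// both the narrow and destination types can be rebuilt from.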

LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
          MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
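
// Worked example of the padding logic: with DstTy = s96, NarrowTy = s64,
// GCDTy = s32 and three s32 sources, LCMTy = s192, so NumParts = 3 and
// NumSubParts = 2. The six s32 slots are filled with the three sources plus
// three copies of PadReg, and the caller later extracts s96 back out of the
// s192 merge.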

void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}
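
// Illustration of the two extraction paths above: for DstTy = s96 and
// LCMTy = s192 the scalar path emits a G_TRUNC of the s192 merge; for
// DstTy = v2s32 and LCMTy = v4s32 the vector path unmerges into two v2s32
// defs, the first of which is DstReg and the second simply dead.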

static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}
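
// For instance, getRTLibDesc(TargetOpcode::G_FSIN, 64) resolves to
// RTLIB::SIN_F64 (typically the libm "sin" routine), and G_SDIV at size 32
// resolves to RTLIB::SDIV_I32 (typically __divsi3 from compiler-rt/libgcc).
// The macros above expand to exactly these switches over Size.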

/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET this will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For other instructions
    // it will be the returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}

LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    LostDebugLocObserver &LocObserver, MachineInstr *MI) {
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  const char *Name = TLI.getLibcallName(Libcall);
  if (!Name)
    return LegalizerHelper::UnableToLegalize;
  const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
  return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
}
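
// Usage sketch, mirroring how the switch in LegalizerHelper::libcall calls
// this overload (Dst/Src are assumed s64 virtual registers and Ctx the
// current LLVMContext):
//
//   createLibcall(MIRBuilder, RTLIB::SQRT_F64,
//                 {Dst, Type::getDoubleTy(Ctx), 0},
//                 {{Src, Type::getDoubleTy(Ctx), 0}}, LocObserver, &MI);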

// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
              Type *OpType, LostDebugLocObserver &LocObserver) {
  auto Libcall = getRTLibDesc(MI.getOpcode(), Size);

  // FIXME: What does the original arg index mean here?
  SmallVector<CallLowering::ArgInfo, 3> Args;
  for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
    Args.push_back({MO.getReg(), OpType, 0});
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), OpType, 0}, Args,
                       LocObserver, &MI);
}

LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
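
// Conceptually, this rewrites e.g.
//   G_MEMCPY %dst(p0), %src(p0), %len(s64), 0
// into a call to "memcpy" with arguments (%dst, %src, %len) and an unused
// return value; when the trailing 'tail' immediate is set and the call sits
// in tail position, the call itself becomes the block terminator.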

static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
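
// Example: a 4-byte G_ATOMICRMW_ADD with acquire ordering maps through the
// LCALL5 table to RTLIB::OUTLINE_ATOMIC_LDADD4_ACQ, which on AArch64 is
// typically lowered to the __aarch64_ldadd4_acq helper from the
// outline-atomics runtime.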

static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    // The outline helpers provide LDCLR (clear bits) rather than AND and
    // LDADD rather than SUB, so adjust the value operand accordingly: pass
    // ~Val for AND and -Val for SUB.
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}

static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
                                       Type *FromType) {
  auto ToMVT = MVT::getVT(ToType);
  auto FromMVT = MVT::getVT(FromType);

  switch (Opcode) {
  case TargetOpcode::G_FPEXT:
    return RTLIB::getFPEXT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTRUNC:
    return RTLIB::getFPROUND(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOSI:
    return RTLIB::getFPTOSINT(FromMVT, ToMVT);
  case TargetOpcode::G_FPTOUI:
    return RTLIB::getFPTOUINT(FromMVT, ToMVT);
  case TargetOpcode::G_SITOFP:
    return RTLIB::getSINTTOFP(FromMVT, ToMVT);
  case TargetOpcode::G_UITOFP:
    return RTLIB::getUINTTOFP(FromMVT, ToMVT);
  }
  llvm_unreachable("Unsupported libcall function");
}

static LegalizerHelper::LegalizeResult
conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
                  Type *FromType, LostDebugLocObserver &LocObserver,
                  const TargetLowering &TLI, bool IsSigned = false) {
  CallLowering::ArgInfo Arg = {MI.getOperand(1).getReg(), FromType, 0};
  if (FromType->isIntegerTy()) {
    if (TLI.shouldSignExtendTypeInLibCall(FromType, IsSigned))
      Arg.Flags[0].setSExt();
    else
      Arg.Flags[0].setZExt();
  }

  RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
  return createLibcall(MIRBuilder, Libcall,
                       {MI.getOperand(0).getReg(), ToType, 0}, Arg, LocObserver,
                       &MI);
}

static RTLIB::Libcall
getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
  RTLIB::Libcall RTLibcall;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_GET_FPENV:
    RTLibcall = RTLIB::FEGETENV;
    break;
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_RESET_FPENV:
    RTLibcall = RTLIB::FESETENV;
    break;
  case TargetOpcode::G_GET_FPMODE:
    RTLibcall = RTLIB::FEGETMODE;
    break;
  case TargetOpcode::G_SET_FPMODE:
  case TargetOpcode::G_RESET_FPMODE:
    RTLibcall = RTLIB::FESETMODE;
    break;
  default:
    llvm_unreachable("Unexpected opcode");
  }
  return RTLibcall;
}

// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
//
//     %0:_(s32) = G_GET_FPMODE
//
// is transformed to:
//
//     %1:_(p0) = G_FRAME_INDEX %stack.0
//     BL &fegetmode
//     %0:_(s32) = G_LOAD %1
//
LegalizerHelper::LegalizeResult
LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary into which the library function will put the read
  // state.
  Register Dst = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Dst);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  auto Res =
      createLibcall(MIRBuilder, RTLibcall,
                    CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                    CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                    LocObserver, nullptr);
  if (Res != LegalizerHelper::Legalized)
    return Res;

  // Create a load from the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
  MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);

  return LegalizerHelper::Legalized;
}

// Similar to `createGetStateLibcall`, this function calls a library function
// using transient space on the stack. In this case the library function reads
// the content of the memory region.
LegalizerHelper::LegalizeResult
LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
                                       MachineInstr &MI,
                                       LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &MRI = *MIRBuilder.getMRI();
  auto &Ctx = MF.getFunction().getContext();

  // Create a temporary from which the library function will read the new
  // state.
  Register Src = MI.getOperand(0).getReg();
  LLT StateTy = MRI.getType(Src);
  TypeSize StateSize = StateTy.getSizeInBytes();
  Align TempAlign = getStackTemporaryAlignment(StateTy);
  MachinePointerInfo TempPtrInfo;
  auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);

  // Put the new state into the temporary.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
  MIRBuilder.buildStore(Src, Temp, *MMO);

  // Create a call to the library function, with the temporary as an argument.
  unsigned TempAddrSpace = DL.getAllocaAddrSpace();
  Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
                       LocObserver, nullptr);
}

/// Returns the corresponding libcall for the given Pred and
/// the ICMP predicate that should be generated to compare with #0
/// after the libcall.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred)                                 \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return {RTLIB::LibcallPrefix##32, ICmpPred};                             \
    case 64:                                                                   \
      return {RTLIB::LibcallPrefix##64, ICmpPred};                             \
    case 128:                                                                  \
      return {RTLIB::LibcallPrefix##128, ICmpPred};                            \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Pred) {
  case CmpInst::FCMP_OEQ:
    RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
  case CmpInst::FCMP_UNE:
    RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
  case CmpInst::FCMP_OGE:
    RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
  case CmpInst::FCMP_OLT:
    RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
  case CmpInst::FCMP_OLE:
    RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
  case CmpInst::FCMP_OGT:
    RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
  case CmpInst::FCMP_UNO:
    RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
  default:
    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
  }
}
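
// Example: for FCMP_OGT at size 32 this returns
// {RTLIB::OGT_F32, CmpInst::ICMP_SGT}, i.e. call the soft-float comparison
// routine (typically __gtsf2) and then test its i32 result with "sgt #0".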

LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  LLT DstTy = MRI.getType(DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(DstReg, Oeq, Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We invert the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64, isel can even select the two cmps into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}

// This function is used to legalize operations that set the default
// floating-point environment state. In the C library a call like
// `fesetmode(FE_DFL_MODE)` is used for that. On most targets supported by
// glibc, FE_DFL_MODE is defined as `((const femode_t *) -1)`. That assumption
// is used here. If it does not hold for some target, the target must provide
// custom lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
                                         MachineInstr &MI,
                                         LostDebugLocObserver &LocObserver) {
  const DataLayout &DL = MIRBuilder.getDataLayout();
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();

  // Create an argument for the library function.
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
  unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
  LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
  auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
  DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
  MIRBuilder.buildIntToPtr(Dest, DefValue);

  RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
  return createLibcall(MIRBuilder, RTLibcall,
                       CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
                       CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
                       LocObserver, &MI);
}
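
// In C terms the emitted sequence behaves like
//   fesetmode((const femode_t *)-1);   // i.e. FE_DFL_MODE on glibc targets
// with the -1 constant materialized by the G_CONSTANT/G_INTTOPTR pair above.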

LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    LLT LLTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ITy, 0},
                      {{MI.getOperand(1).getReg(), HLTy, 0}}, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    Args[1].Flags[0].setSExt();
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder, Type::getIntNTy(Ctx, ToSize), FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize),
                          LocObserver, TLI, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                                                              unsigned TypeIdx,
                                                              LLT NarrowTy) {
  uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_IMPLICIT_DEF: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // If SizeOp0 is not an exact multiple of NarrowSize, emit
    // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
    // FIXME: Although this would also be legal for the general case, it causes
    // a lot of regressions in the emitted code (superfluous COPYs, artifact
    // combines not being hit). This seems to be a problem related to the
    // artifact combiner.
    if (SizeOp0 % NarrowSize != 0) {
      LLT ImplicitTy = NarrowTy;
      if (DstTy.isVector())
        ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);

      Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
      MIRBuilder.buildAnyExt(DstReg, ImplicitReg);

      MI.eraseFromParent();
      return Legalized;
    }

    int NumParts = SizeOp0 / NarrowSize;

    SmallVector<Register, 2> DstRegs;
    for (int i = 0; i < NumParts; ++i)
      DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));

    if (DstTy.isVector())
      MIRBuilder.buildBuildVector(DstReg, DstRegs);
    else
      MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CONSTANT: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    const APInt &Val = MI.getOperand(1).getCImm()->getValue();
    unsigned TotalSize = Ty.getSizeInBits();
    unsigned NarrowSize = NarrowTy.getSizeInBits();
    int NumParts = TotalSize / NarrowSize;

    SmallVector<Register, 4> PartRegs;
    for (int I = 0; I != NumParts; ++I) {
      unsigned Offset = I * NarrowSize;
      auto K = MIRBuilder.buildConstant(NarrowTy,
                                        Val.lshr(Offset).trunc(NarrowSize));
      PartRegs.push_back(K.getReg(0));
    }

    LLT LeftoverTy;
    unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
    SmallVector<Register, 1> LeftoverRegs;
    if (LeftoverBits != 0) {
      LeftoverTy = LLT::scalar(LeftoverBits);
      auto K = MIRBuilder.buildConstant(
          LeftoverTy,
          Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
      LeftoverRegs.push_back(K.getReg(0));
    }

    insertParts(MI.getOperand(0).getReg(),
                Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);

    MI.eraseFromParent();
    return Legalized;
  }
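  // Worked example of the case above: narrowing a 100-bit G_CONSTANT with
  // NarrowTy = s32 emits three s32 G_CONSTANTs for bits [0,96) plus an s4
  // leftover constant for bits [96,100), which insertParts then recombines
  // into the original s100 value.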
1489 case TargetOpcode::G_SEXT:
1490 case TargetOpcode::G_ZEXT:
1491 case TargetOpcode::G_ANYEXT:
1492 return narrowScalarExt(MI, TypeIdx, NarrowTy);
1493 case TargetOpcode::G_TRUNC: {
1494 if (TypeIdx != 1)
1495 return UnableToLegalize;
1497 uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1498 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1499 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1500 return UnableToLegalize;
1503 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1504 MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1505 MI.eraseFromParent();
1506 return Legalized;
1508 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1509 case TargetOpcode::G_FREEZE: {
1510 if (TypeIdx != 0)
1511 return UnableToLegalize;
1513 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1514 // Should widen scalar first
1515 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1516 return UnableToLegalize;
1518 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1519 SmallVector<Register, 8> Parts;
1520 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1521 Parts.push_back(
1522 MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, {Unmerge.getReg(i)})
1523 .getReg(0));
1526 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1527 MI.eraseFromParent();
1528 return Legalized;
1530 case TargetOpcode::G_ADD:
1531 case TargetOpcode::G_SUB:
1532 case TargetOpcode::G_SADDO:
1533 case TargetOpcode::G_SSUBO:
1534 case TargetOpcode::G_SADDE:
1535 case TargetOpcode::G_SSUBE:
1536 case TargetOpcode::G_UADDO:
1537 case TargetOpcode::G_USUBO:
1538 case TargetOpcode::G_UADDE:
1539 case TargetOpcode::G_USUBE:
1540 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1541 case TargetOpcode::G_MUL:
1542 case TargetOpcode::G_UMULH:
1543 return narrowScalarMul(MI, NarrowTy);
1544 case TargetOpcode::G_EXTRACT:
1545 return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1546 case TargetOpcode::G_INSERT:
1547 return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1548 case TargetOpcode::G_LOAD: {
1549 auto &LoadMI = cast<GLoad>(MI);
1550 Register DstReg = LoadMI.getDstReg();
1551 LLT DstTy = MRI.getType(DstReg);
1552 if (DstTy.isVector())
1553 return UnableToLegalize;
1555 if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1556 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1557 MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1558 MIRBuilder.buildAnyExt(DstReg, TmpReg);
1559 LoadMI.eraseFromParent();
1560 return Legalized;
1563 return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1565 case TargetOpcode::G_ZEXTLOAD:
1566 case TargetOpcode::G_SEXTLOAD: {
1567 auto &LoadMI = cast<GExtLoad>(MI);
1568 Register DstReg = LoadMI.getDstReg();
1569 Register PtrReg = LoadMI.getPointerReg();
1571 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1572 auto &MMO = LoadMI.getMMO();
1573 unsigned MemSize = MMO.getSizeInBits().getValue();
1575 if (MemSize == NarrowSize) {
1576 MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1577 } else if (MemSize < NarrowSize) {
1578 MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1579 } else if (MemSize > NarrowSize) {
1580 // FIXME: Need to split the load.
1581 return UnableToLegalize;
1584 if (isa<GZExtLoad>(LoadMI))
1585 MIRBuilder.buildZExt(DstReg, TmpReg);
1586 else
1587 MIRBuilder.buildSExt(DstReg, TmpReg);
1589 LoadMI.eraseFromParent();
1590 return Legalized;
1592 case TargetOpcode::G_STORE: {
1593 auto &StoreMI = cast<GStore>(MI);
1595 Register SrcReg = StoreMI.getValueReg();
1596 LLT SrcTy = MRI.getType(SrcReg);
1597 if (SrcTy.isVector())
1598 return UnableToLegalize;
1600 int NumParts = SizeOp0 / NarrowSize;
1601 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1602 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1603 if (SrcTy.isVector() && LeftoverBits != 0)
1604 return UnableToLegalize;
1606 if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1607 Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1608 MIRBuilder.buildTrunc(TmpReg, SrcReg);
1609 MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1610 StoreMI.eraseFromParent();
1611 return Legalized;
1614 return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1616 case TargetOpcode::G_SELECT:
1617 return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1618 case TargetOpcode::G_AND:
1619 case TargetOpcode::G_OR:
1620 case TargetOpcode::G_XOR: {
1621 // Legalize bitwise operation:
1622 // A = BinOp<Ty> B, C
1623 // into:
1624 // B1, ..., BN = G_UNMERGE_VALUES B
1625 // C1, ..., CN = G_UNMERGE_VALUES C
1626 // A1 = BinOp<Ty/N> B1, C1
1627 // ...
1628 // AN = BinOp<Ty/N> BN, CN
1629 // A = G_MERGE_VALUES A1, ..., AN
1630 return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1632 case TargetOpcode::G_SHL:
1633 case TargetOpcode::G_LSHR:
1634 case TargetOpcode::G_ASHR:
1635 return narrowScalarShift(MI, TypeIdx, NarrowTy);
1636 case TargetOpcode::G_CTLZ:
1637 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1638 case TargetOpcode::G_CTTZ:
1639 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1640 case TargetOpcode::G_CTPOP:
1641 if (TypeIdx == 1)
1642 switch (MI.getOpcode()) {
1643 case TargetOpcode::G_CTLZ:
1644 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1645 return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1646 case TargetOpcode::G_CTTZ:
1647 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1648 return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1649 case TargetOpcode::G_CTPOP:
1650 return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1651 default:
1652 return UnableToLegalize;
1655 Observer.changingInstr(MI);
1656 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1657 Observer.changedInstr(MI);
1658 return Legalized;
1659 case TargetOpcode::G_INTTOPTR:
1660 if (TypeIdx != 1)
1661 return UnableToLegalize;
1663 Observer.changingInstr(MI);
1664 narrowScalarSrc(MI, NarrowTy, 1);
1665 Observer.changedInstr(MI);
1666 return Legalized;
1667 case TargetOpcode::G_PTRTOINT:
1668 if (TypeIdx != 0)
1669 return UnableToLegalize;
1671 Observer.changingInstr(MI);
1672 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1673 Observer.changedInstr(MI);
1674 return Legalized;
1675 case TargetOpcode::G_PHI: {
1676 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1677 // NarrowSize.
1678 if (SizeOp0 % NarrowSize != 0)
1679 return UnableToLegalize;
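// For example, an s64 G_PHI narrowed with NarrowTy = s32 becomes two s32
// phis whose results are remerged (illustrative MIR; register names are
// made up):
//   %lo:_(s32) = G_PHI %a0(s32), %bb.0, %b0(s32), %bb.1
//   %hi:_(s32) = G_PHI %a1(s32), %bb.0, %b1(s32), %bb.1
//   %dst:_(s64) = G_MERGE_VALUES %lo, %hi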
1681 unsigned NumParts = SizeOp0 / NarrowSize;
1682 SmallVector<Register, 2> DstRegs(NumParts);
1683 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1684 Observer.changingInstr(MI);
1685 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1686 MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1687 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1688 extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1689 SrcRegs[i / 2], MIRBuilder, MRI);
1691 MachineBasicBlock &MBB = *MI.getParent();
1692 MIRBuilder.setInsertPt(MBB, MI);
1693 for (unsigned i = 0; i < NumParts; ++i) {
1694 DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1695 MachineInstrBuilder MIB =
1696 MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1697 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1698 MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1700 MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1701 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1702 Observer.changedInstr(MI);
1703 MI.eraseFromParent();
1704 return Legalized;
1706 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1707 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1708 if (TypeIdx != 2)
1709 return UnableToLegalize;
1711 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1712 Observer.changingInstr(MI);
1713 narrowScalarSrc(MI, NarrowTy, OpIdx);
1714 Observer.changedInstr(MI);
1715 return Legalized;
1717 case TargetOpcode::G_ICMP: {
1718 Register LHS = MI.getOperand(2).getReg();
1719 LLT SrcTy = MRI.getType(LHS);
1720 CmpInst::Predicate Pred =
1721 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1723 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1724 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1725 if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1726 LHSLeftoverRegs, MIRBuilder, MRI))
1727 return UnableToLegalize;
1729 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1730 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1731 if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1732 RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
1733 return UnableToLegalize;
1735 // We now have the LHS and RHS of the compare split into narrow-type
1736 // registers, plus potentially some leftover type.
1737 Register Dst = MI.getOperand(0).getReg();
1738 LLT ResTy = MRI.getType(Dst);
1739 if (ICmpInst::isEquality(Pred)) {
1740 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1741 // them together. For each equal part, the result should be all 0s. For
1742 // each non-equal part, we'll get at least one 1.
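// A sketch in illustrative MIR for an s128 equality compare with
// NarrowTy = s64 (register names are made up):
//   %x0:_(s64) = G_XOR %lhs0, %rhs0
//   %x1:_(s64) = G_XOR %lhs1, %rhs1
//   %or:_(s64) = G_OR %x0, %x1
//   %dst:_(s1) = G_ICMP intpred(eq), %or, %zero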
1743 auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1744 SmallVector<Register, 4> Xors;
1745 for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1746 auto LHS = std::get<0>(LHSAndRHS);
1747 auto RHS = std::get<1>(LHSAndRHS);
1748 auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1749 Xors.push_back(Xor);
1752 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1753 // to the desired narrow type so that we can OR them together later.
1754 SmallVector<Register, 4> WidenedXors;
1755 for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1756 auto LHS = std::get<0>(LHSAndRHS);
1757 auto RHS = std::get<1>(LHSAndRHS);
1758 auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1759 LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1760 buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1761 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1762 Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1765 // Now, for each part we broke up, we know if they are equal/not equal
1766 // based on the G_XOR. We can OR these all together and compare against
1767 // 0 to get the result.
1768 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1769 auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1770 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1771 Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1772 MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1773 } else {
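// For non-equality predicates, compare the parts from least to most
// significant: the lower parts use the unsigned form of the predicate, and
// each partial result is chained through a select keyed on equality of the
// more significant parts. A sketch in illustrative MIR for an s128
// G_ICMP slt with NarrowTy = s64 (register names are made up):
//   %lo:_(s1)  = G_ICMP intpred(ult), %lhs0, %rhs0
//   %hi:_(s1)  = G_ICMP intpred(slt), %lhs1, %rhs1
//   %eq:_(s1)  = G_ICMP intpred(eq),  %lhs1, %rhs1
//   %dst:_(s1) = G_SELECT %eq, %lo, %hi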
1774 Register CmpIn;
1775 for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1776 Register CmpOut;
1777 CmpInst::Predicate PartPred;
1779 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1780 PartPred = Pred;
1781 CmpOut = Dst;
1782 } else {
1783 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1784 CmpOut = MRI.createGenericVirtualRegister(ResTy);
1787 if (!CmpIn) {
1788 MIRBuilder.buildICmp(PartPred, CmpOut, LHSPartRegs[I],
1789 RHSPartRegs[I]);
1790 } else {
1791 auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSPartRegs[I],
1792 RHSPartRegs[I]);
1793 auto CmpEq = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1794 LHSPartRegs[I], RHSPartRegs[I]);
1795 MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1798 CmpIn = CmpOut;
1801 for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1802 Register CmpOut;
1803 CmpInst::Predicate PartPred;
1805 if (I == E - 1 && LHSLeftoverRegs.empty()) {
1806 PartPred = Pred;
1807 CmpOut = Dst;
1808 } else {
1809 PartPred = ICmpInst::getUnsignedPredicate(Pred);
1810 CmpOut = MRI.createGenericVirtualRegister(ResTy);
1813 if (!CmpIn) {
1814 MIRBuilder.buildICmp(PartPred, CmpOut, LHSLeftoverRegs[I],
1815 RHSLeftoverRegs[I]);
1816 } else {
1817 auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSLeftoverRegs[I],
1818 RHSLeftoverRegs[I]);
1819 auto CmpEq =
1820 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1821 LHSLeftoverRegs[I], RHSLeftoverRegs[I]);
1822 MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1825 CmpIn = CmpOut;
1828 MI.eraseFromParent();
1829 return Legalized;
1831 case TargetOpcode::G_FCMP:
1832 if (TypeIdx != 0)
1833 return UnableToLegalize;
1835 Observer.changingInstr(MI);
1836 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1837 Observer.changedInstr(MI);
1838 return Legalized;
1840 case TargetOpcode::G_SEXT_INREG: {
1841 if (TypeIdx != 0)
1842 return UnableToLegalize;
1844 int64_t SizeInBits = MI.getOperand(2).getImm();
1846 // So long as the new type has more bits than the bits we're extending, we
1847 // don't need to break it apart.
1848 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1849 Observer.changingInstr(MI);
1850 // We don't lose any non-extension bits by truncating the src and
1851 // sign-extending the dst.
1852 MachineOperand &MO1 = MI.getOperand(1);
1853 auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1854 MO1.setReg(TruncMIB.getReg(0));
1856 MachineOperand &MO2 = MI.getOperand(0);
1857 Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1858 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1859 MIRBuilder.buildSExt(MO2, DstExt);
1860 MO2.setReg(DstExt);
1861 Observer.changedInstr(MI);
1862 return Legalized;
1865 // Break it apart. Components below the extension point are unmodified. The
1866 // component containing the extension point becomes a narrower SEXT_INREG.
1867 // Components above it are ashr'd from the component containing the
1868 // extension point.
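// A sketch in illustrative MIR, assuming %dst:_(s96) = G_SEXT_INREG %src, 40
// is narrowed with NarrowTy = s32 (register names are made up):
//   %p0:_(s32), %p1:_(s32), %p2:_(s32) = G_UNMERGE_VALUES %src
//   %q1:_(s32) = G_SEXT_INREG %p1, 8     ; 40 % 32 == 8
//   %q2:_(s32) = G_ASHR %q1, 31          ; replicate the sign bit upward
//   %dst:_(s96) = G_MERGE_VALUES %p0, %q1, %q2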
1869 if (SizeOp0 % NarrowSize != 0)
1870 return UnableToLegalize;
1871 int NumParts = SizeOp0 / NarrowSize;
1873 // List the registers where the destination will be scattered.
1874 SmallVector<Register, 2> DstRegs;
1875 // List the registers where the source will be split.
1876 SmallVector<Register, 2> SrcRegs;
1878 // Create all the temporary registers.
1879 for (int i = 0; i < NumParts; ++i) {
1880 Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1882 SrcRegs.push_back(SrcReg);
1885 // Explode the big arguments into smaller chunks.
1886 MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1888 Register AshrCstReg =
1889 MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1890 .getReg(0);
1891 Register FullExtensionReg;
1892 Register PartialExtensionReg;
1894 // Do the operation on each small part.
1895 for (int i = 0; i < NumParts; ++i) {
1896 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1897 DstRegs.push_back(SrcRegs[i]);
1898 PartialExtensionReg = DstRegs.back();
1899 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1900 assert(PartialExtensionReg &&
1901 "Expected to visit partial extension before full");
1902 if (FullExtensionReg) {
1903 DstRegs.push_back(FullExtensionReg);
1904 continue;
1906 DstRegs.push_back(
1907 MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1908 .getReg(0));
1909 FullExtensionReg = DstRegs.back();
1910 } else {
1911 DstRegs.push_back(
1912 MIRBuilder
1913 .buildInstr(
1914 TargetOpcode::G_SEXT_INREG, {NarrowTy},
1915 {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1916 .getReg(0));
1917 PartialExtensionReg = DstRegs.back();
1921 // Gather the destination registers into the final destination.
1922 Register DstReg = MI.getOperand(0).getReg();
1923 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1924 MI.eraseFromParent();
1925 return Legalized;
1927 case TargetOpcode::G_BSWAP:
1928 case TargetOpcode::G_BITREVERSE: {
1929 if (SizeOp0 % NarrowSize != 0)
1930 return UnableToLegalize;
1932 Observer.changingInstr(MI);
1933 SmallVector<Register, 2> SrcRegs, DstRegs;
1934 unsigned NumParts = SizeOp0 / NarrowSize;
1935 extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
1936 MIRBuilder, MRI);
1938 for (unsigned i = 0; i < NumParts; ++i) {
1939 auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1940 {SrcRegs[NumParts - 1 - i]});
1941 DstRegs.push_back(DstPart.getReg(0));
1944 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1946 Observer.changedInstr(MI);
1947 MI.eraseFromParent();
1948 return Legalized;
1950 case TargetOpcode::G_PTR_ADD:
1951 case TargetOpcode::G_PTRMASK: {
1952 if (TypeIdx != 1)
1953 return UnableToLegalize;
1954 Observer.changingInstr(MI);
1955 narrowScalarSrc(MI, NarrowTy, 2);
1956 Observer.changedInstr(MI);
1957 return Legalized;
1959 case TargetOpcode::G_FPTOUI:
1960 case TargetOpcode::G_FPTOSI:
1961 case TargetOpcode::G_FPTOUI_SAT:
1962 case TargetOpcode::G_FPTOSI_SAT:
1963 return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1964 case TargetOpcode::G_FPEXT:
1965 if (TypeIdx != 0)
1966 return UnableToLegalize;
1967 Observer.changingInstr(MI);
1968 narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1969 Observer.changedInstr(MI);
1970 return Legalized;
1971 case TargetOpcode::G_FLDEXP:
1972 case TargetOpcode::G_STRICT_FLDEXP:
1973 return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
1974 case TargetOpcode::G_VSCALE: {
1975 Register Dst = MI.getOperand(0).getReg();
1976 LLT Ty = MRI.getType(Dst);
1978 // Assume VSCALE(1) fits into a legal integer.
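// A sketch in illustrative MIR, assuming %dst:_(s64) = G_VSCALE 3 is
// narrowed with NarrowTy = s32 (register names are made up):
//   %v:_(s32) = G_VSCALE 1
//   %z:_(s64) = G_ZEXT %v
//   %c:_(s64) = G_CONSTANT i64 3
//   %dst:_(s64) = G_MUL %z, %c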
1979 const APInt One(NarrowTy.getSizeInBits(), 1);
1980 auto VScaleBase = MIRBuilder.buildVScale(NarrowTy, One);
1981 auto ZExt = MIRBuilder.buildZExt(Ty, VScaleBase);
1982 auto C = MIRBuilder.buildConstant(Ty, *MI.getOperand(1).getCImm());
1983 MIRBuilder.buildMul(Dst, ZExt, C);
1985 MI.eraseFromParent();
1986 return Legalized;
1991 Register LegalizerHelper::coerceToScalar(Register Val) {
1992 LLT Ty = MRI.getType(Val);
1993 if (Ty.isScalar())
1994 return Val;
1996 const DataLayout &DL = MIRBuilder.getDataLayout();
1997 LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1998 if (Ty.isPointer()) {
1999 if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
2000 return Register();
2001 return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
2004 Register NewVal = Val;
2006 assert(Ty.isVector());
2007 if (Ty.isPointerVector())
2008 NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
2009 return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
2012 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2013 unsigned OpIdx, unsigned ExtOpcode) {
2014 MachineOperand &MO = MI.getOperand(OpIdx);
2015 auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
2016 MO.setReg(ExtB.getReg(0));
2019 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2020 unsigned OpIdx) {
2021 MachineOperand &MO = MI.getOperand(OpIdx);
2022 auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
2023 MO.setReg(ExtB.getReg(0));
2026 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2027 unsigned OpIdx, unsigned TruncOpcode) {
2028 MachineOperand &MO = MI.getOperand(OpIdx);
2029 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2030 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2031 MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
2032 MO.setReg(DstExt);
2035 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2036 unsigned OpIdx, unsigned ExtOpcode) {
2037 MachineOperand &MO = MI.getOperand(OpIdx);
2038 Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
2039 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2040 MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
2041 MO.setReg(DstTrunc);
2044 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2045 unsigned OpIdx) {
2046 MachineOperand &MO = MI.getOperand(OpIdx);
2047 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2048 Register Dst = MO.getReg();
2049 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2050 MO.setReg(DstExt);
2051 MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
2054 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2055 unsigned OpIdx) {
2056 MachineOperand &MO = MI.getOperand(OpIdx);
2057 SmallVector<Register, 8> Regs;
2058 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
2061 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2062 MachineOperand &Op = MI.getOperand(OpIdx);
2063 Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
2066 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2067 MachineOperand &MO = MI.getOperand(OpIdx);
2068 Register CastDst = MRI.createGenericVirtualRegister(CastTy);
2069 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2070 MIRBuilder.buildBitcast(MO, CastDst);
2071 MO.setReg(CastDst);
2074 LegalizerHelper::LegalizeResult
2075 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
2076 LLT WideTy) {
2077 if (TypeIdx != 1)
2078 return UnableToLegalize;
2080 auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
2081 if (DstTy.isVector())
2082 return UnableToLegalize;
2084 LLT SrcTy = MRI.getType(Src1Reg);
2085 const int DstSize = DstTy.getSizeInBits();
2086 const int SrcSize = SrcTy.getSizeInBits();
2087 const int WideSize = WideTy.getSizeInBits();
2088 const int NumMerge = (DstSize + WideSize - 1) / WideSize;
2090 unsigned NumOps = MI.getNumOperands();
2091 unsigned NumSrc = MI.getNumOperands() - 1;
2092 unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
2094 if (WideSize >= DstSize) {
2095 // Directly pack the bits in the target type.
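// A sketch in illustrative MIR, assuming %dst:_(s16) = G_MERGE_VALUES
// %a:_(s8), %b:_(s8) is widened with WideTy = s32 (register names are
// made up):
//   %za:_(s32) = G_ZEXT %a
//   %zb:_(s32) = G_ZEXT %b
//   %sh:_(s32) = G_SHL %zb, 8
//   %or:_(s32) = G_OR %za, %sh
//   %dst:_(s16) = G_TRUNC %or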
2096 Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
2098 for (unsigned I = 2; I != NumOps; ++I) {
2099 const unsigned Offset = (I - 1) * PartSize;
2101 Register SrcReg = MI.getOperand(I).getReg();
2102 assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
2104 auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
2106 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
2107 MRI.createGenericVirtualRegister(WideTy);
2109 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
2110 auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
2111 MIRBuilder.buildOr(NextResult, ResultReg, Shl);
2112 ResultReg = NextResult;
2115 if (WideSize > DstSize)
2116 MIRBuilder.buildTrunc(DstReg, ResultReg);
2117 else if (DstTy.isPointer())
2118 MIRBuilder.buildIntToPtr(DstReg, ResultReg);
2120 MI.eraseFromParent();
2121 return Legalized;
2124 // Unmerge the original values to the GCD type, and recombine to the next
2125 // multiple greater than the original type.
2127 // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
2128 // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
2129 // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
2130 // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
2131 // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
2132 // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
2133 // %12:_(s12) = G_MERGE_VALUES %10, %11
2135 // Padding with undef if necessary:
2137 // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
2138 // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
2139 // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
2140 // %7:_(s2) = G_IMPLICIT_DEF
2141 // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
2142 // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
2143 // %10:_(s12) = G_MERGE_VALUES %8, %9
2145 const int GCD = std::gcd(SrcSize, WideSize);
2146 LLT GCDTy = LLT::scalar(GCD);
2148 SmallVector<Register, 8> Parts;
2149 SmallVector<Register, 8> NewMergeRegs;
2150 SmallVector<Register, 8> Unmerges;
2151 LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
2153 // Decompose the original operands if they don't evenly divide.
2154 for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
2155 Register SrcReg = MO.getReg();
2156 if (GCD == SrcSize) {
2157 Unmerges.push_back(SrcReg);
2158 } else {
2159 auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
2160 for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
2161 Unmerges.push_back(Unmerge.getReg(J));
2165 // Pad with undef to the next size that is a multiple of the requested size.
2166 if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
2167 Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
2168 for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
2169 Unmerges.push_back(UndefReg);
2172 const int PartsPerGCD = WideSize / GCD;
2174 // Build merges of each piece.
2175 ArrayRef<Register> Slicer(Unmerges);
2176 for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
2177 auto Merge =
2178 MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
2179 NewMergeRegs.push_back(Merge.getReg(0));
2182 // A truncate may be necessary if the requested type doesn't evenly divide the
2183 // original result type.
2184 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
2185 MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
2186 } else {
2187 auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
2188 MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
2191 MI.eraseFromParent();
2192 return Legalized;
2195 LegalizerHelper::LegalizeResult
2196 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
2197 LLT WideTy) {
2198 if (TypeIdx != 0)
2199 return UnableToLegalize;
2201 int NumDst = MI.getNumOperands() - 1;
2202 Register SrcReg = MI.getOperand(NumDst).getReg();
2203 LLT SrcTy = MRI.getType(SrcReg);
2204 if (SrcTy.isVector())
2205 return UnableToLegalize;
2207 Register Dst0Reg = MI.getOperand(0).getReg();
2208 LLT DstTy = MRI.getType(Dst0Reg);
2209 if (!DstTy.isScalar())
2210 return UnableToLegalize;
2212 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
2213 if (SrcTy.isPointer()) {
2214 const DataLayout &DL = MIRBuilder.getDataLayout();
2215 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
2216 LLVM_DEBUG(
2217 dbgs() << "Not casting non-integral address space integer\n");
2218 return UnableToLegalize;
2221 SrcTy = LLT::scalar(SrcTy.getSizeInBits());
2222 SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
2225 // Widen SrcTy to WideTy. This does not affect the result, but since the
2226 // user requested this size, it is probably better handled than SrcTy and
2227 // should reduce the total number of legalization artifacts.
2228 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2229 SrcTy = WideTy;
2230 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
2233 // There's no unmerge type to target. Directly extract the bits from the
2234 // source type.
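// A sketch in illustrative MIR, assuming %a:_(s8), %b:_(s8) =
// G_UNMERGE_VALUES %src:_(s16) is widened with WideTy = s32 (register
// names are made up):
//   %wide:_(s32) = G_ANYEXT %src
//   %a:_(s8) = G_TRUNC %wide
//   %sh:_(s32) = G_LSHR %wide, 8
//   %b:_(s8) = G_TRUNC %sh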
2235 unsigned DstSize = DstTy.getSizeInBits();
2237 MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
2238 for (int I = 1; I != NumDst; ++I) {
2239 auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
2240 auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
2241 MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
2244 MI.eraseFromParent();
2245 return Legalized;
2248 // Extend the source to a wider type.
2249 LLT LCMTy = getLCMType(SrcTy, WideTy);
2251 Register WideSrc = SrcReg;
2252 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
2253 // TODO: If this is an integral address space, cast to integer and anyext.
2254 if (SrcTy.isPointer()) {
2255 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
2256 return UnableToLegalize;
2259 WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
2262 auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
2264 // Create a sequence of unmerges and merges to the original results. Since we
2265 // may have widened the source, we will need to pad the results with dead defs
2266 // to cover the source register.
2267 // e.g. widen s48 to s64:
2268 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
2270 // =>
2271 // %4:_(s192) = G_ANYEXT %0:_(s96)
2272 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
2273 // ; unpack to GCD type, with extra dead defs
2274 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
2275 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
2276 // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
2277 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
2278 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
2279 const LLT GCDTy = getGCDType(WideTy, DstTy);
2280 const int NumUnmerge = Unmerge->getNumOperands() - 1;
2281 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
2283 // Directly unmerge to the destination without going through a GCD type
2284 // if possible
2285 if (PartsPerRemerge == 1) {
2286 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
2288 for (int I = 0; I != NumUnmerge; ++I) {
2289 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
2291 for (int J = 0; J != PartsPerUnmerge; ++J) {
2292 int Idx = I * PartsPerUnmerge + J;
2293 if (Idx < NumDst)
2294 MIB.addDef(MI.getOperand(Idx).getReg());
2295 else {
2296 // Create dead def for excess components.
2297 MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
2301 MIB.addUse(Unmerge.getReg(I));
2303 } else {
2304 SmallVector<Register, 16> Parts;
2305 for (int J = 0; J != NumUnmerge; ++J)
2306 extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
2308 SmallVector<Register, 8> RemergeParts;
2309 for (int I = 0; I != NumDst; ++I) {
2310 for (int J = 0; J < PartsPerRemerge; ++J) {
2311 const int Idx = I * PartsPerRemerge + J;
2312 RemergeParts.emplace_back(Parts[Idx]);
2315 MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
2316 RemergeParts.clear();
2320 MI.eraseFromParent();
2321 return Legalized;
2324 LegalizerHelper::LegalizeResult
2325 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2326 LLT WideTy) {
2327 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2328 unsigned Offset = MI.getOperand(2).getImm();
2330 if (TypeIdx == 0) {
2331 if (SrcTy.isVector() || DstTy.isVector())
2332 return UnableToLegalize;
2334 SrcOp Src(SrcReg);
2335 if (SrcTy.isPointer()) {
2336 // Extracts from pointers can be handled only if they are really just
2337 // simple integers.
2338 const DataLayout &DL = MIRBuilder.getDataLayout();
2339 if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
2340 return UnableToLegalize;
2342 LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
2343 Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
2344 SrcTy = SrcAsIntTy;
2347 if (DstTy.isPointer())
2348 return UnableToLegalize;
2350 if (Offset == 0) {
2351 // Avoid a shift in the degenerate case.
2352 MIRBuilder.buildTrunc(DstReg,
2353 MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
2354 MI.eraseFromParent();
2355 return Legalized;
2358 // Do a shift in the source type.
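// A sketch in illustrative MIR for %dst:_(s16) = G_EXTRACT %src:_(s64), 16
// (the shift happens at the source width here, since it is wider than
// WideTy):
//   %sh:_(s64) = G_LSHR %src, 16
//   %dst:_(s16) = G_TRUNC %sh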
2359 LLT ShiftTy = SrcTy;
2360 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2361 Src = MIRBuilder.buildAnyExt(WideTy, Src);
2362 ShiftTy = WideTy;
2365 auto LShr = MIRBuilder.buildLShr(
2366 ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
2367 MIRBuilder.buildTrunc(DstReg, LShr);
2368 MI.eraseFromParent();
2369 return Legalized;
2372 if (SrcTy.isScalar()) {
2373 Observer.changingInstr(MI);
2374 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2375 Observer.changedInstr(MI);
2376 return Legalized;
2379 if (!SrcTy.isVector())
2380 return UnableToLegalize;
2382 if (DstTy != SrcTy.getElementType())
2383 return UnableToLegalize;
2385 if (Offset % SrcTy.getScalarSizeInBits() != 0)
2386 return UnableToLegalize;
2388 Observer.changingInstr(MI);
2389 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2391 MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2392 Offset);
2393 widenScalarDst(MI, WideTy.getScalarType(), 0);
2394 Observer.changedInstr(MI);
2395 return Legalized;
2398 LegalizerHelper::LegalizeResult
2399 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2400 LLT WideTy) {
2401 if (TypeIdx != 0 || WideTy.isVector())
2402 return UnableToLegalize;
2403 Observer.changingInstr(MI);
2404 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2405 widenScalarDst(MI, WideTy);
2406 Observer.changedInstr(MI);
2407 return Legalized;
2410 LegalizerHelper::LegalizeResult
2411 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2412 LLT WideTy) {
2413 unsigned Opcode;
2414 unsigned ExtOpcode;
2415 std::optional<Register> CarryIn;
2416 switch (MI.getOpcode()) {
2417 default:
2418 llvm_unreachable("Unexpected opcode!");
2419 case TargetOpcode::G_SADDO:
2420 Opcode = TargetOpcode::G_ADD;
2421 ExtOpcode = TargetOpcode::G_SEXT;
2422 break;
2423 case TargetOpcode::G_SSUBO:
2424 Opcode = TargetOpcode::G_SUB;
2425 ExtOpcode = TargetOpcode::G_SEXT;
2426 break;
2427 case TargetOpcode::G_UADDO:
2428 Opcode = TargetOpcode::G_ADD;
2429 ExtOpcode = TargetOpcode::G_ZEXT;
2430 break;
2431 case TargetOpcode::G_USUBO:
2432 Opcode = TargetOpcode::G_SUB;
2433 ExtOpcode = TargetOpcode::G_ZEXT;
2434 break;
2435 case TargetOpcode::G_SADDE:
2436 Opcode = TargetOpcode::G_UADDE;
2437 ExtOpcode = TargetOpcode::G_SEXT;
2438 CarryIn = MI.getOperand(4).getReg();
2439 break;
2440 case TargetOpcode::G_SSUBE:
2441 Opcode = TargetOpcode::G_USUBE;
2442 ExtOpcode = TargetOpcode::G_SEXT;
2443 CarryIn = MI.getOperand(4).getReg();
2444 break;
2445 case TargetOpcode::G_UADDE:
2446 Opcode = TargetOpcode::G_UADDE;
2447 ExtOpcode = TargetOpcode::G_ZEXT;
2448 CarryIn = MI.getOperand(4).getReg();
2449 break;
2450 case TargetOpcode::G_USUBE:
2451 Opcode = TargetOpcode::G_USUBE;
2452 ExtOpcode = TargetOpcode::G_ZEXT;
2453 CarryIn = MI.getOperand(4).getReg();
2454 break;
2457 if (TypeIdx == 1) {
2458 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);
2460 Observer.changingInstr(MI);
2461 if (CarryIn)
2462 widenScalarSrc(MI, WideTy, 4, BoolExtOp);
2463 widenScalarDst(MI, WideTy, 1);
2465 Observer.changedInstr(MI);
2466 return Legalized;
2469 auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
2470 auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
2471 // Do the arithmetic in the larger type.
2472 Register NewOp;
2473 if (CarryIn) {
2474 LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
2475 NewOp = MIRBuilder
2476 .buildInstr(Opcode, {WideTy, CarryOutTy},
2477 {LHSExt, RHSExt, *CarryIn})
2478 .getReg(0);
2479 } else {
2480 NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
2482 LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
2483 auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
2484 auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
2485 // There is no overflow if the ExtOp is the same as NewOp.
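// For example, for %res:_(s8), %ovf:_(s1) = G_SADDO %a, %b with
// WideTy = s32, the check and the surrounding expansion look like
// (illustrative MIR; register names are made up):
//   %wa:_(s32)  = G_SEXT %a
//   %wb:_(s32)  = G_SEXT %b
//   %sum:_(s32) = G_ADD %wa, %wb
//   %t:_(s8)    = G_TRUNC %sum
//   %x:_(s32)   = G_SEXT %t
//   %ovf:_(s1)  = G_ICMP intpred(ne), %sum, %x
//   %res:_(s8)  = G_TRUNC %sum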
2486 MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
2487 // Now trunc the NewOp to the original result.
2488 MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
2489 MI.eraseFromParent();
2490 return Legalized;
2493 LegalizerHelper::LegalizeResult
2494 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2495 LLT WideTy) {
2496 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2497 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2498 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2499 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2500 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2501 // We can convert this to:
2502 // 1. Any extend iN to iM
2503 // 2. SHL by M-N
2504 // 3. [US][ADD|SUB|SHL]SAT
2505 // 4. L/ASHR by M-N
2507 // It may be more efficient to lower this to a min and a max operation in
2508 // the higher precision arithmetic if the promoted operation isn't legal,
2509 // but this decision is up to the target's lowering request.
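// A sketch in illustrative MIR, assuming %dst:_(s8) = G_SADDSAT %a, %b is
// widened with WideTy = s32, so M-N == 24 (register names are made up):
//   %wa:_(s32)  = G_ANYEXT %a
//   %wb:_(s32)  = G_ANYEXT %b
//   %sa:_(s32)  = G_SHL %wa, 24
//   %sb:_(s32)  = G_SHL %wb, 24
//   %sat:_(s32) = G_SADDSAT %sa, %sb
//   %sr:_(s32)  = G_ASHR %sat, 24
//   %dst:_(s8)  = G_TRUNC %sr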
2510 Register DstReg = MI.getOperand(0).getReg();
2512 unsigned NewBits = WideTy.getScalarSizeInBits();
2513 unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
2515 // For shifts, the RHS (the shift amount) must be zero-extended to preserve
2516 // its unsigned value, and must not itself be shifted left.
2517 auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
2518 auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
2519 : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
2520 auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
2521 auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
2522 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
2524 auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
2525 {ShiftL, ShiftR}, MI.getFlags());
2527 // Use a shift that will preserve the number of sign bits when the trunc is
2528 // folded away.
2529 auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
2530 : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
2532 MIRBuilder.buildTrunc(DstReg, Result);
2533 MI.eraseFromParent();
2534 return Legalized;
2537 LegalizerHelper::LegalizeResult
2538 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2539 LLT WideTy) {
2540 if (TypeIdx == 1) {
2541 Observer.changingInstr(MI);
2542 widenScalarDst(MI, WideTy, 1);
2543 Observer.changedInstr(MI);
2544 return Legalized;
2547 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2548 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2549 LLT SrcTy = MRI.getType(LHS);
2550 LLT OverflowTy = MRI.getType(OriginalOverflow);
2551 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2553 // To determine if the result overflowed in the larger type, we extend the
2554 // input to the larger type, do the multiply (checking if it overflows),
2555 // then also check the high bits of the result to see if overflow happened
2556 // there.
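// A sketch in illustrative MIR for %res:_(s8), %ovf:_(s1) = G_UMULO %a, %b
// with WideTy = s16; the wide multiply cannot itself overflow here
// (16 >= 2 * 8), so only the high half needs checking (register names are
// made up):
//   %wa:_(s16)  = G_ZEXT %a
//   %wb:_(s16)  = G_ZEXT %b
//   %mul:_(s16) = G_MUL %wa, %wb
//   %res:_(s8)  = G_TRUNC %mul
//   %lo:_(s16)  = G_AND %mul, 255   ; keep only the low 8 bits
//   %ovf:_(s1)  = G_ICMP intpred(ne), %mul, %lo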
2557 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2558 auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2559 auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2561 // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2562 // so we don't need to check the overflow result of the larger-type mulo.
2563 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2565 unsigned MulOpc =
2566 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2568 MachineInstrBuilder Mulo;
2569 if (WideMulCanOverflow)
2570 Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
2571 {LeftOperand, RightOperand});
2572 else
2573 Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});
2575 auto Mul = Mulo->getOperand(0);
2576 MIRBuilder.buildTrunc(Result, Mul);
2578 MachineInstrBuilder ExtResult;
2579 // Overflow occurred if it occurred in the larger type, or if the high part
2580 // of the result does not zero/sign-extend the low part. Check this second
2581 // possibility first.
2582 if (IsSigned) {
2583 // For signed, overflow occurred when the high part does not sign-extend
2584 // the low part.
2585 ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2586 } else {
2587 // Unsigned overflow occurred when the high part does not zero-extend the
2588 // low part.
2589 ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2592 if (WideMulCanOverflow) {
2593 auto Overflow =
2594 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2595 // Finally check if the multiplication in the larger type itself overflowed.
2596 MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2597 } else {
2598 MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2600 MI.eraseFromParent();
2601 return Legalized;
2604 LegalizerHelper::LegalizeResult
2605 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2606 unsigned Opcode = MI.getOpcode();
2607 switch (Opcode) {
2608 default:
2609 return UnableToLegalize;
2610 case TargetOpcode::G_ATOMICRMW_XCHG:
2611 case TargetOpcode::G_ATOMICRMW_ADD:
2612 case TargetOpcode::G_ATOMICRMW_SUB:
2613 case TargetOpcode::G_ATOMICRMW_AND:
2614 case TargetOpcode::G_ATOMICRMW_OR:
2615 case TargetOpcode::G_ATOMICRMW_XOR:
2616 case TargetOpcode::G_ATOMICRMW_MIN:
2617 case TargetOpcode::G_ATOMICRMW_MAX:
2618 case TargetOpcode::G_ATOMICRMW_UMIN:
2619 case TargetOpcode::G_ATOMICRMW_UMAX:
2620 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2621 Observer.changingInstr(MI);
2622 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2623 widenScalarDst(MI, WideTy, 0);
2624 Observer.changedInstr(MI);
2625 return Legalized;
2626 case TargetOpcode::G_ATOMIC_CMPXCHG:
2627 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2628 Observer.changingInstr(MI);
2629 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2630 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2631 widenScalarDst(MI, WideTy, 0);
2632 Observer.changedInstr(MI);
2633 return Legalized;
2634 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2635 if (TypeIdx == 0) {
2636 Observer.changingInstr(MI);
2637 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2638 widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2639 widenScalarDst(MI, WideTy, 0);
2640 Observer.changedInstr(MI);
2641 return Legalized;
2643 assert(TypeIdx == 1 &&
2644 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2645 Observer.changingInstr(MI);
2646 widenScalarDst(MI, WideTy, 1);
2647 Observer.changedInstr(MI);
2648 return Legalized;
2649 case TargetOpcode::G_EXTRACT:
2650 return widenScalarExtract(MI, TypeIdx, WideTy);
2651 case TargetOpcode::G_INSERT:
2652 return widenScalarInsert(MI, TypeIdx, WideTy);
2653 case TargetOpcode::G_MERGE_VALUES:
2654 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2655 case TargetOpcode::G_UNMERGE_VALUES:
2656 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2657 case TargetOpcode::G_SADDO:
2658 case TargetOpcode::G_SSUBO:
2659 case TargetOpcode::G_UADDO:
2660 case TargetOpcode::G_USUBO:
2661 case TargetOpcode::G_SADDE:
2662 case TargetOpcode::G_SSUBE:
2663 case TargetOpcode::G_UADDE:
2664 case TargetOpcode::G_USUBE:
2665 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2666 case TargetOpcode::G_UMULO:
2667 case TargetOpcode::G_SMULO:
2668 return widenScalarMulo(MI, TypeIdx, WideTy);
2669 case TargetOpcode::G_SADDSAT:
2670 case TargetOpcode::G_SSUBSAT:
2671 case TargetOpcode::G_SSHLSAT:
2672 case TargetOpcode::G_UADDSAT:
2673 case TargetOpcode::G_USUBSAT:
2674 case TargetOpcode::G_USHLSAT:
2675 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2676 case TargetOpcode::G_CTTZ:
2677 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2678 case TargetOpcode::G_CTLZ:
2679 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2680 case TargetOpcode::G_CTPOP: {
2681 if (TypeIdx == 0) {
2682 Observer.changingInstr(MI);
2683 widenScalarDst(MI, WideTy, 0);
2684 Observer.changedInstr(MI);
2685 return Legalized;
2688 Register SrcReg = MI.getOperand(1).getReg();
2690 // First extend the input.
2691 unsigned ExtOpc = Opcode == TargetOpcode::G_CTTZ ||
2692 Opcode == TargetOpcode::G_CTTZ_ZERO_UNDEF
2693 ? TargetOpcode::G_ANYEXT
2694 : TargetOpcode::G_ZEXT;
2695 auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2696 LLT CurTy = MRI.getType(SrcReg);
2697 unsigned NewOpc = Opcode;
2698 if (NewOpc == TargetOpcode::G_CTTZ) {
2699 // The count is the same in the larger type except if the original
2700 // value was zero. This can be handled by setting the bit just off
2701 // the top of the original type.
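// A sketch in illustrative MIR for s8 G_CTTZ with WideTy = s32 (the final
// count is then zext/trunc'd to the original result type):
//   %ext:_(s32) = G_ANYEXT %src
//   %or:_(s32)  = G_OR %ext, 256    ; bit 8 set, so the count is at most 8
//   %cnt:_(s32) = G_CTTZ_ZERO_UNDEF %or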
2702 auto TopBit =
2703 APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2704 MIBSrc = MIRBuilder.buildOr(
2705 WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2706 // Now we know the operand is non-zero, use the more relaxed opcode.
2707 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2710 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2712 if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2713 // An optimization: the result is the CTLZ of the source shifted left by
2714 // the width difference between WideTy and CurTy, that is,
2715 //   MIBSrc = MIBSrc << (SizeInBits(WideTy) - SizeInBits(CurTy))
2716 //   Result = ctlz MIBSrc
2717 MIBSrc = MIRBuilder.buildShl(WideTy, MIBSrc,
2718 MIRBuilder.buildConstant(WideTy, SizeDiff));
2721 // Perform the operation at the larger size.
2722 auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2723 // This is already the correct result for CTPOP and the CTTZ variants.
2724 if (Opcode == TargetOpcode::G_CTLZ) {
2725 // The correct result is NewOp - (SizeInBits(WideTy) - SizeInBits(CurTy)).
2726 MIBNewOp = MIRBuilder.buildSub(
2727 WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2730 MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2731 MI.eraseFromParent();
2732 return Legalized;
2734 case TargetOpcode::G_BSWAP: {
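// Byte-swapping at the wider width leaves the original bytes in the high
// part of the result, so shift them back down. A sketch in illustrative
// MIR for s16 G_BSWAP with WideTy = s32 (register names are made up):
//   %ext:_(s32) = G_ANYEXT %src
//   %swp:_(s32) = G_BSWAP %ext
//   %shr:_(s32) = G_LSHR %swp, 16
//   %dst:_(s16) = G_TRUNC %shr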
2735 Observer.changingInstr(MI);
2736 Register DstReg = MI.getOperand(0).getReg();
2738 Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2739 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2740 Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2741 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2743 MI.getOperand(0).setReg(DstExt);
2745 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2747 LLT Ty = MRI.getType(DstReg);
2748 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2749 MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2750 MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2752 MIRBuilder.buildTrunc(DstReg, ShrReg);
2753 Observer.changedInstr(MI);
2754 return Legalized;
2756 case TargetOpcode::G_BITREVERSE: {
2757 Observer.changingInstr(MI);
2759 Register DstReg = MI.getOperand(0).getReg();
2760 LLT Ty = MRI.getType(DstReg);
2761 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2763 Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2764 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2765 MI.getOperand(0).setReg(DstExt);
2766 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2768 auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2769 auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2770 MIRBuilder.buildTrunc(DstReg, Shift);
2771 Observer.changedInstr(MI);
2772 return Legalized;
2774 case TargetOpcode::G_FREEZE:
2775 case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2776 Observer.changingInstr(MI);
2777 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2778 widenScalarDst(MI, WideTy);
2779 Observer.changedInstr(MI);
2780 return Legalized;
2782 case TargetOpcode::G_ABS:
2783 Observer.changingInstr(MI);
2784 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2785 widenScalarDst(MI, WideTy);
2786 Observer.changedInstr(MI);
2787 return Legalized;
2789 case TargetOpcode::G_ADD:
2790 case TargetOpcode::G_AND:
2791 case TargetOpcode::G_MUL:
2792 case TargetOpcode::G_OR:
2793 case TargetOpcode::G_XOR:
2794 case TargetOpcode::G_SUB:
2795 case TargetOpcode::G_SHUFFLE_VECTOR:
2796 // Perform the operation at the larger width (any extension is fine here; high bits
2797 // don't affect the result) and then truncate the result back to the
2798 // original type.
2799 Observer.changingInstr(MI);
2800 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2801 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2802 widenScalarDst(MI, WideTy);
2803 Observer.changedInstr(MI);
2804 return Legalized;
2806 case TargetOpcode::G_SBFX:
2807 case TargetOpcode::G_UBFX:
2808 Observer.changingInstr(MI);
2810 if (TypeIdx == 0) {
2811 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2812 widenScalarDst(MI, WideTy);
2813 } else {
2814 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2815 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2818 Observer.changedInstr(MI);
2819 return Legalized;
2821 case TargetOpcode::G_SHL:
2822 Observer.changingInstr(MI);
2824 if (TypeIdx == 0) {
2825 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2826 widenScalarDst(MI, WideTy);
2827 } else {
2828 assert(TypeIdx == 1);
2829 // The "number of bits to shift" operand must preserve its value as an
2830 // unsigned integer:
2831 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2834 Observer.changedInstr(MI);
2835 return Legalized;
2837 case TargetOpcode::G_ROTR:
2838 case TargetOpcode::G_ROTL:
2839 if (TypeIdx != 1)
2840 return UnableToLegalize;
2842 Observer.changingInstr(MI);
2843 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2844 Observer.changedInstr(MI);
2845 return Legalized;
2847 case TargetOpcode::G_SDIV:
2848 case TargetOpcode::G_SREM:
2849 case TargetOpcode::G_SMIN:
2850 case TargetOpcode::G_SMAX:
2851 Observer.changingInstr(MI);
2852 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2853 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2854 widenScalarDst(MI, WideTy);
2855 Observer.changedInstr(MI);
2856 return Legalized;
2858 case TargetOpcode::G_SDIVREM:
2859 Observer.changingInstr(MI);
2860 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2861 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2862 widenScalarDst(MI, WideTy);
2863 widenScalarDst(MI, WideTy, 1);
2864 Observer.changedInstr(MI);
2865 return Legalized;
2867 case TargetOpcode::G_ASHR:
2868 case TargetOpcode::G_LSHR:
2869 Observer.changingInstr(MI);
2871 if (TypeIdx == 0) {
2872 unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
2873 : TargetOpcode::G_ZEXT;
2875 widenScalarSrc(MI, WideTy, 1, CvtOp);
2876 widenScalarDst(MI, WideTy);
2877 } else {
2878 assert(TypeIdx == 1);
2879 // The "number of bits to shift" operand must preserve its value as an
2880 // unsigned integer:
2881 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2884 Observer.changedInstr(MI);
2885 return Legalized;
2886 case TargetOpcode::G_UDIV:
2887 case TargetOpcode::G_UREM:
2888 Observer.changingInstr(MI);
2889 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2890 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2891 widenScalarDst(MI, WideTy);
2892 Observer.changedInstr(MI);
2893 return Legalized;
2894 case TargetOpcode::G_UDIVREM:
2895 Observer.changingInstr(MI);
2896 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2897 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2898 widenScalarDst(MI, WideTy);
2899 widenScalarDst(MI, WideTy, 1);
2900 Observer.changedInstr(MI);
2901 return Legalized;
2902 case TargetOpcode::G_UMIN:
2903 case TargetOpcode::G_UMAX: {
2904 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2906 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
2907 unsigned ExtOpc =
2908 TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(Ty, Ctx),
2909 getApproximateEVTForLLT(WideTy, Ctx))
2910 ? TargetOpcode::G_SEXT
2911 : TargetOpcode::G_ZEXT;
2913 Observer.changingInstr(MI);
2914 widenScalarSrc(MI, WideTy, 1, ExtOpc);
2915 widenScalarSrc(MI, WideTy, 2, ExtOpc);
2916 widenScalarDst(MI, WideTy);
2917 Observer.changedInstr(MI);
2918 return Legalized;
2921 case TargetOpcode::G_SELECT:
2922 Observer.changingInstr(MI);
2923 if (TypeIdx == 0) {
2924 // Perform operation at larger width (any extension is fine here, high
2925 // bits don't affect the result) and then truncate the result back to the
2926 // original type.
2927 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2928 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2929 widenScalarDst(MI, WideTy);
2930 } else {
2931 bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2932 // Explicit extension is required here since high bits affect the result.
2933 widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2935 Observer.changedInstr(MI);
2936 return Legalized;
2938 case TargetOpcode::G_FPTOSI:
2939 case TargetOpcode::G_FPTOUI:
2940 case TargetOpcode::G_INTRINSIC_LRINT:
2941 case TargetOpcode::G_INTRINSIC_LLRINT:
2942 case TargetOpcode::G_IS_FPCLASS:
2943 Observer.changingInstr(MI);
2945 if (TypeIdx == 0)
2946 widenScalarDst(MI, WideTy);
2947 else
2948 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2950 Observer.changedInstr(MI);
2951 return Legalized;
2952 case TargetOpcode::G_SITOFP:
2953 Observer.changingInstr(MI);
2955 if (TypeIdx == 0)
2956 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2957 else
2958 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2960 Observer.changedInstr(MI);
2961 return Legalized;
2962 case TargetOpcode::G_UITOFP:
2963 Observer.changingInstr(MI);
2965 if (TypeIdx == 0)
2966 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2967 else
2968 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2970 Observer.changedInstr(MI);
2971 return Legalized;
2972 case TargetOpcode::G_FPTOSI_SAT:
2973 case TargetOpcode::G_FPTOUI_SAT:
2974 Observer.changingInstr(MI);
2976 if (TypeIdx == 0) {
2977 Register OldDst = MI.getOperand(0).getReg();
2978 LLT Ty = MRI.getType(OldDst);
2979 Register ExtReg = MRI.createGenericVirtualRegister(WideTy);
2980 Register NewDst;
2981 MI.getOperand(0).setReg(ExtReg);
2982 uint64_t ShortBits = Ty.getScalarSizeInBits();
2983 uint64_t WideBits = WideTy.getScalarSizeInBits();
2984 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2985 if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
2986 // z = i16 fptosi_sat(a)
2987 // ->
2988 // x = i32 fptosi_sat(a)
2989 // y = smin(x, 32767)
2990 // z = smax(y, -32768)
2991 auto MaxVal = MIRBuilder.buildConstant(
2992 WideTy, APInt::getSignedMaxValue(ShortBits).sext(WideBits));
2993 auto MinVal = MIRBuilder.buildConstant(
2994 WideTy, APInt::getSignedMinValue(ShortBits).sext(WideBits));
2995 Register MidReg =
2996 MIRBuilder.buildSMin(WideTy, ExtReg, MaxVal).getReg(0);
2997 NewDst = MIRBuilder.buildSMax(WideTy, MidReg, MinVal).getReg(0);
2998 } else {
2999 // z = i16 fptoui_sat(a)
3000 // ->
3001 // x = i32 fptoui_sat(a)
3002 // y = smin(x, 65535)
3003 auto MaxVal = MIRBuilder.buildConstant(
3004 WideTy, APInt::getAllOnes(ShortBits).zext(WideBits));
3005 NewDst = MIRBuilder.buildUMin(WideTy, ExtReg, MaxVal).getReg(0);
3007 MIRBuilder.buildTrunc(OldDst, NewDst);
3008 } else
3009 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
3011 Observer.changedInstr(MI);
3012 return Legalized;
3013 case TargetOpcode::G_LOAD:
3014 case TargetOpcode::G_SEXTLOAD:
3015 case TargetOpcode::G_ZEXTLOAD:
3016 Observer.changingInstr(MI);
3017 widenScalarDst(MI, WideTy);
3018 Observer.changedInstr(MI);
3019 return Legalized;
3021 case TargetOpcode::G_STORE: {
3022 if (TypeIdx != 0)
3023 return UnableToLegalize;
3025 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3026 assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3027 if (!Ty.isScalar()) {
3028 // We need to widen the vector element type.
3029 Observer.changingInstr(MI);
3030 widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
3031 // We also need to adjust the MMO to turn this into a truncating store.
3032 MachineMemOperand &MMO = **MI.memoperands_begin();
3033 MachineFunction &MF = MIRBuilder.getMF();
3034 auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), Ty);
3035 MI.setMemRefs(MF, {NewMMO});
3036 Observer.changedInstr(MI);
3037 return Legalized;
3040 Observer.changingInstr(MI);
3042 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3043 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3044 widenScalarSrc(MI, WideTy, 0, ExtType);
3046 Observer.changedInstr(MI);
3047 return Legalized;
3049 case TargetOpcode::G_CONSTANT: {
3050 MachineOperand &SrcMO = MI.getOperand(1);
3051 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3052 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3053 MRI.getType(MI.getOperand(0).getReg()));
3054 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3055 ExtOpc == TargetOpcode::G_ANYEXT) &&
3056 "Illegal Extend");
3057 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3058 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3059 ? SrcVal.sext(WideTy.getSizeInBits())
3060 : SrcVal.zext(WideTy.getSizeInBits());
3061 Observer.changingInstr(MI);
3062 SrcMO.setCImm(ConstantInt::get(Ctx, Val));
3064 widenScalarDst(MI, WideTy);
3065 Observer.changedInstr(MI);
3066 return Legalized;
3068 case TargetOpcode::G_FCONSTANT: {
3069 // To avoid changing the bits of the constant due to extension to a larger
3070 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
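// A sketch in illustrative MIR, assuming %d:_(s16) = G_FCONSTANT half 1.0
// is widened with WideTy = s32 (0x3C00 is the IEEE half bit pattern of 1.0):
//   %c:_(s32) = G_CONSTANT i32 15360   ; 0x3C00
//   %d:_(s16) = G_TRUNC %c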
3071 MachineOperand &SrcMO = MI.getOperand(1);
3072 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3073 MIRBuilder.setInstrAndDebugLoc(MI);
3074 auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
3075 widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
3076 MI.eraseFromParent();
3077 return Legalized;
3079 case TargetOpcode::G_IMPLICIT_DEF: {
3080 Observer.changingInstr(MI);
3081 widenScalarDst(MI, WideTy);
3082 Observer.changedInstr(MI);
3083 return Legalized;
3085 case TargetOpcode::G_BRCOND:
3086 Observer.changingInstr(MI);
3087 widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
3088 Observer.changedInstr(MI);
3089 return Legalized;
3091 case TargetOpcode::G_FCMP:
3092 Observer.changingInstr(MI);
3093 if (TypeIdx == 0)
3094 widenScalarDst(MI, WideTy);
3095 else {
3096 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
3097 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
3099 Observer.changedInstr(MI);
3100 return Legalized;
3102 case TargetOpcode::G_ICMP:
3103 Observer.changingInstr(MI);
3104 if (TypeIdx == 0)
3105 widenScalarDst(MI, WideTy);
3106 else {
3107 LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
3108 CmpInst::Predicate Pred =
3109 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3111 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3112 unsigned ExtOpcode =
3113 (CmpInst::isSigned(Pred) ||
3114 TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(SrcTy, Ctx),
3115 getApproximateEVTForLLT(WideTy, Ctx)))
3116 ? TargetOpcode::G_SEXT
3117 : TargetOpcode::G_ZEXT;
3118 widenScalarSrc(MI, WideTy, 2, ExtOpcode);
3119 widenScalarSrc(MI, WideTy, 3, ExtOpcode);
3121 Observer.changedInstr(MI);
3122 return Legalized;
3124 case TargetOpcode::G_PTR_ADD:
3125 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3126 Observer.changingInstr(MI);
3127 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3128 Observer.changedInstr(MI);
3129 return Legalized;
3131 case TargetOpcode::G_PHI: {
3132 assert(TypeIdx == 0 && "Expecting only Idx 0");
3134 Observer.changingInstr(MI);
3135 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3136 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3137 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
3138 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
3141 MachineBasicBlock &MBB = *MI.getParent();
3142 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
3143 widenScalarDst(MI, WideTy);
3144 Observer.changedInstr(MI);
3145 return Legalized;
3147 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3148 if (TypeIdx == 0) {
3149 Register VecReg = MI.getOperand(1).getReg();
3150 LLT VecTy = MRI.getType(VecReg);
3151 Observer.changingInstr(MI);
3153 widenScalarSrc(
3154 MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
3155 TargetOpcode::G_ANYEXT);
3157 widenScalarDst(MI, WideTy, 0);
3158 Observer.changedInstr(MI);
3159 return Legalized;
3162 if (TypeIdx != 2)
3163 return UnableToLegalize;
3164 Observer.changingInstr(MI);
3165 // TODO: Probably should be zext
3166 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3167 Observer.changedInstr(MI);
3168 return Legalized;
3170 case TargetOpcode::G_INSERT_VECTOR_ELT: {
3171 if (TypeIdx == 0) {
3172 Observer.changingInstr(MI);
3173 const LLT WideEltTy = WideTy.getElementType();
3175 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3176 widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
3177 widenScalarDst(MI, WideTy, 0);
3178 Observer.changedInstr(MI);
3179 return Legalized;
3182 if (TypeIdx == 1) {
3183 Observer.changingInstr(MI);
3185 Register VecReg = MI.getOperand(1).getReg();
3186 LLT VecTy = MRI.getType(VecReg);
3187 LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
3189 widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
3190 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
3191 widenScalarDst(MI, WideVecTy, 0);
3192 Observer.changedInstr(MI);
3193 return Legalized;
3196 if (TypeIdx == 2) {
3197 Observer.changingInstr(MI);
3198 // TODO: Probably should be zext
3199 widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
3200 Observer.changedInstr(MI);
3201 return Legalized;
3204 return UnableToLegalize;
3206 case TargetOpcode::G_FADD:
3207 case TargetOpcode::G_FMUL:
3208 case TargetOpcode::G_FSUB:
3209 case TargetOpcode::G_FMA:
3210 case TargetOpcode::G_FMAD:
3211 case TargetOpcode::G_FNEG:
3212 case TargetOpcode::G_FABS:
3213 case TargetOpcode::G_FCANONICALIZE:
3214 case TargetOpcode::G_FMINNUM:
3215 case TargetOpcode::G_FMAXNUM:
3216 case TargetOpcode::G_FMINNUM_IEEE:
3217 case TargetOpcode::G_FMAXNUM_IEEE:
3218 case TargetOpcode::G_FMINIMUM:
3219 case TargetOpcode::G_FMAXIMUM:
3220 case TargetOpcode::G_FDIV:
3221 case TargetOpcode::G_FREM:
3222 case TargetOpcode::G_FCEIL:
3223 case TargetOpcode::G_FFLOOR:
3224 case TargetOpcode::G_FCOS:
3225 case TargetOpcode::G_FSIN:
3226 case TargetOpcode::G_FTAN:
3227 case TargetOpcode::G_FACOS:
3228 case TargetOpcode::G_FASIN:
3229 case TargetOpcode::G_FATAN:
3230 case TargetOpcode::G_FATAN2:
3231 case TargetOpcode::G_FCOSH:
3232 case TargetOpcode::G_FSINH:
3233 case TargetOpcode::G_FTANH:
3234 case TargetOpcode::G_FLOG10:
3235 case TargetOpcode::G_FLOG:
3236 case TargetOpcode::G_FLOG2:
3237 case TargetOpcode::G_FRINT:
3238 case TargetOpcode::G_FNEARBYINT:
3239 case TargetOpcode::G_FSQRT:
3240 case TargetOpcode::G_FEXP:
3241 case TargetOpcode::G_FEXP2:
3242 case TargetOpcode::G_FEXP10:
3243 case TargetOpcode::G_FPOW:
3244 case TargetOpcode::G_INTRINSIC_TRUNC:
3245 case TargetOpcode::G_INTRINSIC_ROUND:
3246 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3247 assert(TypeIdx == 0);
3248 Observer.changingInstr(MI);
3250 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3251 widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
3253 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3254 Observer.changedInstr(MI);
3255 return Legalized;
3256 case TargetOpcode::G_FPOWI:
3257 case TargetOpcode::G_FLDEXP:
3258 case TargetOpcode::G_STRICT_FLDEXP: {
3259 if (TypeIdx == 0) {
3260 if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3261 return UnableToLegalize;
3263 Observer.changingInstr(MI);
3264 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
3265 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3266 Observer.changedInstr(MI);
3267 return Legalized;
3270 if (TypeIdx == 1) {
3271 // For some reason SelectionDAG tries to promote to a libcall without
3272 // actually changing the integer type for promotion.
3273 Observer.changingInstr(MI);
3274 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3275 Observer.changedInstr(MI);
3276 return Legalized;
3279 return UnableToLegalize;
3281 case TargetOpcode::G_FFREXP: {
3282 Observer.changingInstr(MI);
3284 if (TypeIdx == 0) {
3285 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
3286 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3287 } else {
3288 widenScalarDst(MI, WideTy, 1);
3291 Observer.changedInstr(MI);
3292 return Legalized;
3294 case TargetOpcode::G_INTTOPTR:
3295 if (TypeIdx != 1)
3296 return UnableToLegalize;
3298 Observer.changingInstr(MI);
3299 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
3300 Observer.changedInstr(MI);
3301 return Legalized;
3302 case TargetOpcode::G_PTRTOINT:
3303 if (TypeIdx != 0)
3304 return UnableToLegalize;
3306 Observer.changingInstr(MI);
3307 widenScalarDst(MI, WideTy, 0);
3308 Observer.changedInstr(MI);
3309 return Legalized;
3310 case TargetOpcode::G_BUILD_VECTOR: {
3311 Observer.changingInstr(MI);
3313 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3314 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3315 widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
3317 // Avoid changing the result vector type if the source element type was
3318 // requested.
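// A sketch for TypeIdx == 1, with illustrative types (WideTy = s32):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %a.ext:_(s32) = G_ANYEXT %a
//   %b.ext:_(s32) = G_ANYEXT %b
//   %v:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %a.ext, %b.ext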
3319 if (TypeIdx == 1) {
3320 MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
3321 } else {
3322 widenScalarDst(MI, WideTy, 0);
3325 Observer.changedInstr(MI);
3326 return Legalized;
3328 case TargetOpcode::G_SEXT_INREG:
3329 if (TypeIdx != 0)
3330 return UnableToLegalize;
3332 Observer.changingInstr(MI);
3333 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3334 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
3335 Observer.changedInstr(MI);
3336 return Legalized;
3337 case TargetOpcode::G_PTRMASK: {
3338 if (TypeIdx != 1)
3339 return UnableToLegalize;
3340 Observer.changingInstr(MI);
3341 widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
3342 Observer.changedInstr(MI);
3343 return Legalized;
3345 case TargetOpcode::G_VECREDUCE_ADD: {
3346 if (TypeIdx != 1)
3347 return UnableToLegalize;
3348 Observer.changingInstr(MI);
3349 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3350 widenScalarDst(MI, WideTy.getScalarType(), 0, TargetOpcode::G_TRUNC);
3351 Observer.changedInstr(MI);
3352 return Legalized;
3354 case TargetOpcode::G_VECREDUCE_FADD:
3355 case TargetOpcode::G_VECREDUCE_FMUL:
3356 case TargetOpcode::G_VECREDUCE_FMIN:
3357 case TargetOpcode::G_VECREDUCE_FMAX:
3358 case TargetOpcode::G_VECREDUCE_FMINIMUM:
3359 case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3360 if (TypeIdx != 0)
3361 return UnableToLegalize;
3362 Observer.changingInstr(MI);
3363 Register VecReg = MI.getOperand(1).getReg();
3364 LLT VecTy = MRI.getType(VecReg);
3365 LLT WideVecTy = VecTy.isVector()
3366 ? LLT::vector(VecTy.getElementCount(), WideTy)
3367 : WideTy;
3368 widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
3369 widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3370 Observer.changedInstr(MI);
3371 return Legalized;
3373 case TargetOpcode::G_VSCALE: {
3374 MachineOperand &SrcMO = MI.getOperand(1);
3375 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3376 const APInt &SrcVal = SrcMO.getCImm()->getValue();
3377 // The CImm is always a signed value; sign-extend it to the wide type.
3378 const APInt Val = SrcVal.sext(WideTy.getSizeInBits());
3379 Observer.changingInstr(MI);
3380 SrcMO.setCImm(ConstantInt::get(Ctx, Val));
3381 widenScalarDst(MI, WideTy);
3382 Observer.changedInstr(MI);
3383 return Legalized;
3385 case TargetOpcode::G_SPLAT_VECTOR: {
3386 if (TypeIdx != 1)
3387 return UnableToLegalize;
3389 Observer.changingInstr(MI);
3390 widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3391 Observer.changedInstr(MI);
3392 return Legalized;
3394 case TargetOpcode::G_INSERT_SUBVECTOR: {
3395 if (TypeIdx != 0)
3396 return UnableToLegalize;
3398 GInsertSubvector &IS = cast<GInsertSubvector>(MI);
3399 Register BigVec = IS.getBigVec();
3400 Register SubVec = IS.getSubVec();
3402 LLT SubVecTy = MRI.getType(SubVec);
3403 LLT SubVecWideTy = SubVecTy.changeElementType(WideTy.getElementType());
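// A sketch with illustrative types (WideTy = <vscale x 4 x s8>):
//   %r:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %big, %sub, N
// becomes
//   %big.z:_(<vscale x 4 x s8>) = G_ZEXT %big
//   %sub.z:_(<vscale x 1 x s8>) = G_ZEXT %sub
//   %ins:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR %big.z, %sub.z, N
//   %r:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), %ins, %zero_splat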
3405 // Widen the G_INSERT_SUBVECTOR
3406 auto BigZExt = MIRBuilder.buildZExt(WideTy, BigVec);
3407 auto SubZExt = MIRBuilder.buildZExt(SubVecWideTy, SubVec);
3408 auto WideInsert = MIRBuilder.buildInsertSubvector(WideTy, BigZExt, SubZExt,
3409 IS.getIndexImm());
3411 // Truncate back down
3412 auto SplatZero = MIRBuilder.buildSplatVector(
3413 WideTy, MIRBuilder.buildConstant(WideTy.getElementType(), 0));
3414 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, IS.getReg(0), WideInsert,
3415 SplatZero);
3417 MI.eraseFromParent();
3419 return Legalized;
3424 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3425 MachineIRBuilder &B, Register Src, LLT Ty) {
3426 auto Unmerge = B.buildUnmerge(Ty, Src);
3427 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3428 Pieces.push_back(Unmerge.getReg(I));
3431 static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3432 MachineIRBuilder &MIRBuilder) {
3433 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3434 MachineFunction &MF = MIRBuilder.getMF();
3435 const DataLayout &DL = MIRBuilder.getDataLayout();
3436 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3437 LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3438 LLT DstLLT = MRI.getType(DstReg);
3440 Align Alignment(DL.getABITypeAlign(ConstVal->getType()));
3442 auto Addr = MIRBuilder.buildConstantPool(
3443 AddrPtrTy,
3444 MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
3446 MachineMemOperand *MMO =
3447 MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
3448 MachineMemOperand::MOLoad, DstLLT, Alignment);
3450 MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO);
3453 LegalizerHelper::LegalizeResult
3454 LegalizerHelper::lowerConstant(MachineInstr &MI) {
3455 const MachineOperand &ConstOperand = MI.getOperand(1);
3456 const Constant *ConstantVal = ConstOperand.getCImm();
3458 emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3459 MI.eraseFromParent();
3461 return Legalized;
3464 LegalizerHelper::LegalizeResult
3465 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3466 const MachineOperand &ConstOperand = MI.getOperand(1);
3467 const Constant *ConstantVal = ConstOperand.getFPImm();
3469 emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3470 MI.eraseFromParent();
3472 return Legalized;
3475 LegalizerHelper::LegalizeResult
3476 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3477 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3478 if (SrcTy.isVector()) {
3479 LLT SrcEltTy = SrcTy.getElementType();
3480 SmallVector<Register, 8> SrcRegs;
3482 if (DstTy.isVector()) {
3483 int NumDstElt = DstTy.getNumElements();
3484 int NumSrcElt = SrcTy.getNumElements();
3486 LLT DstEltTy = DstTy.getElementType();
3487 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3488 LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3490 // If there's an element size mismatch, insert intermediate casts to match
3491 // the result element type.
3492 if (NumSrcElt < NumDstElt) { // Source element type is larger.
3493 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3495 // =>
3497 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3498 // %4:_(<2 x s8>) = G_BITCAST %2
3499 // %5:_(<2 x s8>) = G_BITCAST %3
3500 // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
3501 DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
3502 SrcPartTy = SrcEltTy;
3503 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3505 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3507 // =>
3509 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3510 // %4:_(s16) = G_BITCAST %2
3511 // %5:_(s16) = G_BITCAST %3
3512 // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
3513 SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
3514 DstCastTy = DstEltTy;
3517 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
3518 for (Register &SrcReg : SrcRegs)
3519 SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
3520 } else
3521 getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
3523 MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3524 MI.eraseFromParent();
3525 return Legalized;
3528 if (DstTy.isVector()) {
3529 SmallVector<Register, 8> SrcRegs;
3530 getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
3531 MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3532 MI.eraseFromParent();
3533 return Legalized;
3536 return UnableToLegalize;
3539 /// Figure out the bit offset into a register when coercing a vector index for
3540 /// the wide element type. This is only for the case when promoting a vector
3541 /// to one with larger elements.
3544 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3545 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3546 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3547 Register Idx,
3548 unsigned NewEltSize,
3549 unsigned OldEltSize) {
3550 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3551 LLT IdxTy = B.getMRI()->getType(Idx);
3553 // Now figure out the amount we need to shift to get the target bits.
3554 auto OffsetMask = B.buildConstant(
3555 IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3556 auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3557 return B.buildShl(IdxTy, OffsetIdx,
3558 B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3561 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3562 /// is casting to a vector with a smaller element size, perform multiple element
3563 /// extracts and merge the results. If this is coercing to a vector with larger
3564 /// elements, index the bitcasted vector and extract the target element with bit
3565 /// operations. This is intended to force the indexing in the native register
3566 /// size for architectures that can dynamically index the register file.
3567 LegalizerHelper::LegalizeResult
3568 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3569 LLT CastTy) {
3570 if (TypeIdx != 1)
3571 return UnableToLegalize;
3573 auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3575 LLT SrcEltTy = SrcVecTy.getElementType();
3576 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3577 unsigned OldNumElts = SrcVecTy.getNumElements();
3579 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3580 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3582 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3583 const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3584 if (NewNumElts > OldNumElts) {
3585 // Decreasing the vector element size
3587 // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3588 // =>
3589 // v4i32:castx = bitcast x:v2i64
3591 // i64 = bitcast
3592 //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3593 //                       (i32 (extract_vector_elt castx, (2 * y + 1))))
3595 if (NewNumElts % OldNumElts != 0)
3596 return UnableToLegalize;
3598 // Type of the intermediate result vector.
3599 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3600 LLT MidTy =
3601 LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
3603 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
3605 SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3606 auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
3608 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3609 auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
3610 auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
3611 auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
3612 NewOps[I] = Elt.getReg(0);
3615 auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
3616 MIRBuilder.buildBitcast(Dst, NewVec);
3617 MI.eraseFromParent();
3618 return Legalized;
3621 if (NewNumElts < OldNumElts) {
3622 if (NewEltSize % OldEltSize != 0)
3623 return UnableToLegalize;
3625 // This only depends on powers of 2 because we use bit tricks to figure out
3626 // the bit offset we need to shift to get the target element. A general
3627 // expansion could emit division/multiply.
3628 if (!isPowerOf2_32(NewEltSize / OldEltSize))
3629 return UnableToLegalize;
3631 // Increasing the vector element size.
3632 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3634 // =>
3636 // %cast = G_BITCAST %vec
3637 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3638 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3639 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3640 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3641 // %elt_bits = G_LSHR %wide_elt, %offset_bits
3642 // %elt = G_TRUNC %elt_bits
3644 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3645 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3647 // Divide to get the index in the wider element type.
3648 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3650 Register WideElt = CastVec;
3651 if (CastTy.isVector()) {
3652 WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3653 ScaledIdx).getReg(0);
3656 // Compute the bit offset into the register of the target element.
3657 Register OffsetBits = getBitcastWiderVectorElementOffset(
3658 MIRBuilder, Idx, NewEltSize, OldEltSize);
3660 // Shift the wide element to get the target element.
3661 auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
3662 MIRBuilder.buildTrunc(Dst, ExtractedBits);
3663 MI.eraseFromParent();
3664 return Legalized;
3667 return UnableToLegalize;
3670 /// Emit code to insert \p InsertReg into \p TargetReg at bit offset
3671 /// \p OffsetBits, while preserving the other bits in \p TargetReg.
3673 /// (ZExt(InsertReg) << OffsetBits) | (TargetReg & ~(LowBitsMask(InsertReg.size()) << OffsetBits))
3674 static Register buildBitFieldInsert(MachineIRBuilder &B,
3675 Register TargetReg, Register InsertReg,
3676 Register OffsetBits) {
3677 LLT TargetTy = B.getMRI()->getType(TargetReg);
3678 LLT InsertTy = B.getMRI()->getType(InsertReg);
3679 auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3680 auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3682 // Produce a bitmask of the value to insert
3683 auto EltMask = B.buildConstant(
3684 TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3685 InsertTy.getSizeInBits()));
3686 // Shift it into position
3687 auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3688 auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3690 // Clear out the bits in the wide element
3691 auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3693 // The value to insert has all zeros already, so stick it into the masked
3694 // wide element.
3695 return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3698 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3699 /// is increasing the element size, perform the indexing in the target element
3700 /// type, and use bit operations to insert at the element position. This is
3701 /// intended for architectures that can dynamically index the register file and
3702 /// want to force indexing in the native register size.
3703 LegalizerHelper::LegalizeResult
3704 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3705 LLT CastTy) {
3706 if (TypeIdx != 0)
3707 return UnableToLegalize;
3709 auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3710 MI.getFirst4RegLLTs();
3711 LLT VecTy = DstTy;
3713 LLT VecEltTy = VecTy.getElementType();
3714 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3715 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3716 const unsigned OldEltSize = VecEltTy.getSizeInBits();
3718 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3719 unsigned OldNumElts = VecTy.getNumElements();
3721 Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3722 if (NewNumElts < OldNumElts) {
3723 if (NewEltSize % OldEltSize != 0)
3724 return UnableToLegalize;
3726 // This only depends on powers of 2 because we use bit tricks to figure out
3727 // the bit offset we need to shift to get the target element. A general
3728 // expansion could emit division/multiply.
3729 if (!isPowerOf2_32(NewEltSize / OldEltSize))
3730 return UnableToLegalize;
3732 const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3733 auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3735 // Divide to get the index in the wider element type.
3736 auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3738 Register ExtractedElt = CastVec;
3739 if (CastTy.isVector()) {
3740 ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3741 ScaledIdx).getReg(0);
3744 // Compute the bit offset into the register of the target element.
3745 Register OffsetBits = getBitcastWiderVectorElementOffset(
3746 MIRBuilder, Idx, NewEltSize, OldEltSize);
3748 Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
3749 Val, OffsetBits);
3750 if (CastTy.isVector()) {
3751 InsertedElt = MIRBuilder.buildInsertVectorElement(
3752 CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
3755 MIRBuilder.buildBitcast(Dst, InsertedElt);
3756 MI.eraseFromParent();
3757 return Legalized;
3760 return UnableToLegalize;
3763 // This attempts to handle G_CONCAT_VECTORS with illegal operands,
3764 // particularly those whose operands are smaller than a legal vector type.
3766 // <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3768 // ===>
3770 // s32 = G_BITCAST <4 x s8>
3771 // s32 = G_BITCAST <4 x s8>
3772 // s32 = G_BITCAST <4 x s8>
3773 // s32 = G_BITCAST <4 x s8>
3774 // <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3775 // <16 x s8> = G_BITCAST <4 x s32>
3776 LegalizerHelper::LegalizeResult
3777 LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3778 LLT CastTy) {
3779 // This path only applies to G_CONCAT_VECTORS; bail out otherwise.
3780 auto ConcatMI = dyn_cast<GConcatVectors>(&MI);
3781 if (!ConcatMI) {
3782 return UnableToLegalize;
3785 // Compute the scalar type that each source vector will be bitcast to.
3786 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3787 LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits());
3789 // Check if the build vector is Legal
3790 if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3791 return UnableToLegalize;
3794 // Bitcast the sources
3795 SmallVector<Register> BitcastRegs;
3796 for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3797 BitcastRegs.push_back(
3798 MIRBuilder.buildBitcast(SrcScalTy, ConcatMI->getSourceReg(i))
3799 .getReg(0));
3802 // Build the scalar values into a vector
3803 Register BuildReg =
3804 MIRBuilder.buildBuildVector(CastTy, BitcastRegs).getReg(0);
3805 MIRBuilder.buildBitcast(DstReg, BuildReg);
3807 MI.eraseFromParent();
3808 return Legalized;
3811 // This bitcasts a G_SHUFFLE_VECTOR to a different type, currently one with
3812 // the same element size. Mostly used to legalize ptr vectors, where
3813 // ptrtoint/inttoptr will be used instead.
3815 // <16 x p0> = G_SHUFFLE_VECTOR <4 x p0>, <4 x p0>, mask
3816 // ===>
3817 // <4 x s64> = G_PTRTOINT <4 x p0>
3818 // <4 x s64> = G_PTRTOINT <4 x p0>
3819 // <16 x s64> = G_SHUFFLE_VECTOR <4 x s64>, <4 x s64>, mask
3820 // <16 x p0> = G_INTTOPTR <16 x s64>
3821 LegalizerHelper::LegalizeResult
3822 LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3823 LLT CastTy) {
3824 auto ShuffleMI = cast<GShuffleVector>(&MI);
3825 LLT DstTy = MRI.getType(ShuffleMI->getReg(0));
3826 LLT SrcTy = MRI.getType(ShuffleMI->getReg(1));
3828 // We currently only handle vectors of the same size.
3829 if (TypeIdx != 0 ||
3830 CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
3831 CastTy.getElementCount() != DstTy.getElementCount())
3832 return UnableToLegalize;
3834 LLT NewSrcTy = SrcTy.changeElementType(CastTy.getScalarType());
3836 auto Inp1 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(1));
3837 auto Inp2 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(2));
3838 auto Shuf =
3839 MIRBuilder.buildShuffleVector(CastTy, Inp1, Inp2, ShuffleMI->getMask());
3840 MIRBuilder.buildCast(ShuffleMI->getReg(0), Shuf);
3842 MI.eraseFromParent();
3843 return Legalized;
3846 /// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
3848 /// <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
3850 /// ===>
3852 /// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
3853 /// <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i8>, N / 8
3854 /// <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
3855 LegalizerHelper::LegalizeResult
3856 LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
3857 LLT CastTy) {
3858 auto ES = cast<GExtractSubvector>(&MI);
3860 if (!CastTy.isVector())
3861 return UnableToLegalize;
3863 if (TypeIdx != 0)
3864 return UnableToLegalize;
3866 Register Dst = ES->getReg(0);
3867 Register Src = ES->getSrcVec();
3868 uint64_t Idx = ES->getIndexImm();
3870 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3872 LLT DstTy = MRI.getType(Dst);
3873 LLT SrcTy = MRI.getType(Src);
3874 ElementCount DstTyEC = DstTy.getElementCount();
3875 ElementCount SrcTyEC = SrcTy.getElementCount();
3876 auto DstTyMinElts = DstTyEC.getKnownMinValue();
3877 auto SrcTyMinElts = SrcTyEC.getKnownMinValue();
3879 if (DstTy == CastTy)
3880 return Legalized;
3882 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
3883 return UnableToLegalize;
3885 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
3886 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
3887 if (CastEltSize < DstEltSize)
3888 return UnableToLegalize;
3890 auto AdjustAmt = CastEltSize / DstEltSize;
3891 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
3892 SrcTyMinElts % AdjustAmt != 0)
3893 return UnableToLegalize;
3895 Idx /= AdjustAmt;
3896 SrcTy = LLT::vector(SrcTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
3897 auto CastVec = MIRBuilder.buildBitcast(SrcTy, Src);
3898 auto PromotedES = MIRBuilder.buildExtractSubvector(CastTy, CastVec, Idx);
3899 MIRBuilder.buildBitcast(Dst, PromotedES);
3901 ES->eraseFromParent();
3902 return Legalized;
3905 /// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
3907 /// <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
3908 /// <vscale x 8 x i1>,
3909 /// N
3911 /// ===>
3913 /// <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
3914 /// <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
3915 /// <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
3916 /// <vscale x 1 x i8>, N / 8
3917 /// <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
3918 LegalizerHelper::LegalizeResult
3919 LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
3920 LLT CastTy) {
3921 auto ES = cast<GInsertSubvector>(&MI);
3923 if (!CastTy.isVector())
3924 return UnableToLegalize;
3926 if (TypeIdx != 0)
3927 return UnableToLegalize;
3929 Register Dst = ES->getReg(0);
3930 Register BigVec = ES->getBigVec();
3931 Register SubVec = ES->getSubVec();
3932 uint64_t Idx = ES->getIndexImm();
3934 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3936 LLT DstTy = MRI.getType(Dst);
3937 LLT BigVecTy = MRI.getType(BigVec);
3938 LLT SubVecTy = MRI.getType(SubVec);
3940 if (DstTy == CastTy)
3941 return Legalized;
3943 if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
3944 return UnableToLegalize;
3946 ElementCount DstTyEC = DstTy.getElementCount();
3947 ElementCount BigVecTyEC = BigVecTy.getElementCount();
3948 ElementCount SubVecTyEC = SubVecTy.getElementCount();
3949 auto DstTyMinElts = DstTyEC.getKnownMinValue();
3950 auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
3951 auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();
3953 unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
3954 unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
3955 if (CastEltSize < DstEltSize)
3956 return UnableToLegalize;
3958 auto AdjustAmt = CastEltSize / DstEltSize;
3959 if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
3960 BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
3961 return UnableToLegalize;
3963 Idx /= AdjustAmt;
3964 BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
3965 SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
3966 auto CastBigVec = MIRBuilder.buildBitcast(BigVecTy, BigVec);
3967 auto CastSubVec = MIRBuilder.buildBitcast(SubVecTy, SubVec);
3968 auto PromotedIS =
3969 MIRBuilder.buildInsertSubvector(CastTy, CastBigVec, CastSubVec, Idx);
3970 MIRBuilder.buildBitcast(Dst, PromotedIS);
3972 ES->eraseFromParent();
3973 return Legalized;
3976 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
3977 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
3978 Register DstReg = LoadMI.getDstReg();
3979 Register PtrReg = LoadMI.getPointerReg();
3980 LLT DstTy = MRI.getType(DstReg);
3981 MachineMemOperand &MMO = LoadMI.getMMO();
3982 LLT MemTy = MMO.getMemoryType();
3983 MachineFunction &MF = MIRBuilder.getMF();
3985 unsigned MemSizeInBits = MemTy.getSizeInBits();
3986 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
3988 if (MemSizeInBits != MemStoreSizeInBits) {
3989 if (MemTy.isVector())
3990 return UnableToLegalize;
3992 // Promote to a byte-sized load if not loading an integral number of
3993 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
3994 LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
3995 MachineMemOperand *NewMMO =
3996 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
3998 Register LoadReg = DstReg;
3999 LLT LoadTy = DstTy;
4001 // If this wasn't already an extending load, we need to widen the result
4002 // register to avoid creating a load with a narrower result than the source.
4003 if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
4004 LoadTy = WideMemTy;
4005 LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
4008 if (isa<GSExtLoad>(LoadMI)) {
4009 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
4010 MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
4011 } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
4012 auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
4013 // The extra bits are guaranteed to be zero, since we stored them that
4014 // way. A zext load from Wide thus automatically gives zext from MemVT.
4015 MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
4016 } else {
4017 MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
4020 if (DstTy != LoadTy)
4021 MIRBuilder.buildTrunc(DstReg, LoadReg);
4023 LoadMI.eraseFromParent();
4024 return Legalized;
4027 // Big endian lowering not implemented.
4028 if (MIRBuilder.getDataLayout().isBigEndian())
4029 return UnableToLegalize;
4031 // This load needs splitting into power of 2 sized loads.
4033 // Our strategy here is to generate anyextending loads for the smaller
4034 // pieces at the next power-of-2 result type, then combine the two partial
4035 // results together, before truncating back down to the non-pow-2
4036 // type.
4037 // E.g. v1 = i24 load =>
4038 // v2 = i32 zextload (2 byte)
4039 // v3 = i32 load (1 byte)
4040 // v4 = i32 shl v3, 16
4041 // v5 = i32 or v4, v2
4042 // v1 = i24 trunc v5
4043 // By doing this we generate the correct truncate which should get
4044 // combined away as an artifact with a matching extend.
4046 uint64_t LargeSplitSize, SmallSplitSize;
4048 if (!isPowerOf2_32(MemSizeInBits)) {
4049 // This load needs splitting into power of 2 sized loads.
4050 LargeSplitSize = llvm::bit_floor(MemSizeInBits);
4051 SmallSplitSize = MemSizeInBits - LargeSplitSize;
4052 } else {
4053 // This is already a power of 2, but we still need to split this in half.
4055 // Assume we're being asked to decompose an unaligned load.
4056 // TODO: If this requires multiple splits, handle them all at once.
4057 auto &Ctx = MF.getFunction().getContext();
4058 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
4059 return UnableToLegalize;
4061 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4064 if (MemTy.isVector()) {
4065 // TODO: Handle vector extloads
4066 if (MemTy != DstTy)
4067 return UnableToLegalize;
4069 // TODO: We can do better than scalarizing the vector and at least split it
4070 // in half.
4071 return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
4074 MachineMemOperand *LargeMMO =
4075 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
4076 MachineMemOperand *SmallMMO =
4077 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
4079 LLT PtrTy = MRI.getType(PtrReg);
4080 unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
4081 LLT AnyExtTy = LLT::scalar(AnyExtSize);
4082 auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
4083 PtrReg, *LargeMMO);
4085 auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
4086 LargeSplitSize / 8);
4087 Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
4088 auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
4089 auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
4090 SmallPtr, *SmallMMO);
4092 auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
4093 auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
4095 if (AnyExtTy == DstTy)
4096 MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
4097 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
4098 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
4099 MIRBuilder.buildTrunc(DstReg, {Or});
4100 } else {
4101 assert(DstTy.isPointer() && "expected pointer");
4102 auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
4104 // FIXME: We currently consider this to be illegal for non-integral address
4105 // spaces, but we still need a way to reinterpret the bits.
4106 MIRBuilder.buildIntToPtr(DstReg, Or);
4109 LoadMI.eraseFromParent();
4110 return Legalized;
4113 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
4114 // Lower a non-power of 2 store into multiple pow-2 stores.
4115 // E.g. split an i24 store into an i16 store + i8 store.
4116 // We do this by first extending the stored value to the next largest power
4117 // of 2 type, and then using truncating stores to store the components.
4118 // By doing this, likewise with G_LOAD, generate an extend that can be
4119 // artifact-combined away instead of leaving behind extracts.
4120 Register SrcReg = StoreMI.getValueReg();
4121 Register PtrReg = StoreMI.getPointerReg();
4122 LLT SrcTy = MRI.getType(SrcReg);
4123 MachineFunction &MF = MIRBuilder.getMF();
4124 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4125 LLT MemTy = MMO.getMemoryType();
4127 unsigned StoreWidth = MemTy.getSizeInBits();
4128 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
4130 if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
4131 // Promote to a byte-sized store with upper bits zero if not
4132 // storing an integral number of bytes. For example, promote
4133 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
4134 LLT WideTy = LLT::scalar(StoreSizeInBits);
4136 if (StoreSizeInBits > SrcTy.getSizeInBits()) {
4137 // Avoid creating a store with a narrower source than result.
4138 SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
4139 SrcTy = WideTy;
4142 auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
4144 MachineMemOperand *NewMMO =
4145 MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
4146 MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
4147 StoreMI.eraseFromParent();
4148 return Legalized;
4151 if (MemTy.isVector()) {
4152 if (MemTy != SrcTy)
4153 return scalarizeVectorBooleanStore(StoreMI);
4155 // TODO: We can do better than scalarizing the vector and at least split it
4156 // in half.
4157 return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
4160 unsigned MemSizeInBits = MemTy.getSizeInBits();
4161 uint64_t LargeSplitSize, SmallSplitSize;
4163 if (!isPowerOf2_32(MemSizeInBits)) {
4164 LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
4165 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
4166 } else {
4167 auto &Ctx = MF.getFunction().getContext();
4168 if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
4169 return UnableToLegalize; // Don't know what we're being asked to do.
4171 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
4174 // Extend to the next pow-2. If this store was itself the result of lowering,
4175 // e.g. an s56 store being broken into s32 + s24, we might have a stored type
4176 // that's wider than the stored size.
4177 unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
4178 const LLT NewSrcTy = LLT::scalar(AnyExtSize);
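// Shift operations aren't defined on pointer types, so reinterpret a pointer
// source as an integer of the same size before splitting it.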
4180 if (SrcTy.isPointer()) {
4181 const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
4182 SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
4185 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
4187 // Obtain the smaller value by shifting away the larger value.
4188 auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
4189 auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
4191 // Generate the PtrAdd and truncating stores.
4192 LLT PtrTy = MRI.getType(PtrReg);
4193 auto OffsetCst = MIRBuilder.buildConstant(
4194 LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
4195 auto SmallPtr =
4196 MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
4198 MachineMemOperand *LargeMMO =
4199 MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
4200 MachineMemOperand *SmallMMO =
4201 MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
4202 MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
4203 MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
4204 StoreMI.eraseFromParent();
4205 return Legalized;
4208 LegalizerHelper::LegalizeResult
4209 LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
4210 Register SrcReg = StoreMI.getValueReg();
4211 Register PtrReg = StoreMI.getPointerReg();
4212 LLT SrcTy = MRI.getType(SrcReg);
4213 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
4214 LLT MemTy = MMO.getMemoryType();
4215 LLT MemScalarTy = MemTy.getElementType();
4216 MachineFunction &MF = MIRBuilder.getMF();
4218 assert(SrcTy.isVector() && "Expect a vector store type");
4220 if (!MemScalarTy.isByteSized()) {
4221 // We need to build an integer scalar of the vector bit pattern.
4222 // It's not legal for us to add padding when storing a vector.
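// A sketch for a <8 x s1> store (the eight booleans are packed into an s8):
// for each element i, truncate it to s1, zero-extend to s8, shift it to bit
// position i (reversed on big-endian targets), and OR it into an
// accumulator; the accumulated s8 is then stored in one go.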
4223 unsigned NumBits = MemTy.getSizeInBits();
4224 LLT IntTy = LLT::scalar(NumBits);
4225 auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
4226 LLT IdxTy = getLLTForMVT(TLI.getVectorIdxTy(MF.getDataLayout()));
4228 for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
4229 auto Elt = MIRBuilder.buildExtractVectorElement(
4230 SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
4231 auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
4232 auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
4233 unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
4234 ? (MemTy.getNumElements() - 1) - I
4235 : I;
4236 auto ShiftAmt = MIRBuilder.buildConstant(
4237 IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
4238 auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
4239 CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
4241 auto PtrInfo = MMO.getPointerInfo();
4242 auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
4243 MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
4244 StoreMI.eraseFromParent();
4245 return Legalized;
4248 // TODO: implement simple scalarization.
4249 return UnableToLegalize;
4252 LegalizerHelper::LegalizeResult
4253 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
4254 switch (MI.getOpcode()) {
4255 case TargetOpcode::G_LOAD: {
4256 if (TypeIdx != 0)
4257 return UnableToLegalize;
4258 MachineMemOperand &MMO = **MI.memoperands_begin();
4260 // Not sure how to interpret a bitcast of an extending load.
4261 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4262 return UnableToLegalize;
4264 Observer.changingInstr(MI);
4265 bitcastDst(MI, CastTy, 0);
4266 MMO.setType(CastTy);
4267 // The range metadata is no longer valid when reinterpreted as a different
4268 // type.
4269 MMO.clearRanges();
4270 Observer.changedInstr(MI);
4271 return Legalized;
4273 case TargetOpcode::G_STORE: {
4274 if (TypeIdx != 0)
4275 return UnableToLegalize;
4277 MachineMemOperand &MMO = **MI.memoperands_begin();
4279 // Not sure how to interpret a bitcast of a truncating store.
4280 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
4281 return UnableToLegalize;
4283 Observer.changingInstr(MI);
4284 bitcastSrc(MI, CastTy, 0);
4285 MMO.setType(CastTy);
4286 Observer.changedInstr(MI);
4287 return Legalized;
4289 case TargetOpcode::G_SELECT: {
4290 if (TypeIdx != 0)
4291 return UnableToLegalize;
4293 if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
4294 LLVM_DEBUG(
4295 dbgs() << "bitcast action not implemented for vector select\n");
4296 return UnableToLegalize;
4299 Observer.changingInstr(MI);
4300 bitcastSrc(MI, CastTy, 2);
4301 bitcastSrc(MI, CastTy, 3);
4302 bitcastDst(MI, CastTy, 0);
4303 Observer.changedInstr(MI);
4304 return Legalized;
4306 case TargetOpcode::G_AND:
4307 case TargetOpcode::G_OR:
4308 case TargetOpcode::G_XOR: {
4309 Observer.changingInstr(MI);
4310 bitcastSrc(MI, CastTy, 1);
4311 bitcastSrc(MI, CastTy, 2);
4312 bitcastDst(MI, CastTy, 0);
4313 Observer.changedInstr(MI);
4314 return Legalized;
4316 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4317 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
4318 case TargetOpcode::G_INSERT_VECTOR_ELT:
4319 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
4320 case TargetOpcode::G_CONCAT_VECTORS:
4321 return bitcastConcatVector(MI, TypeIdx, CastTy);
4322 case TargetOpcode::G_SHUFFLE_VECTOR:
4323 return bitcastShuffleVector(MI, TypeIdx, CastTy);
4324 case TargetOpcode::G_EXTRACT_SUBVECTOR:
4325 return bitcastExtractSubvector(MI, TypeIdx, CastTy);
4326 case TargetOpcode::G_INSERT_SUBVECTOR:
4327 return bitcastInsertSubvector(MI, TypeIdx, CastTy);
4328 default:
4329 return UnableToLegalize;
4333 // Legalize an instruction by changing the opcode in place.
4334 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4335 Observer.changingInstr(MI);
4336 MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
4337 Observer.changedInstr(MI);
4340 LegalizerHelper::LegalizeResult
4341 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
4342 using namespace TargetOpcode;
4344 switch(MI.getOpcode()) {
4345 default:
4346 return UnableToLegalize;
4347 case TargetOpcode::G_FCONSTANT:
4348 return lowerFConstant(MI);
4349 case TargetOpcode::G_BITCAST:
4350 return lowerBitcast(MI);
4351 case TargetOpcode::G_SREM:
4352 case TargetOpcode::G_UREM: {
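// Lower remainder in terms of division: rem = x - (x / y) * y.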
4353 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4354 auto Quot =
4355 MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
4356 {MI.getOperand(1), MI.getOperand(2)});
4358 auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
4359 MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
4360 MI.eraseFromParent();
4361 return Legalized;
4363 case TargetOpcode::G_SADDO:
4364 case TargetOpcode::G_SSUBO:
4365 return lowerSADDO_SSUBO(MI);
4366 case TargetOpcode::G_UMULH:
4367 case TargetOpcode::G_SMULH:
4368 return lowerSMULH_UMULH(MI);
4369 case TargetOpcode::G_SMULO:
4370 case TargetOpcode::G_UMULO: {
4371 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
4372 // result.
4373 auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
4374 LLT Ty = MRI.getType(Res);
4376 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
4377 ? TargetOpcode::G_SMULH
4378 : TargetOpcode::G_UMULH;
4380 Observer.changingInstr(MI);
4381 const auto &TII = MIRBuilder.getTII();
4382 MI.setDesc(TII.get(TargetOpcode::G_MUL));
4383 MI.removeOperand(1);
4384 Observer.changedInstr(MI);
4386 auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
4387 auto Zero = MIRBuilder.buildConstant(Ty, 0);
4389 // Move insert point forward so we can use the Res register if needed.
4390 MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
4392 // For *signed* multiply, overflow is detected by checking:
4393 // (hi != (lo >> bitwidth-1))
4394 if (Opcode == TargetOpcode::G_SMULH) {
4395 auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
4396 auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
4397 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
4398 } else {
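// For unsigned multiply, overflow occurred iff the high half of the
// full product is non-zero: (hi != 0).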
4399 MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
4401 return Legalized;
4403 case TargetOpcode::G_FNEG: {
4404 auto [Res, SubByReg] = MI.getFirst2Regs();
4405 LLT Ty = MRI.getType(Res);
4407 auto SignMask = MIRBuilder.buildConstant(
4408 Ty, APInt::getSignMask(Ty.getScalarSizeInBits()));
4409 MIRBuilder.buildXor(Res, SubByReg, SignMask);
4410 MI.eraseFromParent();
4411 return Legalized;
4413 case TargetOpcode::G_FSUB:
4414 case TargetOpcode::G_STRICT_FSUB: {
4415 auto [Res, LHS, RHS] = MI.getFirst3Regs();
4416 LLT Ty = MRI.getType(Res);
4418 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
4419 auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
4421 if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
4422 MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
4423 else
4424 MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
4426 MI.eraseFromParent();
4427 return Legalized;
4429 case TargetOpcode::G_FMAD:
4430 return lowerFMad(MI);
4431 case TargetOpcode::G_FFLOOR:
4432 return lowerFFloor(MI);
4433 case TargetOpcode::G_LROUND:
4434 case TargetOpcode::G_LLROUND: {
4435 Register DstReg = MI.getOperand(0).getReg();
4436 Register SrcReg = MI.getOperand(1).getReg();
4437 LLT SrcTy = MRI.getType(SrcReg);
4438 auto Round = MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_ROUND, {SrcTy},
4439 {SrcReg});
4440 MIRBuilder.buildFPTOSI(DstReg, Round);
4441 MI.eraseFromParent();
4442 return Legalized;
4444 case TargetOpcode::G_INTRINSIC_ROUND:
4445 return lowerIntrinsicRound(MI);
4446 case TargetOpcode::G_FRINT: {
4447 // Since round even is the assumed rounding mode for unconstrained FP
4448 // operations, rint and roundeven are the same operation.
4449 changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
4450 return Legalized;
4452 case TargetOpcode::G_INTRINSIC_LRINT:
4453 case TargetOpcode::G_INTRINSIC_LLRINT: {
4454 Register DstReg = MI.getOperand(0).getReg();
4455 Register SrcReg = MI.getOperand(1).getReg();
4456 LLT SrcTy = MRI.getType(SrcReg);
4457 auto Round =
4458 MIRBuilder.buildInstr(TargetOpcode::G_FRINT, {SrcTy}, {SrcReg});
4459 MIRBuilder.buildFPTOSI(DstReg, Round);
4460 MI.eraseFromParent();
4461 return Legalized;
4463 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
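// Lower to a plain G_ATOMIC_CMPXCHG and recover the success flag by
// comparing the value loaded from memory against the expected value.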
4464 auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
4465 Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes);
4466 MIRBuilder.buildAtomicCmpXchg(NewOldValRes, Addr, CmpVal, NewVal,
4467 **MI.memoperands_begin());
4468 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, NewOldValRes, CmpVal);
4469 MIRBuilder.buildCopy(OldValRes, NewOldValRes);
4470 MI.eraseFromParent();
4471 return Legalized;
4473 case TargetOpcode::G_LOAD:
4474 case TargetOpcode::G_SEXTLOAD:
4475 case TargetOpcode::G_ZEXTLOAD:
4476 return lowerLoad(cast<GAnyLoad>(MI));
4477 case TargetOpcode::G_STORE:
4478 return lowerStore(cast<GStore>(MI));
4479 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
4480 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
4481 case TargetOpcode::G_CTLZ:
4482 case TargetOpcode::G_CTTZ:
4483 case TargetOpcode::G_CTPOP:
4484 return lowerBitCount(MI);
4485 case G_UADDO: {
4486 auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
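// An unsigned add overflowed iff the result is (unsigned) less than one of
// the operands, so a plain add plus one unsigned compare suffices.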
4488 Register NewRes = MRI.cloneVirtualRegister(Res);
4490 MIRBuilder.buildAdd(NewRes, LHS, RHS);
4491 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, NewRes, RHS);
4493 MIRBuilder.buildCopy(Res, NewRes);
4495 MI.eraseFromParent();
4496 return Legalized;
4498 case G_UADDE: {
4499 auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
4500 const LLT CondTy = MRI.getType(CarryOut);
4501 const LLT Ty = MRI.getType(Res);
4503 Register NewRes = MRI.cloneVirtualRegister(Res);
4505 // Initial add of the two operands.
4506 auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
4508 // Initial check for carry.
4509 auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
4511 // Add the sum and the carry.
4512 auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
4513 MIRBuilder.buildAdd(NewRes, TmpRes, ZExtCarryIn);
4515 // Second check for carry. We can only carry if the initial sum is all 1s
4516 // and the carry is set, resulting in a new sum of 0.
4517 auto Zero = MIRBuilder.buildConstant(Ty, 0);
4518 auto ResEqZero =
4519 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, NewRes, Zero);
4520 auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
4521 MIRBuilder.buildOr(CarryOut, Carry, Carry2);
4523 MIRBuilder.buildCopy(Res, NewRes);
4525 MI.eraseFromParent();
4526 return Legalized;
4528 case G_USUBO: {
4529 auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
4531 MIRBuilder.buildSub(Res, LHS, RHS);
4532 MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
4534 MI.eraseFromParent();
4535 return Legalized;
4537 case G_USUBE: {
4538 auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
4539 const LLT CondTy = MRI.getType(BorrowOut);
4540 const LLT Ty = MRI.getType(Res);
4542 // Initial subtract of the two operands.
4543 auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
4545 // Initial check for borrow.
4546 auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);
4548 // Subtract the borrow from the first subtract.
4549 auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
4550 MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
4552 // Second check for borrow. We can only borrow if the initial difference is
4553 // 0 and the borrow is set, resulting in a new difference of all 1s.
4554 auto Zero = MIRBuilder.buildConstant(Ty, 0);
4555 auto TmpResEqZero =
4556 MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
4557 auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
4558 MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);
4560 MI.eraseFromParent();
4561 return Legalized;
4563 case G_UITOFP:
4564 return lowerUITOFP(MI);
4565 case G_SITOFP:
4566 return lowerSITOFP(MI);
4567 case G_FPTOUI:
4568 return lowerFPTOUI(MI);
4569 case G_FPTOSI:
4570 return lowerFPTOSI(MI);
4571 case G_FPTOUI_SAT:
4572 case G_FPTOSI_SAT:
4573 return lowerFPTOINT_SAT(MI);
4574 case G_FPTRUNC:
4575 return lowerFPTRUNC(MI);
4576 case G_FPOWI:
4577 return lowerFPOWI(MI);
4578 case G_SMIN:
4579 case G_SMAX:
4580 case G_UMIN:
4581 case G_UMAX:
4582 return lowerMinMax(MI);
4583 case G_SCMP:
4584 case G_UCMP:
4585 return lowerThreewayCompare(MI);
4586 case G_FCOPYSIGN:
4587 return lowerFCopySign(MI);
4588 case G_FMINNUM:
4589 case G_FMAXNUM:
4590 return lowerFMinNumMaxNum(MI);
4591 case G_MERGE_VALUES:
4592 return lowerMergeValues(MI);
4593 case G_UNMERGE_VALUES:
4594 return lowerUnmergeValues(MI);
4595 case TargetOpcode::G_SEXT_INREG: {
4596 assert(MI.getOperand(2).isImm() && "Expected immediate");
4597 int64_t SizeInBits = MI.getOperand(2).getImm();
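// Expand as a shift pair; a sketch with illustrative types (DstTy = s32,
// SizeInBits = 8):
//   %tmp:_(s32) = G_SHL %src, 24
//   %dst:_(s32) = G_ASHR %tmp, 24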
4599 auto [DstReg, SrcReg] = MI.getFirst2Regs();
4600 LLT DstTy = MRI.getType(DstReg);
4601 Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
4603 auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
4604 MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
4605 MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
4606 MI.eraseFromParent();
4607 return Legalized;
4609 case G_EXTRACT_VECTOR_ELT:
4610 case G_INSERT_VECTOR_ELT:
4611 return lowerExtractInsertVectorElt(MI);
4612 case G_SHUFFLE_VECTOR:
4613 return lowerShuffleVector(MI);
4614 case G_VECTOR_COMPRESS:
4615 return lowerVECTOR_COMPRESS(MI);
4616 case G_DYN_STACKALLOC:
4617 return lowerDynStackAlloc(MI);
4618 case G_STACKSAVE:
4619 return lowerStackSave(MI);
4620 case G_STACKRESTORE:
4621 return lowerStackRestore(MI);
4622 case G_EXTRACT:
4623 return lowerExtract(MI);
4624 case G_INSERT:
4625 return lowerInsert(MI);
4626 case G_BSWAP:
4627 return lowerBswap(MI);
4628 case G_BITREVERSE:
4629 return lowerBitreverse(MI);
4630 case G_READ_REGISTER:
4631 case G_WRITE_REGISTER:
4632 return lowerReadWriteRegister(MI);
4633 case G_UADDSAT:
4634 case G_USUBSAT: {
4635 // Try to make a reasonable guess about which lowering strategy to use. The
4636 // target can override this by marking the operation as custom and calling
4637 // the implementation functions directly.
4638 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4639 if (LI.isLegalOrCustom({G_UMIN, Ty}))
4640 return lowerAddSubSatToMinMax(MI);
4641 return lowerAddSubSatToAddoSubo(MI);
4643 case G_SADDSAT:
4644 case G_SSUBSAT: {
4645 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4647 // FIXME: It would probably make more sense to see if G_SADDO is preferred,
4648 // since it's a shorter expansion. However, we would need to figure out the
4649 // preferred boolean type for the carry out for the query.
4650 if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
4651 return lowerAddSubSatToMinMax(MI);
4652 return lowerAddSubSatToAddoSubo(MI);
4654 case G_SSHLSAT:
4655 case G_USHLSAT:
4656 return lowerShlSat(MI);
4657 case G_ABS:
4658 return lowerAbsToAddXor(MI);
4659 case G_FABS:
4660 return lowerFAbs(MI);
4661 case G_SELECT:
4662 return lowerSelect(MI);
4663 case G_IS_FPCLASS:
4664 return lowerISFPCLASS(MI);
4665 case G_SDIVREM:
4666 case G_UDIVREM:
4667 return lowerDIVREM(MI);
4668 case G_FSHL:
4669 case G_FSHR:
4670 return lowerFunnelShift(MI);
4671 case G_ROTL:
4672 case G_ROTR:
4673 return lowerRotate(MI);
4674 case G_MEMSET:
4675 case G_MEMCPY:
4676 case G_MEMMOVE:
4677 return lowerMemCpyFamily(MI);
4678 case G_MEMCPY_INLINE:
4679 return lowerMemcpyInline(MI);
4680 case G_ZEXT:
4681 case G_SEXT:
4682 case G_ANYEXT:
4683 return lowerEXT(MI);
4684 case G_TRUNC:
4685 return lowerTRUNC(MI);
4686 GISEL_VECREDUCE_CASES_NONSEQ
4687 return lowerVectorReduction(MI);
4688 case G_VAARG:
4689 return lowerVAArg(MI);
4693 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4694 Align MinAlign) const {
4695 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4696 // datalayout for the preferred alignment. Also there should be a target hook
4697 // for this to allow targets to reduce the alignment and ignore the
4698 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4699 // the type.
4700 return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
4703 MachineInstrBuilder
4704 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4705 MachinePointerInfo &PtrInfo) {
4706 MachineFunction &MF = MIRBuilder.getMF();
4707 const DataLayout &DL = MIRBuilder.getDataLayout();
4708 int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
4710 unsigned AddrSpace = DL.getAllocaAddrSpace();
4711 LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
4713 PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
4714 return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
4717 MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4718 const SrcOp &Val) {
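// Reinterpret a value's bit pattern by round-tripping it through a stack
// temporary: store Val with its own type, then load it back as Res's type.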
4719 LLT SrcTy = Val.getLLTTy(MRI);
4720 Align StackTypeAlign =
4721 std::max(getStackTemporaryAlignment(SrcTy),
4722 getStackTemporaryAlignment(Res.getLLTTy(MRI)));
4723 MachinePointerInfo PtrInfo;
4724 auto StackTemp =
4725 createStackTemporary(SrcTy.getSizeInBytes(), StackTypeAlign, PtrInfo);
4727 MIRBuilder.buildStore(Val, StackTemp, PtrInfo, StackTypeAlign);
4728 return MIRBuilder.buildLoad(Res, StackTemp, PtrInfo, StackTypeAlign);
4731 static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4732 LLT VecTy) {
4733 LLT IdxTy = B.getMRI()->getType(IdxReg);
4734 unsigned NElts = VecTy.getNumElements();
4736 int64_t IdxVal;
4737 if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) {
4738 if (IdxVal < VecTy.getNumElements())
4739 return IdxReg;
4740 // If a constant index would be out of bounds, clamp it as well.
4743 if (isPowerOf2_32(NElts)) {
4744 APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
4745 return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
4748 return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
4749 .getReg(0);
4752 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4753 Register Index) {
4754 LLT EltTy = VecTy.getElementType();
4756 // Calculate the element offset and add it to the pointer.
4757 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4758 assert(EltSize * 8 == EltTy.getSizeInBits() &&
4759 "Converting bits to bytes lost precision");
4761 Index = clampVectorIndex(MIRBuilder, Index, VecTy);
4763 // Convert index to the correct size for the address space.
4764 const DataLayout &DL = MIRBuilder.getDataLayout();
4765 unsigned AS = MRI.getType(VecPtr).getAddressSpace();
4766 unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
4767 LLT IdxTy = MRI.getType(Index).changeElementSize(IndexSizeInBits);
4768 if (IdxTy != MRI.getType(Index))
4769 Index = MIRBuilder.buildSExtOrTrunc(IdxTy, Index).getReg(0);
4771 auto Mul = MIRBuilder.buildMul(IdxTy, Index,
4772 MIRBuilder.buildConstant(IdxTy, EltSize));
4774 LLT PtrTy = MRI.getType(VecPtr);
4775 return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
4778 #ifndef NDEBUG
4779 /// Check that all vector operands have the same number of elements. Other
4780 /// operands should be listed in \p NonVecOpIndices.
4781 static bool hasSameNumEltsOnAllVectorOperands(
4782 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4783 std::initializer_list<unsigned> NonVecOpIndices) {
4784 if (MI.getNumMemOperands() != 0)
4785 return false;
4787 LLT VecTy = MRI.getType(MI.getReg(0));
4788 if (!VecTy.isVector())
4789 return false;
4790 unsigned NumElts = VecTy.getNumElements();
4792 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4793 MachineOperand &Op = MI.getOperand(OpIdx);
4794 if (!Op.isReg()) {
4795 if (!is_contained(NonVecOpIndices, OpIdx))
4796 return false;
4797 continue;
4800 LLT Ty = MRI.getType(Op.getReg());
4801 if (!Ty.isVector()) {
4802 if (!is_contained(NonVecOpIndices, OpIdx))
4803 return false;
4804 continue;
4807 if (Ty.getNumElements() != NumElts)
4808 return false;
4811 return true;
4813 #endif
4815 /// Fill \p DstOps with DstOps that together have the same number of elements
4816 /// as \p Ty. Each DstOp is either a scalar (when \p NumElts = 1) or a vector
4817 /// with \p NumElts elements. When Ty.getNumElements() is not a multiple of
4818 /// \p NumElts, the last DstOp (leftover) has fewer than \p NumElts elements.
4819 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4820 unsigned NumElts) {
4821 LLT LeftoverTy;
4822 assert(Ty.isVector() && "Expected vector type");
4823 LLT EltTy = Ty.getElementType();
4824 LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4825 int NumParts, NumLeftover;
4826 std::tie(NumParts, NumLeftover) =
4827 getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4829 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4830 for (int i = 0; i < NumParts; ++i) {
4831 DstOps.push_back(NarrowTy);
4834 if (LeftoverTy.isValid()) {
4835 assert(NumLeftover == 1 && "expected exactly one leftover");
4836 DstOps.push_back(LeftoverTy);
4840 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4841 /// made from \p Op depending on operand type.
4842 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4843 MachineOperand &Op) {
4844 for (unsigned i = 0; i < N; ++i) {
4845 if (Op.isReg())
4846 Ops.push_back(Op.getReg());
4847 else if (Op.isImm())
4848 Ops.push_back(Op.getImm());
4849 else if (Op.isPredicate())
4850 Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4851 else
4852 llvm_unreachable("Unsupported type");
4856 // Handle splitting vector operations which need to have the same number of
4857 // elements in each type index, but each type index may have a different element
4858 // type.
4860 // e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4861 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4862 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4864 // Also handles some irregular breakdown cases, e.g.
4865 // e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4866 // <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4867 // s64 = G_SHL s64, s32
4868 LegalizerHelper::LegalizeResult
4869 LegalizerHelper::fewerElementsVectorMultiEltType(
4870 GenericMachineInstr &MI, unsigned NumElts,
4871 std::initializer_list<unsigned> NonVecOpIndices) {
4872 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
4873 "Non-compatible opcode or not specified non-vector operands");
4874 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4876 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4877 unsigned NumDefs = MI.getNumDefs();
4879 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
4880 // Build instructions with DstOps so that an instruction found by CSE can be
4881 // used directly; CSE copies it into the given vreg when building with a vreg dest.
4882 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
4883 // Output registers will be taken from created instructions.
4884 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
4885 for (unsigned i = 0; i < NumDefs; ++i) {
4886 makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
4889 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4890 // Operands listed in NonVecOpIndices will be used as is without splitting;
4891 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4892 // scalar condition (op 1), immediate in sext_inreg (op 2).
4893 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
4894 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4895 ++UseIdx, ++UseNo) {
4896 if (is_contained(NonVecOpIndices, UseIdx)) {
4897 broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
4898 MI.getOperand(UseIdx));
4899 } else {
4900 SmallVector<Register, 8> SplitPieces;
4901 extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
4902 MRI);
4903 for (auto Reg : SplitPieces)
4904 InputOpsPieces[UseNo].push_back(Reg);
4908 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4910 // Take i-th piece of each input operand split and build sub-vector/scalar
4911 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
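// e.g. with OrigNumElts = 3 and NumElts = 2 this loop runs twice: once for the
// full <2 x ...> piece and once for the scalar leftover piece.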
4912 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4913 SmallVector<DstOp, 2> Defs;
4914 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4915 Defs.push_back(OutputOpsPieces[DstNo][i]);
4917 SmallVector<SrcOp, 3> Uses;
4918 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
4919 Uses.push_back(InputOpsPieces[InputNo][i]);
4921 auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
4922 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4923 OutputRegs[DstNo].push_back(I.getReg(DstNo));
4926 // Merge small outputs into MI's output for each def operand.
4927 if (NumLeftovers) {
4928 for (unsigned i = 0; i < NumDefs; ++i)
4929 mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
4930 } else {
4931 for (unsigned i = 0; i < NumDefs; ++i)
4932 MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
4935 MI.eraseFromParent();
4936 return Legalized;
4939 LegalizerHelper::LegalizeResult
4940 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
4941 unsigned NumElts) {
4942 unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4944 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4945 unsigned NumDefs = MI.getNumDefs();
4947 SmallVector<DstOp, 8> OutputOpsPieces;
4948 SmallVector<Register, 8> OutputRegs;
4949 makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
4951 // Instructions that perform the register split will be inserted in the basic
4952 // block where the register is defined (given by the next operand).
4953 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
4954 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4955 UseIdx += 2, ++UseNo) {
4956 MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
4957 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
4958 extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
4959 MIRBuilder, MRI);
4962 // Build PHIs with fewer elements.
4963 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4964 MIRBuilder.setInsertPt(*MI.getParent(), MI);
4965 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4966 auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
4967 Phi.addDef(
4968 MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
4969 OutputRegs.push_back(Phi.getReg(0));
4971 for (unsigned j = 0; j < NumInputs / 2; ++j) {
4972 Phi.addUse(InputOpsPieces[j][i]);
4973 Phi.add(MI.getOperand(1 + j * 2 + 1));
4977 // Set the insert point after the existing PHIs
4978 MachineBasicBlock &MBB = *MI.getParent();
4979 MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
4981 // Merge small outputs into MI's def.
4982 if (NumLeftovers) {
4983 mergeMixedSubvectors(MI.getReg(0), OutputRegs);
4984 } else {
4985 MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
4988 MI.eraseFromParent();
4989 return Legalized;
4992 LegalizerHelper::LegalizeResult
4993 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
4994 unsigned TypeIdx,
4995 LLT NarrowTy) {
4996 const int NumDst = MI.getNumOperands() - 1;
4997 const Register SrcReg = MI.getOperand(NumDst).getReg();
4998 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4999 LLT SrcTy = MRI.getType(SrcReg);
5001 if (TypeIdx != 1 || NarrowTy == DstTy)
5002 return UnableToLegalize;
5004 // Requires compatible types. Otherwise SrcReg should have been defined by a
5005 // merge-like instruction that would get artifact combined. Most likely the
5006 // instruction that defines SrcReg has to perform more/fewer elements
5007 // legalization compatible with NarrowTy.
5008 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5009 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5011 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5012 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
5013 return UnableToLegalize;
5015 // This is most likely DstTy (smaller than register size) packed in SrcTy
5016 // (larger than register size), and since the unmerge was not combined it will
5017 // be lowered to bit sequence extracts from a register. Unpack SrcTy to NarrowTy
5018 // (register size) pieces first. Then unpack each of the NarrowTy pieces to DstTy.
5020 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
5022 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
5023 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
5024 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
5025 auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
5026 const int NumUnmerge = Unmerge->getNumOperands() - 1;
5027 const int PartsPerUnmerge = NumDst / NumUnmerge;
5029 for (int I = 0; I != NumUnmerge; ++I) {
5030 auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
5032 for (int J = 0; J != PartsPerUnmerge; ++J)
5033 MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
5034 MIB.addUse(Unmerge.getReg(I));
5037 MI.eraseFromParent();
5038 return Legalized;
5041 LegalizerHelper::LegalizeResult
5042 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
5043 LLT NarrowTy) {
5044 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5045 // Requires compatible types. Otherwise the user of DstReg did not perform the
5046 // unmerge that should have been artifact combined. Most likely the instruction
5047 // that uses DstReg has to do more/fewer elements legalization compatible with NarrowTy.
5048 assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
5049 assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5050 if (NarrowTy == SrcTy)
5051 return UnableToLegalize;
5053 // This attempts to lower part of an LCMTy merge/unmerge sequence. Its intended
5054 // use is for old MIR tests. Since the changes to more/fewer elements
5055 // legalization, it should no longer be possible to generate MIR like this from
5056 // LLVM IR, because the LCMTy approach was replaced with merge/unmerge to vector elements.
5057 if (TypeIdx == 1) {
5058 assert(SrcTy.isVector() && "Expected vector types");
5059 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
5060 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
5061 (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
5062 return UnableToLegalize;
5063 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
5065 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
5066 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
5067 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
5068 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
5069 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
5070 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
5072 SmallVector<Register, 8> Elts;
5073 LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
5074 for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
5075 auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
5076 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
5077 Elts.push_back(Unmerge.getReg(j));
5080 SmallVector<Register, 8> NarrowTyElts;
5081 unsigned NumNarrowTyElts = NarrowTy.getNumElements();
5082 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
5083 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
5084 ++i, Offset += NumNarrowTyElts) {
5085 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
5086 NarrowTyElts.push_back(
5087 MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
5090 MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
5091 MI.eraseFromParent();
5092 return Legalized;
5095 assert(TypeIdx == 0 && "Bad type index");
5096 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
5097 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
5098 return UnableToLegalize;
5100 // This is most likely SrcTy (smaller than register size) packed in DstTy
5101 // (larger than register size), and since the merge was not combined it will be
5102 // lowered to bit sequence packing into a register. Merge SrcTy to NarrowTy
5103 // (register size) pieces first. Then merge each of the NarrowTy pieces to DstTy.
5105 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
5107 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
5108 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
5109 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
5110 SmallVector<Register, 8> NarrowTyElts;
5111 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
5112 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
5113 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
5114 for (unsigned i = 0; i < NumParts; ++i) {
5115 SmallVector<Register, 8> Sources;
5116 for (unsigned j = 0; j < NumElts; ++j)
5117 Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
5118 NarrowTyElts.push_back(
5119 MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
5122 MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
5123 MI.eraseFromParent();
5124 return Legalized;
5127 LegalizerHelper::LegalizeResult
5128 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
5129 unsigned TypeIdx,
5130 LLT NarrowVecTy) {
5131 auto [DstReg, SrcVec] = MI.getFirst2Regs();
5132 Register InsertVal;
5133 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
5135 assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
5136 if (IsInsert)
5137 InsertVal = MI.getOperand(2).getReg();
5139 Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
5141 // TODO: Handle total scalarization case.
5142 if (!NarrowVecTy.isVector())
5143 return UnableToLegalize;
5145 LLT VecTy = MRI.getType(SrcVec);
5147 // If the index is a constant, we can really break this down as you would
5148 // expect, and index into the target size pieces.
5149 int64_t IdxVal;
5150 auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
5151 if (MaybeCst) {
5152 IdxVal = MaybeCst->Value.getSExtValue();
5153 // Avoid out-of-bounds indexing of the pieces.
5154 if (IdxVal >= VecTy.getNumElements()) {
5155 MIRBuilder.buildUndef(DstReg);
5156 MI.eraseFromParent();
5157 return Legalized;
5160 SmallVector<Register, 8> VecParts;
5161 LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
5163 // Build a sequence of NarrowTy pieces in VecParts for this operand.
5164 LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
5165 TargetOpcode::G_ANYEXT);
5167 unsigned NewNumElts = NarrowVecTy.getNumElements();
5169 LLT IdxTy = MRI.getType(Idx);
5170 int64_t PartIdx = IdxVal / NewNumElts;
5171 auto NewIdx =
5172 MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
5174 if (IsInsert) {
5175 LLT PartTy = MRI.getType(VecParts[PartIdx]);
5177 // Use the adjusted index to insert into one of the subvectors.
5178 auto InsertPart = MIRBuilder.buildInsertVectorElement(
5179 PartTy, VecParts[PartIdx], InsertVal, NewIdx);
5180 VecParts[PartIdx] = InsertPart.getReg(0);
5182 // Recombine the inserted subvector with the others to reform the result
5183 // vector.
5184 buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
5185 } else {
5186 MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
5189 MI.eraseFromParent();
5190 return Legalized;
5193 // With a variable index, we can't perform the operation in a smaller type, so
5194 // we're forced to expand this.
5196 // TODO: We could emit a chain of compare/select to figure out which piece to
5197 // index.
5198 return lowerExtractInsertVectorElt(MI);
5201 LegalizerHelper::LegalizeResult
5202 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
5203 LLT NarrowTy) {
5204 // FIXME: Don't know how to handle secondary types yet.
5205 if (TypeIdx != 0)
5206 return UnableToLegalize;
5208 // This implementation doesn't work for atomics. Give up instead of doing
5209 // something invalid.
5210 if (LdStMI.isAtomic())
5211 return UnableToLegalize;
5213 bool IsLoad = isa<GLoad>(LdStMI);
5214 Register ValReg = LdStMI.getReg(0);
5215 Register AddrReg = LdStMI.getPointerReg();
5216 LLT ValTy = MRI.getType(ValReg);
5218 // FIXME: Do we need a distinct NarrowMemory legalize action?
5219 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
5220 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
5221 return UnableToLegalize;
5224 int NumParts = -1;
5225 int NumLeftover = -1;
5226 LLT LeftoverTy;
5227 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
5228 if (IsLoad) {
5229 std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
5230 } else {
5231 if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
5232 NarrowLeftoverRegs, MIRBuilder, MRI)) {
5233 NumParts = NarrowRegs.size();
5234 NumLeftover = NarrowLeftoverRegs.size();
5238 if (NumParts == -1)
5239 return UnableToLegalize;
5241 LLT PtrTy = MRI.getType(AddrReg);
5242 const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
5244 unsigned TotalSize = ValTy.getSizeInBits();
5246 // Split the load/store into PartTy sized pieces starting at Offset. If this
5247 // is a load, return the new registers in ValRegs. For a store, each element
5248 // of ValRegs should be PartTy. Returns the next offset that needs to be
5249 // handled.
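// For example, narrowing an s96 load with NarrowTy = s32 emits three s32 loads
// at byte offsets 0, 4 and 8 (or 8, 4, 0 on a big-endian target).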
5250 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
5251 auto MMO = LdStMI.getMMO();
5252 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
5253 unsigned NumParts, unsigned Offset) -> unsigned {
5254 MachineFunction &MF = MIRBuilder.getMF();
5255 unsigned PartSize = PartTy.getSizeInBits();
5256 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
5257 ++Idx) {
5258 unsigned ByteOffset = Offset / 8;
5259 Register NewAddrReg;
5261 MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
5263 MachineMemOperand *NewMMO =
5264 MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
5266 if (IsLoad) {
5267 Register Dst = MRI.createGenericVirtualRegister(PartTy);
5268 ValRegs.push_back(Dst);
5269 MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
5270 } else {
5271 MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
5273 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
5276 return Offset;
5279 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
5280 unsigned HandledOffset =
5281 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
5283 // Handle the rest of the register if this isn't an even type breakdown.
5284 if (LeftoverTy.isValid())
5285 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
5287 if (IsLoad) {
5288 insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
5289 LeftoverTy, NarrowLeftoverRegs);
5292 LdStMI.eraseFromParent();
5293 return Legalized;
5296 LegalizerHelper::LegalizeResult
5297 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
5298 LLT NarrowTy) {
5299 using namespace TargetOpcode;
5300 GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
5301 unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5303 switch (MI.getOpcode()) {
5304 case G_IMPLICIT_DEF:
5305 case G_TRUNC:
5306 case G_AND:
5307 case G_OR:
5308 case G_XOR:
5309 case G_ADD:
5310 case G_SUB:
5311 case G_MUL:
5312 case G_PTR_ADD:
5313 case G_SMULH:
5314 case G_UMULH:
5315 case G_FADD:
5316 case G_FMUL:
5317 case G_FSUB:
5318 case G_FNEG:
5319 case G_FABS:
5320 case G_FCANONICALIZE:
5321 case G_FDIV:
5322 case G_FREM:
5323 case G_FMA:
5324 case G_FMAD:
5325 case G_FPOW:
5326 case G_FEXP:
5327 case G_FEXP2:
5328 case G_FEXP10:
5329 case G_FLOG:
5330 case G_FLOG2:
5331 case G_FLOG10:
5332 case G_FLDEXP:
5333 case G_FNEARBYINT:
5334 case G_FCEIL:
5335 case G_FFLOOR:
5336 case G_FRINT:
5337 case G_INTRINSIC_LRINT:
5338 case G_INTRINSIC_LLRINT:
5339 case G_INTRINSIC_ROUND:
5340 case G_INTRINSIC_ROUNDEVEN:
5341 case G_LROUND:
5342 case G_LLROUND:
5343 case G_INTRINSIC_TRUNC:
5344 case G_FCOS:
5345 case G_FSIN:
5346 case G_FTAN:
5347 case G_FACOS:
5348 case G_FASIN:
5349 case G_FATAN:
5350 case G_FATAN2:
5351 case G_FCOSH:
5352 case G_FSINH:
5353 case G_FTANH:
5354 case G_FSQRT:
5355 case G_BSWAP:
5356 case G_BITREVERSE:
5357 case G_SDIV:
5358 case G_UDIV:
5359 case G_SREM:
5360 case G_UREM:
5361 case G_SDIVREM:
5362 case G_UDIVREM:
5363 case G_SMIN:
5364 case G_SMAX:
5365 case G_UMIN:
5366 case G_UMAX:
5367 case G_ABS:
5368 case G_FMINNUM:
5369 case G_FMAXNUM:
5370 case G_FMINNUM_IEEE:
5371 case G_FMAXNUM_IEEE:
5372 case G_FMINIMUM:
5373 case G_FMAXIMUM:
5374 case G_FSHL:
5375 case G_FSHR:
5376 case G_ROTL:
5377 case G_ROTR:
5378 case G_FREEZE:
5379 case G_SADDSAT:
5380 case G_SSUBSAT:
5381 case G_UADDSAT:
5382 case G_USUBSAT:
5383 case G_UMULO:
5384 case G_SMULO:
5385 case G_SHL:
5386 case G_LSHR:
5387 case G_ASHR:
5388 case G_SSHLSAT:
5389 case G_USHLSAT:
5390 case G_CTLZ:
5391 case G_CTLZ_ZERO_UNDEF:
5392 case G_CTTZ:
5393 case G_CTTZ_ZERO_UNDEF:
5394 case G_CTPOP:
5395 case G_FCOPYSIGN:
5396 case G_ZEXT:
5397 case G_SEXT:
5398 case G_ANYEXT:
5399 case G_FPEXT:
5400 case G_FPTRUNC:
5401 case G_SITOFP:
5402 case G_UITOFP:
5403 case G_FPTOSI:
5404 case G_FPTOUI:
5405 case G_FPTOSI_SAT:
5406 case G_FPTOUI_SAT:
5407 case G_INTTOPTR:
5408 case G_PTRTOINT:
5409 case G_ADDRSPACE_CAST:
5410 case G_UADDO:
5411 case G_USUBO:
5412 case G_UADDE:
5413 case G_USUBE:
5414 case G_SADDO:
5415 case G_SSUBO:
5416 case G_SADDE:
5417 case G_SSUBE:
5418 case G_STRICT_FADD:
5419 case G_STRICT_FSUB:
5420 case G_STRICT_FMUL:
5421 case G_STRICT_FMA:
5422 case G_STRICT_FLDEXP:
5423 case G_FFREXP:
5424 return fewerElementsVectorMultiEltType(GMI, NumElts);
5425 case G_ICMP:
5426 case G_FCMP:
5427 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
5428 case G_IS_FPCLASS:
5429 return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
5430 case G_SELECT:
5431 if (MRI.getType(MI.getOperand(1).getReg()).isVector())
5432 return fewerElementsVectorMultiEltType(GMI, NumElts);
5433 return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
5434 case G_PHI:
5435 return fewerElementsVectorPhi(GMI, NumElts);
5436 case G_UNMERGE_VALUES:
5437 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
5438 case G_BUILD_VECTOR:
5439 assert(TypeIdx == 0 && "not a vector type index");
5440 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5441 case G_CONCAT_VECTORS:
5442 if (TypeIdx != 1) // TODO: This probably does work as expected already.
5443 return UnableToLegalize;
5444 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
5445 case G_EXTRACT_VECTOR_ELT:
5446 case G_INSERT_VECTOR_ELT:
5447 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
5448 case G_LOAD:
5449 case G_STORE:
5450 return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
5451 case G_SEXT_INREG:
5452 return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
5453 GISEL_VECREDUCE_CASES_NONSEQ
5454 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
5455 case TargetOpcode::G_VECREDUCE_SEQ_FADD:
5456 case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
5457 return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
5458 case G_SHUFFLE_VECTOR:
5459 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
5460 case G_FPOWI:
5461 return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
5462 case G_BITCAST:
5463 return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
5464 case G_INTRINSIC_FPTRUNC_ROUND:
5465 return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
5466 default:
5467 return UnableToLegalize;
5471 LegalizerHelper::LegalizeResult
5472 LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
5473 LLT NarrowTy) {
5474 assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
5475 "Not a bitcast operation");
5477 if (TypeIdx != 0)
5478 return UnableToLegalize;
5480 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5482 unsigned NewElemCount =
5483 NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
5484 LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
5486 // Split the Src and Dst Reg into smaller registers
5487 SmallVector<Register> SrcVRegs, BitcastVRegs;
5488 if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
5489 return UnableToLegalize;
5491 // Build new smaller bitcast instructions.
5492 // Leftover types are not supported for now, but will have to be eventually.
5493 for (unsigned i = 0; i < SrcVRegs.size(); i++)
5494 BitcastVRegs.push_back(
5495 MIRBuilder.buildBitcast(NarrowTy, SrcVRegs[i]).getReg(0));
5497 MIRBuilder.buildMergeLikeInstr(DstReg, BitcastVRegs);
5498 MI.eraseFromParent();
5499 return Legalized;
5502 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
5503 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5504 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
5505 if (TypeIdx != 0)
5506 return UnableToLegalize;
5508 auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
5509 MI.getFirst3RegLLTs();
5510 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5511 // The shuffle should be canonicalized by now.
5512 if (DstTy != Src1Ty)
5513 return UnableToLegalize;
5514 if (DstTy != Src2Ty)
5515 return UnableToLegalize;
5517 if (!isPowerOf2_32(DstTy.getNumElements()))
5518 return UnableToLegalize;
5520 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
5521 // Further legalization attempts will be needed to split it further.
5522 NarrowTy =
5523 DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
5524 unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
5526 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
5527 extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
5528 extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
5529 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
5530 SplitSrc2Regs[1]};
5532 Register Hi, Lo;
5534 // If Lo or Hi uses elements from at most two of the four input vectors, then
5535 // express it as a vector shuffle of those two inputs. Otherwise extract the
5536 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
5537 SmallVector<int, 16> Ops;
5538 for (unsigned High = 0; High < 2; ++High) {
5539 Register &Output = High ? Hi : Lo;
5541 // Build a shuffle mask for the output, discovering on the fly which
5542 // input vectors to use as shuffle operands (recorded in InputUsed).
5543 // If building a suitable shuffle vector proves too hard, then bail
5544 // out with useBuildVector set.
5545 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
5546 unsigned FirstMaskIdx = High * NewElts;
5547 bool UseBuildVector = false;
5548 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5549 // The mask element. This indexes into the input.
5550 int Idx = Mask[FirstMaskIdx + MaskOffset];
5552 // The input vector this mask element indexes into.
5553 unsigned Input = (unsigned)Idx / NewElts;
5555 if (Input >= std::size(Inputs)) {
5556 // The mask element does not index into any input vector.
5557 Ops.push_back(-1);
5558 continue;
5561 // Turn the index into an offset from the start of the input vector.
5562 Idx -= Input * NewElts;
5564 // Find or create a shuffle vector operand to hold this input.
5565 unsigned OpNo;
5566 for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
5567 if (InputUsed[OpNo] == Input) {
5568 // This input vector is already an operand.
5569 break;
5570 } else if (InputUsed[OpNo] == -1U) {
5571 // Create a new operand for this input vector.
5572 InputUsed[OpNo] = Input;
5573 break;
5577 if (OpNo >= std::size(InputUsed)) {
5578 // More than two input vectors used! Give up on trying to create a
5579 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
5580 UseBuildVector = true;
5581 break;
5584 // Add the mask index for the new shuffle vector.
5585 Ops.push_back(Idx + OpNo * NewElts);
5588 if (UseBuildVector) {
5589 LLT EltTy = NarrowTy.getElementType();
5590 SmallVector<Register, 16> SVOps;
5592 // Extract the input elements by hand.
5593 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
5594 // The mask element. This indexes into the input.
5595 int Idx = Mask[FirstMaskIdx + MaskOffset];
5597 // The input vector this mask element indexes into.
5598 unsigned Input = (unsigned)Idx / NewElts;
5600 if (Input >= std::size(Inputs)) {
5601 // The mask element is "undef" or indexes off the end of the input.
5602 SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
5603 continue;
5606 // Turn the index into an offset from the start of the input vector.
5607 Idx -= Input * NewElts;
5609 // Extract the vector element by hand.
5610 SVOps.push_back(MIRBuilder
5611 .buildExtractVectorElement(
5612 EltTy, Inputs[Input],
5613 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
5614 .getReg(0));
5617 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
5618 Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
5619 } else if (InputUsed[0] == -1U) {
5620 // No input vectors were used! The result is undefined.
5621 Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
5622 } else {
5623 Register Op0 = Inputs[InputUsed[0]];
5624 // If only one input was used, use an undefined vector for the other.
5625 Register Op1 = InputUsed[1] == -1U
5626 ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
5627 : Inputs[InputUsed[1]];
5628 // At least one input vector was used. Create a new shuffle vector.
5629 Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
5632 Ops.clear();
5635 MIRBuilder.buildMergeLikeInstr(DstReg, {Lo, Hi});
5636 MI.eraseFromParent();
5637 return Legalized;
5640 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
5641 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
5642 auto &RdxMI = cast<GVecReduce>(MI);
5644 if (TypeIdx != 1)
5645 return UnableToLegalize;
5647 // The semantics of the normal non-sequential reductions allow us to freely
5648 // re-associate the operation.
5649 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
5651 if (NarrowTy.isVector() &&
5652 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
5653 return UnableToLegalize;
5655 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
5656 SmallVector<Register> SplitSrcs;
5657 // If NarrowTy is a scalar then we're being asked to scalarize.
5658 const unsigned NumParts =
5659 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
5660 : SrcTy.getNumElements();
5662 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
5663 if (NarrowTy.isScalar()) {
5664 if (DstTy != NarrowTy)
5665 return UnableToLegalize; // FIXME: handle implicit extensions.
5667 if (isPowerOf2_32(NumParts)) {
5668 // Generate a tree of scalar operations to reduce the critical path.
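// e.g. four pieces reduce as (p0 op p1) op (p2 op p3): two rounds of
// independent operations instead of a chain of three dependent ones.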
5669 SmallVector<Register> PartialResults;
5670 unsigned NumPartsLeft = NumParts;
5671 while (NumPartsLeft > 1) {
5672 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
5673 PartialResults.emplace_back(
5674 MIRBuilder
5675 .buildInstr(ScalarOpc, {NarrowTy},
5676 {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
5677 .getReg(0));
5679 SplitSrcs = PartialResults;
5680 PartialResults.clear();
5681 NumPartsLeft = SplitSrcs.size();
5683 assert(SplitSrcs.size() == 1);
5684 MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
5685 MI.eraseFromParent();
5686 return Legalized;
5688 // If we can't generate a tree, then just do sequential operations.
5689 Register Acc = SplitSrcs[0];
5690 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
5691 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
5692 .getReg(0);
5693 MIRBuilder.buildCopy(DstReg, Acc);
5694 MI.eraseFromParent();
5695 return Legalized;
5697 SmallVector<Register> PartialReductions;
5698 for (unsigned Part = 0; Part < NumParts; ++Part) {
5699 PartialReductions.push_back(
5700 MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
5701 .getReg(0));
5704 // If the types involved are powers of 2, we can generate intermediate vector
5705 // ops, before generating a final reduction operation.
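// e.g. an <8 x s32> G_VECREDUCE_ADD with NarrowTy = <4 x s32> becomes one
// <4 x s32> G_ADD of the two halves followed by a G_VECREDUCE_ADD of the result.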
5706 if (isPowerOf2_32(SrcTy.getNumElements()) &&
5707 isPowerOf2_32(NarrowTy.getNumElements())) {
5708 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
5711 Register Acc = PartialReductions[0];
5712 for (unsigned Part = 1; Part < NumParts; ++Part) {
5713 if (Part == NumParts - 1) {
5714 MIRBuilder.buildInstr(ScalarOpc, {DstReg},
5715 {Acc, PartialReductions[Part]});
5716 } else {
5717 Acc = MIRBuilder
5718 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
5719 .getReg(0);
5722 MI.eraseFromParent();
5723 return Legalized;
5726 LegalizerHelper::LegalizeResult
5727 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5728 unsigned int TypeIdx,
5729 LLT NarrowTy) {
5730 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5731 MI.getFirst3RegLLTs();
5732 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5733 DstTy != NarrowTy)
5734 return UnableToLegalize;
5736 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5737 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5738 "Unexpected vecreduce opcode");
5739 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5740 ? TargetOpcode::G_FADD
5741 : TargetOpcode::G_FMUL;
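// A sequential reduction must preserve the evaluation order, so this emits a
// simple chain, e.g. G_VECREDUCE_SEQ_FADD %acc, <2 x s32> %v becomes
// %t = G_FADD %acc, %v0; %res = G_FADD %t, %v1.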
5743 SmallVector<Register> SplitSrcs;
5744 unsigned NumParts = SrcTy.getNumElements();
5745 extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
5746 Register Acc = ScalarReg;
5747 for (unsigned i = 0; i < NumParts; i++)
5748 Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
5749 .getReg(0);
5751 MIRBuilder.buildCopy(DstReg, Acc);
5752 MI.eraseFromParent();
5753 return Legalized;
5756 LegalizerHelper::LegalizeResult
5757 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5758 LLT SrcTy, LLT NarrowTy,
5759 unsigned ScalarOpc) {
5760 SmallVector<Register> SplitSrcs;
5761 // Split the sources into NarrowTy size pieces.
5762 extractParts(SrcReg, NarrowTy,
5763 SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
5764 MIRBuilder, MRI);
5765 // We're going to do a tree reduction using vector operations until we have
5766 // one NarrowTy size value left.
5767 while (SplitSrcs.size() > 1) {
5768 SmallVector<Register> PartialRdxs;
5769 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
5770 Register LHS = SplitSrcs[Idx];
5771 Register RHS = SplitSrcs[Idx + 1];
5772 // Create the intermediate vector op.
5773 Register Res =
5774 MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
5775 PartialRdxs.push_back(Res);
5777 SplitSrcs = std::move(PartialRdxs);
5779 // Finally generate the requested NarrowTy based reduction.
5780 Observer.changingInstr(MI);
5781 MI.getOperand(1).setReg(SplitSrcs[0]);
5782 Observer.changedInstr(MI);
5783 return Legalized;
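// Narrow a scalar shift by a compile-time constant amount into operations on
// half-width registers. For example, an s64 G_SHL by 40 with HalfTy = s32
// becomes Lo = 0 and Hi = InL << 8, since the shift moves the low half
// entirely into the high half.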
5786 LegalizerHelper::LegalizeResult
5787 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
5788 const LLT HalfTy, const LLT AmtTy) {
5790 Register InL = MRI.createGenericVirtualRegister(HalfTy);
5791 Register InH = MRI.createGenericVirtualRegister(HalfTy);
5792 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
5794 if (Amt.isZero()) {
5795 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
5796 MI.eraseFromParent();
5797 return Legalized;
5800 LLT NVT = HalfTy;
5801 unsigned NVTBits = HalfTy.getSizeInBits();
5802 unsigned VTBits = 2 * NVTBits;
5804 SrcOp Lo(Register(0)), Hi(Register(0));
5805 if (MI.getOpcode() == TargetOpcode::G_SHL) {
5806 if (Amt.ugt(VTBits)) {
5807 Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
5808 } else if (Amt.ugt(NVTBits)) {
5809 Lo = MIRBuilder.buildConstant(NVT, 0);
5810 Hi = MIRBuilder.buildShl(NVT, InL,
5811 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5812 } else if (Amt == NVTBits) {
5813 Lo = MIRBuilder.buildConstant(NVT, 0);
5814 Hi = InL;
5815 } else {
5816 Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
5817 auto OrLHS =
5818 MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
5819 auto OrRHS = MIRBuilder.buildLShr(
5820 NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5821 Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5823 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5824 if (Amt.ugt(VTBits)) {
5825 Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
5826 } else if (Amt.ugt(NVTBits)) {
5827 Lo = MIRBuilder.buildLShr(NVT, InH,
5828 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5829 Hi = MIRBuilder.buildConstant(NVT, 0);
5830 } else if (Amt == NVTBits) {
5831 Lo = InH;
5832 Hi = MIRBuilder.buildConstant(NVT, 0);
5833 } else {
5834 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
5836 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
5837 auto OrRHS = MIRBuilder.buildShl(
5838 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5840 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5841 Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
5843 } else {
5844 if (Amt.ugt(VTBits)) {
5845 Hi = Lo = MIRBuilder.buildAShr(
5846 NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5847 } else if (Amt.ugt(NVTBits)) {
5848 Lo = MIRBuilder.buildAShr(NVT, InH,
5849 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5850 Hi = MIRBuilder.buildAShr(NVT, InH,
5851 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5852 } else if (Amt == NVTBits) {
5853 Lo = InH;
5854 Hi = MIRBuilder.buildAShr(NVT, InH,
5855 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5856 } else {
5857 auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
5859 auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
5860 auto OrRHS = MIRBuilder.buildShl(
5861 NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5863 Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5864 Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
5868 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
5869 MI.eraseFromParent();
5871 return Legalized;
5874 // TODO: Optimize if constant shift amount.
5875 LegalizerHelper::LegalizeResult
5876 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
5877 LLT RequestedTy) {
5878 if (TypeIdx == 1) {
5879 Observer.changingInstr(MI);
5880 narrowScalarSrc(MI, RequestedTy, 2);
5881 Observer.changedInstr(MI);
5882 return Legalized;
5885 Register DstReg = MI.getOperand(0).getReg();
5886 LLT DstTy = MRI.getType(DstReg);
5887 if (DstTy.isVector())
5888 return UnableToLegalize;
5890 Register Amt = MI.getOperand(2).getReg();
5891 LLT ShiftAmtTy = MRI.getType(Amt);
5892 const unsigned DstEltSize = DstTy.getScalarSizeInBits();
5893 if (DstEltSize % 2 != 0)
5894 return UnableToLegalize;
5896 // Ignore the input type. We can only go to exactly half the size of the
5897 // input. If that isn't small enough, the resulting pieces will be further
5898 // legalized.
5899 const unsigned NewBitSize = DstEltSize / 2;
5900 const LLT HalfTy = LLT::scalar(NewBitSize);
5901 const LLT CondTy = LLT::scalar(1);
5903 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
5904 return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
5905 ShiftAmtTy);
5908 // TODO: Expand with known bits.
5910 // Handle the fully general expansion by an unknown amount.
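// e.g. for an s64 G_SHL split into s32 halves: when Amt < 32 the result is
// { InL << Amt, (InH << Amt) | (InL >> (32 - Amt)) }; when Amt >= 32 it is
// { 0, InL << (Amt - 32) }. The IsZero select guards the Hi case, where a
// (32 - Amt) shift would be out of range when Amt == 0.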
5911 auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
5913 Register InL = MRI.createGenericVirtualRegister(HalfTy);
5914 Register InH = MRI.createGenericVirtualRegister(HalfTy);
5915 MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
5917 auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
5918 auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
5920 auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
5921 auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
5922 auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
5924 Register ResultRegs[2];
5925 switch (MI.getOpcode()) {
5926 case TargetOpcode::G_SHL: {
5927 // Short: ShAmt < NewBitSize
5928 auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
5930 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
5931 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
5932 auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5934 // Long: ShAmt >= NewBitSize
5935 auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero.
5936 auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
5938 auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
5939 auto Hi = MIRBuilder.buildSelect(
5940 HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
5942 ResultRegs[0] = Lo.getReg(0);
5943 ResultRegs[1] = Hi.getReg(0);
5944 break;
5946 case TargetOpcode::G_LSHR:
5947 case TargetOpcode::G_ASHR: {
5948 // Short: ShAmt < NewBitSize
5949 auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
5951 auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
5952 auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
5953 auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5955 // Long: ShAmt >= NewBitSize
5956 MachineInstrBuilder HiL;
5957 if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5958 HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
5959 } else {
5960 auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
5961 HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
5963 auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
5964 {InH, AmtExcess}); // Lo from Hi part.
5966 auto Lo = MIRBuilder.buildSelect(
5967 HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
5969 auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
5971 ResultRegs[0] = Lo.getReg(0);
5972 ResultRegs[1] = Hi.getReg(0);
5973 break;
5975 default:
5976 llvm_unreachable("not a shift");
5979 MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
5980 MI.eraseFromParent();
5981 return Legalized;
5984 LegalizerHelper::LegalizeResult
5985 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5986 LLT MoreTy) {
5987 assert(TypeIdx == 0 && "Expecting only Idx 0");
5989 Observer.changingInstr(MI);
5990 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5991 MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5992 MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5993 moreElementsVectorSrc(MI, MoreTy, I);
5996 MachineBasicBlock &MBB = *MI.getParent();
5997 MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5998 moreElementsVectorDst(MI, MoreTy, 0);
5999 Observer.changedInstr(MI);
6000 return Legalized;
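// Returns the identity value for the given reduction: inserting it into the
// padded lanes leaves the result unchanged, e.g. 0 for G_VECREDUCE_ADD, 1 for
// G_VECREDUCE_MUL, all-ones for G_VECREDUCE_AND and -0.0 for G_VECREDUCE_FADD.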
6003 MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6004 unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6005 assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6007 switch (Opcode) {
6008 default:
6009 llvm_unreachable(
6010 "getNeutralElementForVecReduce called with invalid opcode!");
6011 case TargetOpcode::G_VECREDUCE_ADD:
6012 case TargetOpcode::G_VECREDUCE_OR:
6013 case TargetOpcode::G_VECREDUCE_XOR:
6014 case TargetOpcode::G_VECREDUCE_UMAX:
6015 return MIRBuilder.buildConstant(Ty, 0);
6016 case TargetOpcode::G_VECREDUCE_MUL:
6017 return MIRBuilder.buildConstant(Ty, 1);
6018 case TargetOpcode::G_VECREDUCE_AND:
6019 case TargetOpcode::G_VECREDUCE_UMIN:
6020 return MIRBuilder.buildConstant(
6021 Ty, APInt::getAllOnes(Ty.getScalarSizeInBits()));
6022 case TargetOpcode::G_VECREDUCE_SMAX:
6023 return MIRBuilder.buildConstant(
6024 Ty, APInt::getSignedMinValue(Ty.getSizeInBits()));
6025 case TargetOpcode::G_VECREDUCE_SMIN:
6026 return MIRBuilder.buildConstant(
6027 Ty, APInt::getSignedMaxValue(Ty.getSizeInBits()));
6028 case TargetOpcode::G_VECREDUCE_FADD:
6029 return MIRBuilder.buildFConstant(Ty, -0.0);
6030 case TargetOpcode::G_VECREDUCE_FMUL:
6031 return MIRBuilder.buildFConstant(Ty, 1.0);
6032 case TargetOpcode::G_VECREDUCE_FMINIMUM:
6033 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6034 assert(false && "getNeutralElementForVecReduce unimplemented for "
6035 "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6037 llvm_unreachable("switch expected to return!");
6040 LegalizerHelper::LegalizeResult
6041 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
6042 LLT MoreTy) {
6043 unsigned Opc = MI.getOpcode();
6044 switch (Opc) {
6045 case TargetOpcode::G_IMPLICIT_DEF:
6046 case TargetOpcode::G_LOAD: {
6047 if (TypeIdx != 0)
6048 return UnableToLegalize;
6049 Observer.changingInstr(MI);
6050 moreElementsVectorDst(MI, MoreTy, 0);
6051 Observer.changedInstr(MI);
6052 return Legalized;
6054 case TargetOpcode::G_STORE:
6055 if (TypeIdx != 0)
6056 return UnableToLegalize;
6057 Observer.changingInstr(MI);
6058 moreElementsVectorSrc(MI, MoreTy, 0);
6059 Observer.changedInstr(MI);
6060 return Legalized;
6061 case TargetOpcode::G_AND:
6062 case TargetOpcode::G_OR:
6063 case TargetOpcode::G_XOR:
6064 case TargetOpcode::G_ADD:
6065 case TargetOpcode::G_SUB:
6066 case TargetOpcode::G_MUL:
6067 case TargetOpcode::G_FADD:
6068 case TargetOpcode::G_FSUB:
6069 case TargetOpcode::G_FMUL:
6070 case TargetOpcode::G_FDIV:
6071 case TargetOpcode::G_FCOPYSIGN:
6072 case TargetOpcode::G_UADDSAT:
6073 case TargetOpcode::G_USUBSAT:
6074 case TargetOpcode::G_SADDSAT:
6075 case TargetOpcode::G_SSUBSAT:
6076 case TargetOpcode::G_SMIN:
6077 case TargetOpcode::G_SMAX:
6078 case TargetOpcode::G_UMIN:
6079 case TargetOpcode::G_UMAX:
6080 case TargetOpcode::G_FMINNUM:
6081 case TargetOpcode::G_FMAXNUM:
6082 case TargetOpcode::G_FMINNUM_IEEE:
6083 case TargetOpcode::G_FMAXNUM_IEEE:
6084 case TargetOpcode::G_FMINIMUM:
6085 case TargetOpcode::G_FMAXIMUM:
6086 case TargetOpcode::G_STRICT_FADD:
6087 case TargetOpcode::G_STRICT_FSUB:
6088 case TargetOpcode::G_STRICT_FMUL:
6089 case TargetOpcode::G_SHL:
6090 case TargetOpcode::G_ASHR:
6091 case TargetOpcode::G_LSHR: {
6092 Observer.changingInstr(MI);
6093 moreElementsVectorSrc(MI, MoreTy, 1);
6094 moreElementsVectorSrc(MI, MoreTy, 2);
6095 moreElementsVectorDst(MI, MoreTy, 0);
6096 Observer.changedInstr(MI);
6097 return Legalized;
6099 case TargetOpcode::G_FMA:
6100 case TargetOpcode::G_STRICT_FMA:
6101 case TargetOpcode::G_FSHR:
6102 case TargetOpcode::G_FSHL: {
6103 Observer.changingInstr(MI);
6104 moreElementsVectorSrc(MI, MoreTy, 1);
6105 moreElementsVectorSrc(MI, MoreTy, 2);
6106 moreElementsVectorSrc(MI, MoreTy, 3);
6107 moreElementsVectorDst(MI, MoreTy, 0);
6108 Observer.changedInstr(MI);
6109 return Legalized;
6111 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
6112 case TargetOpcode::G_EXTRACT:
6113 if (TypeIdx != 1)
6114 return UnableToLegalize;
6115 Observer.changingInstr(MI);
6116 moreElementsVectorSrc(MI, MoreTy, 1);
6117 Observer.changedInstr(MI);
6118 return Legalized;
6119 case TargetOpcode::G_INSERT:
6120 case TargetOpcode::G_INSERT_VECTOR_ELT:
6121 case TargetOpcode::G_FREEZE:
6122 case TargetOpcode::G_FNEG:
6123 case TargetOpcode::G_FABS:
6124 case TargetOpcode::G_FSQRT:
6125 case TargetOpcode::G_FCEIL:
6126 case TargetOpcode::G_FFLOOR:
6127 case TargetOpcode::G_FNEARBYINT:
6128 case TargetOpcode::G_FRINT:
6129 case TargetOpcode::G_INTRINSIC_ROUND:
6130 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
6131 case TargetOpcode::G_INTRINSIC_TRUNC:
6132 case TargetOpcode::G_BSWAP:
6133 case TargetOpcode::G_FCANONICALIZE:
6134 case TargetOpcode::G_SEXT_INREG:
6135 case TargetOpcode::G_ABS:
6136 if (TypeIdx != 0)
6137 return UnableToLegalize;
6138 Observer.changingInstr(MI);
6139 moreElementsVectorSrc(MI, MoreTy, 1);
6140 moreElementsVectorDst(MI, MoreTy, 0);
6141 Observer.changedInstr(MI);
6142 return Legalized;
6143 case TargetOpcode::G_SELECT: {
6144 auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
6145 if (TypeIdx == 1) {
6146 if (!CondTy.isScalar() ||
6147 DstTy.getElementCount() != MoreTy.getElementCount())
6148 return UnableToLegalize;
6150 // This is turning a scalar select of vectors into a vector
6151 // select. Broadcast the select condition.
6152 auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
6153 Observer.changingInstr(MI);
6154 MI.getOperand(1).setReg(ShufSplat.getReg(0));
6155 Observer.changedInstr(MI);
6156 return Legalized;
6159 if (CondTy.isVector())
6160 return UnableToLegalize;
6162 Observer.changingInstr(MI);
6163 moreElementsVectorSrc(MI, MoreTy, 2);
6164 moreElementsVectorSrc(MI, MoreTy, 3);
6165 moreElementsVectorDst(MI, MoreTy, 0);
6166 Observer.changedInstr(MI);
6167 return Legalized;
6169 case TargetOpcode::G_UNMERGE_VALUES:
6170 return UnableToLegalize;
6171 case TargetOpcode::G_PHI:
6172 return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
6173 case TargetOpcode::G_SHUFFLE_VECTOR:
6174 return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
6175 case TargetOpcode::G_BUILD_VECTOR: {
6176 SmallVector<SrcOp, 8> Elts;
6177 for (auto Op : MI.uses()) {
6178 Elts.push_back(Op.getReg());
6181 for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
6182 Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
6185 MIRBuilder.buildDeleteTrailingVectorElements(
6186 MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
6187 MI.eraseFromParent();
6188 return Legalized;
6190 case TargetOpcode::G_SEXT:
6191 case TargetOpcode::G_ZEXT:
6192 case TargetOpcode::G_ANYEXT:
6193 case TargetOpcode::G_TRUNC:
6194 case TargetOpcode::G_FPTRUNC:
6195 case TargetOpcode::G_FPEXT:
6196 case TargetOpcode::G_FPTOSI:
6197 case TargetOpcode::G_FPTOUI:
6198 case TargetOpcode::G_FPTOSI_SAT:
6199 case TargetOpcode::G_FPTOUI_SAT:
6200 case TargetOpcode::G_SITOFP:
6201 case TargetOpcode::G_UITOFP: {
6202 Observer.changingInstr(MI);
6203 LLT SrcExtTy;
6204 LLT DstExtTy;
6205 if (TypeIdx == 0) {
6206 DstExtTy = MoreTy;
6207 SrcExtTy = LLT::fixed_vector(
6208 MoreTy.getNumElements(),
6209 MRI.getType(MI.getOperand(1).getReg()).getElementType());
6210 } else {
6211 DstExtTy = LLT::fixed_vector(
6212 MoreTy.getNumElements(),
6213 MRI.getType(MI.getOperand(0).getReg()).getElementType());
6214 SrcExtTy = MoreTy;
6216 moreElementsVectorSrc(MI, SrcExtTy, 1);
6217 moreElementsVectorDst(MI, DstExtTy, 0);
6218 Observer.changedInstr(MI);
6219 return Legalized;
6221 case TargetOpcode::G_ICMP:
6222 case TargetOpcode::G_FCMP: {
6223 if (TypeIdx != 1)
6224 return UnableToLegalize;
6226 Observer.changingInstr(MI);
6227 moreElementsVectorSrc(MI, MoreTy, 2);
6228 moreElementsVectorSrc(MI, MoreTy, 3);
6229 LLT CondTy = LLT::fixed_vector(
6230 MoreTy.getNumElements(),
6231 MRI.getType(MI.getOperand(0).getReg()).getElementType());
6232 moreElementsVectorDst(MI, CondTy, 0);
6233 Observer.changedInstr(MI);
6234 return Legalized;
6236 case TargetOpcode::G_BITCAST: {
6237 if (TypeIdx != 0)
6238 return UnableToLegalize;
6240 LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
6241 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
6243 unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
6244 if (coefficient % DstTy.getNumElements() != 0)
6245 return UnableToLegalize;
6247 coefficient = coefficient / DstTy.getNumElements();
6249 LLT NewTy = SrcTy.changeElementCount(
6250 ElementCount::get(coefficient, MoreTy.isScalable()));
6251 Observer.changingInstr(MI);
6252 moreElementsVectorSrc(MI, NewTy, 1);
6253 moreElementsVectorDst(MI, MoreTy, 0);
6254 Observer.changedInstr(MI);
6255 return Legalized;
6257 case TargetOpcode::G_VECREDUCE_FADD:
6258 case TargetOpcode::G_VECREDUCE_FMUL:
6259 case TargetOpcode::G_VECREDUCE_ADD:
6260 case TargetOpcode::G_VECREDUCE_MUL:
6261 case TargetOpcode::G_VECREDUCE_AND:
6262 case TargetOpcode::G_VECREDUCE_OR:
6263 case TargetOpcode::G_VECREDUCE_XOR:
6264 case TargetOpcode::G_VECREDUCE_SMAX:
6265 case TargetOpcode::G_VECREDUCE_SMIN:
6266 case TargetOpcode::G_VECREDUCE_UMAX:
6267 case TargetOpcode::G_VECREDUCE_UMIN: {
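// Pad the reduction input from OrigTy to MoreTy with the operation's neutral
// element so the extra lanes do not affect the result, e.g. a <3 x s32>
// G_VECREDUCE_ADD widened to <4 x s32> gets a 0 inserted into lane 3.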
6268 LLT OrigTy = MRI.getType(MI.getOperand(1).getReg());
6269 MachineOperand &MO = MI.getOperand(1);
6270 auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO);
6271 auto NeutralElement = getNeutralElementForVecReduce(
6272 MI.getOpcode(), MIRBuilder, MoreTy.getElementType());
6274 LLT IdxTy(TLI.getVectorIdxTy(MIRBuilder.getDataLayout()));
6275 for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
6276 i != e; i++) {
6277 auto Idx = MIRBuilder.buildConstant(IdxTy, i);
6278 NewVec = MIRBuilder.buildInsertVectorElement(MoreTy, NewVec,
6279 NeutralElement, Idx);
6282 Observer.changingInstr(MI);
6283 MO.setReg(NewVec.getReg(0));
6284 Observer.changedInstr(MI);
6285 return Legalized;
6288 default:
6289 return UnableToLegalize;
6293 LegalizerHelper::LegalizeResult
6294 LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
6295 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6296 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6297 unsigned MaskNumElts = Mask.size();
6298 unsigned SrcNumElts = SrcTy.getNumElements();
6299 LLT DestEltTy = DstTy.getElementType();
6301 if (MaskNumElts == SrcNumElts)
6302 return Legalized;
6304 if (MaskNumElts < SrcNumElts) {
6305 // Extend mask to match new destination vector size with
6306 // undef values.
6307 SmallVector<int, 16> NewMask(SrcNumElts, -1);
6308 llvm::copy(Mask, NewMask.begin());
6310 moreElementsVectorDst(MI, SrcTy, 0);
6311 MIRBuilder.setInstrAndDebugLoc(MI);
6312 MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
6313 MI.getOperand(1).getReg(),
6314 MI.getOperand(2).getReg(), NewMask);
6315 MI.eraseFromParent();
6317 return Legalized;
6320 unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
6321 unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
6322 LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);
6324 // Create new source vectors by concatenating the initial
6325 // source vectors with undefined vectors of the same size.
6326 auto Undef = MIRBuilder.buildUndef(SrcTy);
6327 SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
6328 SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
6329 MOps1[0] = MI.getOperand(1).getReg();
6330 MOps2[0] = MI.getOperand(2).getReg();
6332 auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
6333 auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);
6335 // Readjust mask for new input vector length.
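// e.g. with SrcNumElts = 4 padded to 8 lanes per input, a mask index of 5
// (lane 1 of the second input) becomes 9 = 5 + (8 - 4).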
6336 SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
6337 for (unsigned I = 0; I != MaskNumElts; ++I) {
6338 int Idx = Mask[I];
6339 if (Idx >= static_cast<int>(SrcNumElts))
6340 Idx += PaddedMaskNumElts - SrcNumElts;
6341 MappedOps[I] = Idx;
6344 // If we got more elements than required, extract subvector.
6345 if (MaskNumElts != PaddedMaskNumElts) {
6346 auto Shuffle =
6347 MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);
6349 SmallVector<Register, 16> Elts(MaskNumElts);
6350 for (unsigned I = 0; I < MaskNumElts; ++I) {
6351 Elts[I] =
6352 MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
6353 .getReg(0);
6355 MIRBuilder.buildBuildVector(DstReg, Elts);
6356 } else {
6357 MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
6360 MI.eraseFromParent();
6361 return LegalizerHelper::LegalizeResult::Legalized;
6364 LegalizerHelper::LegalizeResult
6365 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
6366 unsigned int TypeIdx, LLT MoreTy) {
6367 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
6368 ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6369 unsigned NumElts = DstTy.getNumElements();
6370 unsigned WidenNumElts = MoreTy.getNumElements();
6372 if (DstTy.isVector() && Src1Ty.isVector() &&
6373 DstTy.getNumElements() != Src1Ty.getNumElements()) {
6374 return equalizeVectorShuffleLengths(MI);
6377 if (TypeIdx != 0)
6378 return UnableToLegalize;
6380 // Expect a canonicalized shuffle.
6381 if (DstTy != Src1Ty || DstTy != Src2Ty)
6382 return UnableToLegalize;
6384 moreElementsVectorSrc(MI, MoreTy, 1);
6385 moreElementsVectorSrc(MI, MoreTy, 2);
6387 // Adjust mask based on new input vector length.
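// e.g. when widening a <2 x s32> shuffle to <4 x s32>, a mask index of 2
// (lane 0 of the second input) becomes 4 = 2 - 2 + 4.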
6388 SmallVector<int, 16> NewMask(WidenNumElts, -1);
6389 for (unsigned I = 0; I != NumElts; ++I) {
6390 int Idx = Mask[I];
6391 if (Idx < static_cast<int>(NumElts))
6392 NewMask[I] = Idx;
6393 else
6394 NewMask[I] = Idx - NumElts + WidenNumElts;
6396 moreElementsVectorDst(MI, MoreTy, 0);
6397 MIRBuilder.setInstrAndDebugLoc(MI);
6398 MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
6399 MI.getOperand(1).getReg(),
6400 MI.getOperand(2).getReg(), NewMask);
6401 MI.eraseFromParent();
6402 return Legalized;
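// Schoolbook long multiplication on NarrowTy-sized parts: DstRegs[K] sums the
// low halves of all Src1[i] * Src2[K - i] products, the high halves of the
// products from column K - 1, and the accumulated carries. For a 2 x 2 part
// multiply: Dst0 = lo(a0*b0) and Dst1 = lo(a1*b0) + lo(a0*b1) + hi(a0*b0).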
6405 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
6406 ArrayRef<Register> Src1Regs,
6407 ArrayRef<Register> Src2Regs,
6408 LLT NarrowTy) {
6409 MachineIRBuilder &B = MIRBuilder;
6410 unsigned SrcParts = Src1Regs.size();
6411 unsigned DstParts = DstRegs.size();
6413 unsigned DstIdx = 0; // Low bits of the result.
6414 Register FactorSum =
6415 B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
6416 DstRegs[DstIdx] = FactorSum;
6418 unsigned CarrySumPrevDstIdx;
6419 SmallVector<Register, 4> Factors;
6421 for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
6422 // Collect low parts of muls for DstIdx.
6423 for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
6424 i <= std::min(DstIdx, SrcParts - 1); ++i) {
6425 MachineInstrBuilder Mul =
6426 B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
6427 Factors.push_back(Mul.getReg(0));
6428 }
6429 // Collect high parts of muls from previous DstIdx.
6430 for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
6431 i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
6432 MachineInstrBuilder Umulh =
6433 B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
6434 Factors.push_back(Umulh.getReg(0));
6435 }
6436 // Add CarrySum from additions calculated for previous DstIdx.
6437 if (DstIdx != 1) {
6438 Factors.push_back(CarrySumPrevDstIdx);
6439 }
6441 Register CarrySum;
6442 // Add all factors and accumulate all carries into CarrySum.
6443 if (DstIdx != DstParts - 1) {
6444 MachineInstrBuilder Uaddo =
6445 B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
6446 FactorSum = Uaddo.getReg(0);
6447 CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
6448 for (unsigned i = 2; i < Factors.size(); ++i) {
6449 MachineInstrBuilder Uaddo =
6450 B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
6451 FactorSum = Uaddo.getReg(0);
6452 MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
6453 CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
6454 }
6455 } else {
6456 // Since the value for the next index is not calculated, neither is CarrySum.
6457 FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
6458 for (unsigned i = 2; i < Factors.size(); ++i)
6459 FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
6460 }
6462 CarrySumPrevDstIdx = CarrySum;
6463 DstRegs[DstIdx] = FactorSum;
6464 Factors.clear();
6465 }
6466 }
6468 LegalizerHelper::LegalizeResult
6469 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
6470 LLT NarrowTy) {
6471 if (TypeIdx != 0)
6472 return UnableToLegalize;
6474 Register DstReg = MI.getOperand(0).getReg();
6475 LLT DstType = MRI.getType(DstReg);
6476 // FIXME: add support for vector types
6477 if (DstType.isVector())
6478 return UnableToLegalize;
6480 unsigned Opcode = MI.getOpcode();
6481 unsigned OpO, OpE, OpF;
6482 switch (Opcode) {
6483 case TargetOpcode::G_SADDO:
6484 case TargetOpcode::G_SADDE:
6485 case TargetOpcode::G_UADDO:
6486 case TargetOpcode::G_UADDE:
6487 case TargetOpcode::G_ADD:
6488 OpO = TargetOpcode::G_UADDO;
6489 OpE = TargetOpcode::G_UADDE;
6490 OpF = TargetOpcode::G_UADDE;
6491 if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
6492 OpF = TargetOpcode::G_SADDE;
6493 break;
6494 case TargetOpcode::G_SSUBO:
6495 case TargetOpcode::G_SSUBE:
6496 case TargetOpcode::G_USUBO:
6497 case TargetOpcode::G_USUBE:
6498 case TargetOpcode::G_SUB:
6499 OpO = TargetOpcode::G_USUBO;
6500 OpE = TargetOpcode::G_USUBE;
6501 OpF = TargetOpcode::G_USUBE;
6502 if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
6503 OpF = TargetOpcode::G_SSUBE;
6504 break;
6505 default:
6506 llvm_unreachable("Unexpected add/sub opcode!");
6507 }
6509 // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
6510 unsigned NumDefs = MI.getNumExplicitDefs();
6511 Register Src1 = MI.getOperand(NumDefs).getReg();
6512 Register Src2 = MI.getOperand(NumDefs + 1).getReg();
6513 Register CarryDst, CarryIn;
6514 if (NumDefs == 2)
6515 CarryDst = MI.getOperand(1).getReg();
6516 if (MI.getNumOperands() == NumDefs + 3)
6517 CarryIn = MI.getOperand(NumDefs + 2).getReg();
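// The loop below forms a ripple-carry chain across the narrowed parts, the
// classic multi-word add. A scalar sketch with 32-bit parts (illustration
// only; Src1/Src2/Dst stand for the part vectors built here):
//
//   uint32_t Carry = 0; // or the incoming carry for G_UADDE/G_USUBE
//   for (unsigned I = 0; I != Parts; ++I) {
//     uint64_t S = (uint64_t)Src1[I] + Src2[I] + Carry;
//     Dst[I] = (uint32_t)S;
//     Carry = (uint32_t)(S >> 32);
//   }
//
// The first part uses OpO (G_UADDO/G_USUBO), middle parts use OpE, and only
// the final part uses OpF, the signed flavor, when a signed overflow flag
// is required.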
6519 LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
6520 LLT LeftoverTy, DummyTy;
6521 SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
6522 extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
6523 MIRBuilder, MRI);
6524 extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
6525 MRI);
6527 int NarrowParts = Src1Regs.size();
6528 Src1Regs.append(Src1Left);
6529 Src2Regs.append(Src2Left);
6530 DstRegs.reserve(Src1Regs.size());
6532 for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
6533 Register DstReg =
6534 MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
6535 Register CarryOut;
6536 // Forward the final carry-out to the destination register
6537 if (i == e - 1 && CarryDst)
6538 CarryOut = CarryDst;
6539 else
6540 CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
6542 if (!CarryIn) {
6543 MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
6544 {Src1Regs[i], Src2Regs[i]});
6545 } else if (i == e - 1) {
6546 MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
6547 {Src1Regs[i], Src2Regs[i], CarryIn});
6548 } else {
6549 MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
6550 {Src1Regs[i], Src2Regs[i], CarryIn});
6551 }
6553 DstRegs.push_back(DstReg);
6554 CarryIn = CarryOut;
6555 }
6556 insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
6557 ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
6558 ArrayRef(DstRegs).drop_front(NarrowParts));
6560 MI.eraseFromParent();
6561 return Legalized;
6562 }
6564 LegalizerHelper::LegalizeResult
6565 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
6566 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
6568 LLT Ty = MRI.getType(DstReg);
6569 if (Ty.isVector())
6570 return UnableToLegalize;
6572 unsigned Size = Ty.getSizeInBits();
6573 unsigned NarrowSize = NarrowTy.getSizeInBits();
6574 if (Size % NarrowSize != 0)
6575 return UnableToLegalize;
6577 unsigned NumParts = Size / NarrowSize;
6578 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
6579 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
6581 SmallVector<Register, 2> Src1Parts, Src2Parts;
6582 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
6583 extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
6584 extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
6585 multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
6587 // Take only the high half of the registers if this is a high multiply.
6588 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
6589 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
6590 MI.eraseFromParent();
6591 return Legalized;
6592 }
6594 LegalizerHelper::LegalizeResult
6595 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
6596 LLT NarrowTy) {
6597 if (TypeIdx != 0)
6598 return UnableToLegalize;
6600 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
6602 Register Src = MI.getOperand(1).getReg();
6603 LLT SrcTy = MRI.getType(Src);
6605 // If all finite floats fit into the narrowed integer type, we can just swap
6606 // out the result type. This is practically only useful for conversions from
6607 // half to at least 16-bits, so just handle the one case.
6608 if (SrcTy.getScalarType() != LLT::scalar(16) ||
6609 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
6610 return UnableToLegalize;
6612 Observer.changingInstr(MI);
6613 narrowScalarDst(MI, NarrowTy, 0,
6614 IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
6615 Observer.changedInstr(MI);
6616 return Legalized;
6617 }
6619 LegalizerHelper::LegalizeResult
6620 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
6621 LLT NarrowTy) {
6622 if (TypeIdx != 1)
6623 return UnableToLegalize;
6625 uint64_t NarrowSize = NarrowTy.getSizeInBits();
6627 int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6628 // FIXME: add support for when SizeOp1 isn't an exact multiple of
6629 // NarrowSize.
6630 if (SizeOp1 % NarrowSize != 0)
6631 return UnableToLegalize;
6632 int NumParts = SizeOp1 / NarrowSize;
6634 SmallVector<Register, 2> SrcRegs, DstRegs;
6635 SmallVector<uint64_t, 2> Indexes;
6636 extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
6637 MIRBuilder, MRI);
6639 Register OpReg = MI.getOperand(0).getReg();
6640 uint64_t OpStart = MI.getOperand(2).getImm();
6641 uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
6642 for (int i = 0; i < NumParts; ++i) {
6643 unsigned SrcStart = i * NarrowSize;
6645 if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
6646 // No part of the extract uses this subregister, ignore it.
6647 continue;
6648 } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
6649 // The entire subregister is extracted, forward the value.
6650 DstRegs.push_back(SrcRegs[i]);
6651 continue;
6652 }
6654 // Determine which bits of the extract this source piece provides:
6655 // ExtractOffset is the offset within SrcRegs[i], SegSize the bit count.
6656 int64_t ExtractOffset;
6657 uint64_t SegSize;
6658 if (OpStart < SrcStart) {
6659 ExtractOffset = 0;
6660 SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
6661 } else {
6662 ExtractOffset = OpStart - SrcStart;
6663 SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
6664 }
6666 Register SegReg = SrcRegs[i];
6667 if (ExtractOffset != 0 || SegSize != NarrowSize) {
6668 // A genuine extract is needed.
6669 SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
6670 MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
6671 }
6673 DstRegs.push_back(SegReg);
6674 }
6676 Register DstReg = MI.getOperand(0).getReg();
6677 if (MRI.getType(DstReg).isVector())
6678 MIRBuilder.buildBuildVector(DstReg, DstRegs);
6679 else if (DstRegs.size() > 1)
6680 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
6681 else
6682 MIRBuilder.buildCopy(DstReg, DstRegs[0]);
6683 MI.eraseFromParent();
6684 return Legalized;
6685 }
6687 LegalizerHelper::LegalizeResult
6688 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
6689 LLT NarrowTy) {
6690 // FIXME: Don't know how to handle secondary types yet.
6691 if (TypeIdx != 0)
6692 return UnableToLegalize;
6694 SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
6695 SmallVector<uint64_t, 2> Indexes;
6696 LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
6697 LLT LeftoverTy;
6698 extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
6699 LeftoverRegs, MIRBuilder, MRI);
6701 SrcRegs.append(LeftoverRegs);
6703 uint64_t NarrowSize = NarrowTy.getSizeInBits();
6704 Register OpReg = MI.getOperand(2).getReg();
6705 uint64_t OpStart = MI.getOperand(3).getImm();
6706 uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
6707 for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
6708 unsigned DstStart = I * NarrowSize;
6710 if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
6711 // The entire subregister is defined by this insert, forward the new
6712 // value.
6713 DstRegs.push_back(OpReg);
6714 continue;
6715 }
6717 Register SrcReg = SrcRegs[I];
6718 if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
6719 // The leftover reg is smaller than NarrowTy, so we need to extend it.
6720 SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
6721 MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
6722 }
6724 if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
6725 // No part of the insert affects this subregister, forward the original.
6726 DstRegs.push_back(SrcReg);
6727 continue;
6728 }
6730 // Determine how the inserted value overlaps this destination piece:
6731 // ExtractOffset is the offset into OpReg, InsertOffset into the piece.
6732 int64_t ExtractOffset, InsertOffset;
6733 uint64_t SegSize;
6734 if (OpStart < DstStart) {
6735 InsertOffset = 0;
6736 ExtractOffset = DstStart - OpStart;
6737 SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
6738 } else {
6739 InsertOffset = OpStart - DstStart;
6740 ExtractOffset = 0;
6741 SegSize =
6742 std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
6743 }
6745 Register SegReg = OpReg;
6746 if (ExtractOffset != 0 || SegSize != OpSize) {
6747 // A genuine extract is needed.
6748 SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
6749 MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
6750 }
6752 Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
6753 MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
6754 DstRegs.push_back(DstReg);
6755 }
6757 uint64_t WideSize = DstRegs.size() * NarrowSize;
6758 Register DstReg = MI.getOperand(0).getReg();
6759 if (WideSize > RegTy.getSizeInBits()) {
6760 Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
6761 MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
6762 MIRBuilder.buildTrunc(DstReg, MergeReg);
6763 } else
6764 MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
6766 MI.eraseFromParent();
6767 return Legalized;
6768 }
6770 LegalizerHelper::LegalizeResult
6771 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
6772 LLT NarrowTy) {
6773 Register DstReg = MI.getOperand(0).getReg();
6774 LLT DstTy = MRI.getType(DstReg);
6776 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
6778 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6779 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
6780 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6781 LLT LeftoverTy;
6782 if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
6783 Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
6784 return UnableToLegalize;
6786 LLT Unused;
6787 if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
6788 Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
6789 llvm_unreachable("inconsistent extractParts result");
6791 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6792 auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
6793 {Src0Regs[I], Src1Regs[I]});
6794 DstRegs.push_back(Inst.getReg(0));
6795 }
6797 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6798 auto Inst = MIRBuilder.buildInstr(
6799 MI.getOpcode(),
6800 {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
6801 DstLeftoverRegs.push_back(Inst.getReg(0));
6802 }
6804 insertParts(DstReg, DstTy, NarrowTy, DstRegs,
6805 LeftoverTy, DstLeftoverRegs);
6807 MI.eraseFromParent();
6808 return Legalized;
6809 }
6811 LegalizerHelper::LegalizeResult
6812 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
6813 LLT NarrowTy) {
6814 if (TypeIdx != 0)
6815 return UnableToLegalize;
6817 auto [DstReg, SrcReg] = MI.getFirst2Regs();
6819 LLT DstTy = MRI.getType(DstReg);
6820 if (DstTy.isVector())
6821 return UnableToLegalize;
6823 SmallVector<Register, 8> Parts;
6824 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
6825 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
6826 buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
6828 MI.eraseFromParent();
6829 return Legalized;
6830 }
6832 LegalizerHelper::LegalizeResult
6833 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
6834 LLT NarrowTy) {
6835 if (TypeIdx != 0)
6836 return UnableToLegalize;
6838 Register CondReg = MI.getOperand(1).getReg();
6839 LLT CondTy = MRI.getType(CondReg);
6840 if (CondTy.isVector()) // TODO: Handle vselect
6841 return UnableToLegalize;
6843 Register DstReg = MI.getOperand(0).getReg();
6844 LLT DstTy = MRI.getType(DstReg);
6846 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6847 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6848 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
6849 LLT LeftoverTy;
6850 if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
6851 Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
6852 return UnableToLegalize;
6854 LLT Unused;
6855 if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
6856 Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
6857 llvm_unreachable("inconsistent extractParts result");
6859 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6860 auto Select = MIRBuilder.buildSelect(NarrowTy,
6861 CondReg, Src1Regs[I], Src2Regs[I]);
6862 DstRegs.push_back(Select.getReg(0));
6863 }
6865 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6866 auto Select = MIRBuilder.buildSelect(
6867 LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
6868 DstLeftoverRegs.push_back(Select.getReg(0));
6869 }
6871 insertParts(DstReg, DstTy, NarrowTy, DstRegs,
6872 LeftoverTy, DstLeftoverRegs);
6874 MI.eraseFromParent();
6875 return Legalized;
6876 }
6878 LegalizerHelper::LegalizeResult
6879 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
6880 LLT NarrowTy) {
6881 if (TypeIdx != 1)
6882 return UnableToLegalize;
6884 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6885 unsigned NarrowSize = NarrowTy.getSizeInBits();
6887 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6888 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
6890 MachineIRBuilder &B = MIRBuilder;
6891 auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
6892 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
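// Worked example (illustration): for a 64-bit source split into 32-bit
// halves, the value 1 has Hi == 0, so the result is 32 + ctlz32(1) == 63;
// for 1 << 40, Hi == 1 << 8 is nonzero and the result is ctlz32(Hi) == 23.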
6893 auto C_0 = B.buildConstant(NarrowTy, 0);
6894 auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
6895 UnmergeSrc.getReg(1), C_0);
6896 auto LoCTLZ = IsUndef ?
6897 B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
6898 B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
6899 auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
6900 auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
6901 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
6902 B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
6904 MI.eraseFromParent();
6905 return Legalized;
6906 }
6908 return UnableToLegalize;
6909 }
6911 LegalizerHelper::LegalizeResult
6912 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
6913 LLT NarrowTy) {
6914 if (TypeIdx != 1)
6915 return UnableToLegalize;
6917 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6918 unsigned NarrowSize = NarrowTy.getSizeInBits();
6920 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6921 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
6923 MachineIRBuilder &B = MIRBuilder;
6924 auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
6925 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
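// Worked example (illustration): for a 64-bit source split into 32-bit
// halves, 1 << 40 has Lo == 0, so the result is cttz32(1 << 8) + 32 == 40;
// the value 8 has Lo != 0 and the result is cttz32(8) == 3.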
6926 auto C_0 = B.buildConstant(NarrowTy, 0);
6927 auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
6928 UnmergeSrc.getReg(0), C_0);
6929 auto HiCTTZ = IsUndef ?
6930 B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
6931 B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
6932 auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
6933 auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
6934 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
6935 B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
6937 MI.eraseFromParent();
6938 return Legalized;
6939 }
6941 return UnableToLegalize;
6942 }
6944 LegalizerHelper::LegalizeResult
6945 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
6946 LLT NarrowTy) {
6947 if (TypeIdx != 1)
6948 return UnableToLegalize;
6950 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6951 unsigned NarrowSize = NarrowTy.getSizeInBits();
6953 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6954 auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
6956 auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
6957 auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
6958 MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
6960 MI.eraseFromParent();
6961 return Legalized;
6962 }
6964 return UnableToLegalize;
6965 }
6967 LegalizerHelper::LegalizeResult
6968 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
6969 LLT NarrowTy) {
6970 if (TypeIdx != 1)
6971 return UnableToLegalize;
6973 MachineIRBuilder &B = MIRBuilder;
6974 Register ExpReg = MI.getOperand(2).getReg();
6975 LLT ExpTy = MRI.getType(ExpReg);
6977 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
6979 // Clamp the exponent to the range of the target type.
6980 auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
6981 auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
6982 auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
6983 auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
6985 auto Trunc = B.buildTrunc(NarrowTy, Clamp);
6986 Observer.changingInstr(MI);
6987 MI.getOperand(2).setReg(Trunc.getReg(0));
6988 Observer.changedInstr(MI);
6989 return Legalized;
6990 }
6992 LegalizerHelper::LegalizeResult
6993 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
6994 unsigned Opc = MI.getOpcode();
6995 const auto &TII = MIRBuilder.getTII();
6996 auto isSupported = [this](const LegalityQuery &Q) {
6997 auto QAction = LI.getAction(Q).Action;
6998 return QAction == Legal || QAction == Libcall || QAction == Custom;
6999 };
7000 switch (Opc) {
7001 default:
7002 return UnableToLegalize;
7003 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
7004 // This trivially expands to CTLZ.
7005 Observer.changingInstr(MI);
7006 MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
7007 Observer.changedInstr(MI);
7008 return Legalized;
7009 }
7010 case TargetOpcode::G_CTLZ: {
7011 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7012 unsigned Len = SrcTy.getSizeInBits();
7014 if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7015 // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
7016 auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
7017 auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
7018 auto ICmp = MIRBuilder.buildICmp(
7019 CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
7020 auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
7021 MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
7022 MI.eraseFromParent();
7023 return Legalized;
7024 }
7025 // for now, we do this:
7026 // NewLen = NextPowerOf2(Len);
7027 // x = x | (x >> 1);
7028 // x = x | (x >> 2);
7029 // ...
7030 // x = x | (x >>16);
7031 // x = x | (x >>32); // for 64-bit input
7032 // up to NewLen/2
7033 // return Len - popcount(x);
7035 // Ref: "Hacker's Delight" by Henry Warren
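// Concretely, for a 32-bit source the loop below emits the C equivalent of
// (sketch, not the emitted MIR):
//
//   x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
//   return 32 - popcount(x); // every bit below the leading one is now set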
7036 Register Op = SrcReg;
7037 unsigned NewLen = PowerOf2Ceil(Len);
7038 for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
7039 auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
7040 auto MIBOp = MIRBuilder.buildOr(
7041 SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
7042 Op = MIBOp.getReg(0);
7043 }
7044 auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
7045 MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
7046 MIBPop);
7047 MI.eraseFromParent();
7048 return Legalized;
7049 }
7050 case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
7051 // This trivially expands to CTTZ.
7052 Observer.changingInstr(MI);
7053 MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
7054 Observer.changedInstr(MI);
7055 return Legalized;
7056 }
7057 case TargetOpcode::G_CTTZ: {
7058 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7060 unsigned Len = SrcTy.getSizeInBits();
7061 if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
7062 // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
7063 // zero.
7064 auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
7065 auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
7066 auto ICmp = MIRBuilder.buildICmp(
7067 CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
7068 auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
7069 MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
7070 MI.eraseFromParent();
7071 return Legalized;
7072 }
7073 // for now, we use: { return popcount(~x & (x - 1)); }
7074 // unless the target has ctlz but not ctpop, in which case we use:
7075 // { return 32 - nlz(~x & (x-1)); }
7076 // Ref: "Hacker's Delight" by Henry Warren
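// Worked example (illustration): x = 0b1000 gives x - 1 = 0b0111 and
// ~x & (x - 1) = 0b0111, whose popcount, 3, is the number of trailing
// zeros; x == 0 gives ~0 & (0 - 1) = all ones, i.e. the bit width.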
7077 auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
7078 auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
7079 auto MIBTmp = MIRBuilder.buildAnd(
7080 SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
7081 if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
7082 isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
7083 auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
7084 MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
7085 MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
7086 MI.eraseFromParent();
7087 return Legalized;
7088 }
7089 Observer.changingInstr(MI);
7090 MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
7091 MI.getOperand(1).setReg(MIBTmp.getReg(0));
7092 Observer.changedInstr(MI);
7093 return Legalized;
7094 }
7095 case TargetOpcode::G_CTPOP: {
7096 Register SrcReg = MI.getOperand(1).getReg();
7097 LLT Ty = MRI.getType(SrcReg);
7098 unsigned Size = Ty.getSizeInBits();
7099 MachineIRBuilder &B = MIRBuilder;
7101 // Count set bits in blocks of 2 bits. The default approach would be
7102 //   B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
7103 // We use the following formula instead:
7104 //   B2Count = val - { (val >> 1) & 0x55555555 }
7105 // since it gives the same result in blocks of 2 with one instruction fewer.
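// Worked example (illustration), one byte at a time: val = 0xFF gives
// (val >> 1) & 0x55 == 0x55 and B2Count = 0xFF - 0x55 = 0xAA, i.e. each
// 2-bit block holds 0b10 == 2, the number of set bits in that block.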
7106 auto C_1 = B.buildConstant(Ty, 1);
7107 auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
7108 APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
7109 auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
7110 auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
7111 auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
7113 // To get the count in blocks of 4, add the values from adjacent blocks of 2.
7114 // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
7115 auto C_2 = B.buildConstant(Ty, 2);
7116 auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
7117 APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
7118 auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
7119 auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
7120 auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
7121 auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
7123 // For the count in blocks of 8 bits we don't have to mask the high 4 bits
7124 // before the addition, since each count sits in the range {0,...,8} and 4
7125 // bits suffice to hold it. After the addition the high 4 bits still hold
7126 // the count of the high 4-bit block; set them to zero to get the 8-bit result.
7127 // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
7128 auto C_4 = B.buildConstant(Ty, 4);
7129 auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
7130 auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
7131 APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
7132 auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
7133 auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
7135 assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
7136 // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
7137 // by this bitmask sets the 8 MSBs of ResTmp to the sum of all 8-bit B8Counts.
7138 auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
7140 // Shift count result from 8 high bits to low bits.
7141 auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
7143 auto IsMulSupported = [this](const LLT Ty) {
7144 auto Action = LI.getAction({TargetOpcode::G_MUL, {Ty}}).Action;
7145 return Action == Legal || Action == WidenScalar || Action == Custom;
7146 };
7147 if (IsMulSupported(Ty)) {
7148 auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
7149 B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
7150 } else {
7151 auto ResTmp = B8Count;
7152 for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
7153 auto ShiftC = B.buildConstant(Ty, Shift);
7154 auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
7155 ResTmp = B.buildAdd(Ty, ResTmp, Shl);
7156 }
7157 B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
7158 }
7159 MI.eraseFromParent();
7160 return Legalized;
7161 }
7162 }
7163 }
7165 // Check that (every element of) Reg is undef or not an exact multiple of BW.
7166 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7167 Register Reg, unsigned BW) {
7168 return matchUnaryPredicate(
7169 MRI, Reg,
7170 [=](const Constant *C) {
7171 // Null constant here means an undef.
7172 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
7173 return !CI || CI->getValue().urem(BW) != 0;
7174 },
7175 /*AllowUndefs*/ true);
7176 }
7178 LegalizerHelper::LegalizeResult
7179 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7180 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7181 LLT Ty = MRI.getType(Dst);
7182 LLT ShTy = MRI.getType(Z);
7184 unsigned BW = Ty.getScalarSizeInBits();
7186 if (!isPowerOf2_32(BW))
7187 return UnableToLegalize;
7189 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7190 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7192 if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
7193 // fshl X, Y, Z -> fshr X, Y, -Z
7194 // fshr X, Y, Z -> fshl X, Y, -Z
7195 auto Zero = MIRBuilder.buildConstant(ShTy, 0);
7196 Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
7197 } else {
7198 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7199 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7200 auto One = MIRBuilder.buildConstant(ShTy, 1);
7201 if (IsFSHL) {
7202 Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7203 X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
7204 } else {
7205 X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7206 Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
7207 }
7209 Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
7210 }
7212 MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
7213 MI.eraseFromParent();
7214 return Legalized;
7215 }
7217 LegalizerHelper::LegalizeResult
7218 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
7219 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7220 LLT Ty = MRI.getType(Dst);
7221 LLT ShTy = MRI.getType(Z);
7223 const unsigned BW = Ty.getScalarSizeInBits();
7224 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7226 Register ShX, ShY;
7227 Register ShAmt, InvShAmt;
7229 // FIXME: Emit optimized urem by constant instead of letting it expand later.
7230 if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
7231 // fshl: X << C | Y >> (BW - C)
7232 // fshr: X << (BW - C) | Y >> C
7233 // where C = Z % BW is not zero
7234 auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
7235 ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
7236 InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
7237 ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
7238 ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
7239 } else {
7240 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
7241 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
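// The extra single-bit shift in each formula keeps every shift amount
// strictly below BW, so the expansion never relies on a shift by BW being
// defined when Z % BW == 0.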
7242 auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
7243 if (isPowerOf2_32(BW)) {
7244 // Z % BW -> Z & (BW - 1)
7245 ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
7246 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
7247 auto NotZ = MIRBuilder.buildNot(ShTy, Z);
7248 InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
7249 } else {
7250 auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
7251 ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
7252 InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
7253 }
7255 auto One = MIRBuilder.buildConstant(ShTy, 1);
7256 if (IsFSHL) {
7257 ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
7258 auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
7259 ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
7260 } else {
7261 auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
7262 ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
7263 ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
7264 }
7265 }
7267 MIRBuilder.buildOr(Dst, ShX, ShY);
7268 MI.eraseFromParent();
7269 return Legalized;
7270 }
7272 LegalizerHelper::LegalizeResult
7273 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7274 // These operations approximately do the following (while avoiding undefined
7275 // shifts by BW):
7276 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7277 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7278 Register Dst = MI.getOperand(0).getReg();
7279 LLT Ty = MRI.getType(Dst);
7280 LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
7282 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7283 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7285 // TODO: Use smarter heuristic that accounts for vector legalization.
7286 if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
7287 return lowerFunnelShiftAsShifts(MI);
7289 // This only works for powers of 2; fall back to shifts if it fails.
7290 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7291 if (Result == UnableToLegalize)
7292 return lowerFunnelShiftAsShifts(MI);
7293 return Result;
7294 }
7296 LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
7297 auto [Dst, Src] = MI.getFirst2Regs();
7298 LLT DstTy = MRI.getType(Dst);
7299 LLT SrcTy = MRI.getType(Src);
7301 uint32_t DstTySize = DstTy.getSizeInBits();
7302 uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
7303 uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
7305 if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
7306 !isPowerOf2_32(SrcTyScalarSize))
7307 return UnableToLegalize;
7309 // The step between the extends is too large; split it by creating an
7310 // intermediate extend instruction.
7311 if (SrcTyScalarSize * 2 < DstTyScalarSize) {
7312 LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
7313 // If the destination type is illegal, split it into multiple statements
7314 // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
7315 auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
7316 // Unmerge the vector
7317 LLT EltTy = MidTy.changeElementCount(
7318 MidTy.getElementCount().divideCoefficientBy(2));
7319 auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);
7321 // ZExt the vectors
7322 LLT ZExtResTy = DstTy.changeElementCount(
7323 DstTy.getElementCount().divideCoefficientBy(2));
7324 auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
7325 {UnmergeSrc.getReg(0)});
7326 auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
7327 {UnmergeSrc.getReg(1)});
7329 // Merge the ending vectors
7330 MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});
7332 MI.eraseFromParent();
7333 return Legalized;
7334 }
7335 return UnableToLegalize;
7336 }
7338 LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
7340 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
7341 // Similar to how operand splitting is done in SelectionDAG, we can handle
7342 // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
7343 // %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
7344 // %lo16(<4 x s16>) = G_TRUNC %inlo
7345 // %hi16(<4 x s16>) = G_TRUNC %inhi
7346 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
7347 // %res(<8 x s8>) = G_TRUNC %in16
7349 assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
7351 Register DstReg = MI.getOperand(0).getReg();
7352 Register SrcReg = MI.getOperand(1).getReg();
7353 LLT DstTy = MRI.getType(DstReg);
7354 LLT SrcTy = MRI.getType(SrcReg);
7356 if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
7357 isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
7358 isPowerOf2_32(SrcTy.getNumElements()) &&
7359 isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
7360 // Split input type.
7361 LLT SplitSrcTy = SrcTy.changeElementCount(
7362 SrcTy.getElementCount().divideCoefficientBy(2));
7364 // First, split the source into two smaller vectors.
7365 SmallVector<Register, 2> SplitSrcs;
7366 extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);
7368 // Truncate the splits into intermediate narrower elements.
7369 LLT InterTy;
7370 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
7371 InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
7372 else
7373 InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
7374 for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
7375 SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
7376 }
7378 // Combine the new truncates into one vector
7379 auto Merge = MIRBuilder.buildMergeLikeInstr(
7380 DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);
7382 // Truncate the new vector to the final result type
7383 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
7384 MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
7385 else
7386 MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));
7388 MI.eraseFromParent();
7390 return Legalized;
7391 }
7392 return UnableToLegalize;
7393 }
7395 LegalizerHelper::LegalizeResult
7396 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
7397 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7398 auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
7399 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7400 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7401 auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
7402 MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
7403 MI.eraseFromParent();
7404 return Legalized;
7405 }
7407 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
7408 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7410 unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
7411 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7413 MIRBuilder.setInstrAndDebugLoc(MI);
7415 // If a rotate in the other direction is supported, use it.
7416 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7417 if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
7418 isPowerOf2_32(EltSizeInBits))
7419 return lowerRotateWithReverseRotate(MI);
7421 // If a funnel shift is supported, use it.
7422 unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
7423 unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
7424 bool IsFShLegal = false;
7425 if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
7426 LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
7427 auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
7428 Register R3) {
7429 MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
7430 MI.eraseFromParent();
7431 return Legalized;
7432 };
7433 // If a funnel shift in the other direction is supported, use it.
7434 if (IsFShLegal) {
7435 return buildFunnelShift(FShOpc, Dst, Src, Amt);
7436 } else if (isPowerOf2_32(EltSizeInBits)) {
7437 Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
7438 return buildFunnelShift(RevFsh, Dst, Src, Amt);
7439 }
7440 }
7442 auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
7443 unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
7444 unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
7445 auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
7446 Register ShVal;
7447 Register RevShiftVal;
7448 if (isPowerOf2_32(EltSizeInBits)) {
7449 // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
7450 // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
7451 auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
7452 auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
7453 ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
7454 auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
7455 RevShiftVal =
7456 MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
7457 } else {
7458 // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
7459 // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
7460 auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
7461 auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
7462 ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
7463 auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
7464 auto One = MIRBuilder.buildConstant(AmtTy, 1);
7465 auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
7466 RevShiftVal =
7467 MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
7468 }
7469 MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
7470 MI.eraseFromParent();
7471 return Legalized;
7472 }
7474 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
7475 // representation.
7476 LegalizerHelper::LegalizeResult
7477 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
7478 auto [Dst, Src] = MI.getFirst2Regs();
7479 const LLT S64 = LLT::scalar(64);
7480 const LLT S32 = LLT::scalar(32);
7481 const LLT S1 = LLT::scalar(1);
7483 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7485 // unsigned cul2f(ulong u) {
7486 // uint lz = clz(u);
7487 // uint e = (u != 0) ? 127U + 63U - lz : 0;
7488 // u = (u << lz) & 0x7fffffffffffffffUL;
7489 // ulong t = u & 0xffffffffffUL;
7490 // uint v = (e << 23) | (uint)(u >> 40);
7491 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
7492 // return as_float(v + r);
7493 // }
7495 auto Zero32 = MIRBuilder.buildConstant(S32, 0);
7496 auto Zero64 = MIRBuilder.buildConstant(S64, 0);
7498 auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
7500 auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
7501 auto Sub = MIRBuilder.buildSub(S32, K, LZ);
7503 auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
7504 auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
7506 auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
7507 auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
7509 auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
7511 auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
7512 auto T = MIRBuilder.buildAnd(S64, U, Mask1);
7514 auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
7515 auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
7516 auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
7518 auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
7519 auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
7520 auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
7521 auto One = MIRBuilder.buildConstant(S32, 1);
7523 auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
7524 auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
7525 auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
7526 MIRBuilder.buildAdd(Dst, V, R);
7528 MI.eraseFromParent();
7529 return Legalized;
7530 }
7532 // Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
7533 // operations and G_SITOFP
7534 LegalizerHelper::LegalizeResult
7535 LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
7536 auto [Dst, Src] = MI.getFirst2Regs();
7537 const LLT S64 = LLT::scalar(64);
7538 const LLT S32 = LLT::scalar(32);
7539 const LLT S1 = LLT::scalar(1);
7541 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7543 // For i64 values below 2^63 (i.e. nonnegative when viewed as signed) we
7544 // simply reuse SITOFP. Otherwise, divide the i64 by 2, round the result by
7545 // ORing in the lowest bit saved before the division, convert to float with
7546 // SITOFP, and multiply the result by 2.
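// A scalar sketch of the idea (illustration only, assuming IEEE binary32):
//
//   float U64ToF32(uint64_t U) {
//     if ((int64_t)U >= 0)
//       return (float)(int64_t)U;      // value fits, plain SITOFP
//     uint64_t R = (U >> 1) | (U & 1); // halve, keep the sticky bit
//     float F = (float)(int64_t)R;
//     return F + F;                    // scale back by 2
//   }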
7547 auto One = MIRBuilder.buildConstant(S64, 1);
7548 auto Zero = MIRBuilder.buildConstant(S64, 0);
7549 // Result if Src < INT_MAX
7550 auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
7551 // Result if Src >= INT_MAX
7552 auto Halved = MIRBuilder.buildLShr(S64, Src, One);
7553 auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
7554 auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
7555 auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
7556 auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
7557 // Check whether the original value exceeds the signed 64-bit range by doing
7558 // a signed compare with zero, and use that to pick one of the two conversions.
7559 auto IsLarge =
7560 MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
7561 MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
7563 MI.eraseFromParent();
7564 return Legalized;
7565 }
7567 // Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
7568 // IEEE double representation.
7569 LegalizerHelper::LegalizeResult
7570 LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
7571 auto [Dst, Src] = MI.getFirst2Regs();
7572 const LLT S64 = LLT::scalar(64);
7573 const LLT S32 = LLT::scalar(32);
7575 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
7577 // We build the double from two 32-bit halves whose implicit exponents
7578 // differ by 32. Note that + and - are float operations that adjust the
7579 // implicit leading one; the bases 2^52 and 2^84 are for illustrative purposes.
7581 // X = 2^52 * 1.0...LowBits
7582 // Y = 2^84 * 1.0...HighBits
7583 // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
7584 // = - 2^52 * 1.0...HighBits
7585 // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
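// A scalar sketch of the same trick (illustration only):
//
//   double U64ToF64(uint64_t U) {
//     double Lo = llvm::bit_cast<double>(0x4330000000000000ULL | (U & 0xffffffffu));
//     double Hi = llvm::bit_cast<double>(0x4530000000000000ULL | (U >> 32));
//     return (Hi - llvm::bit_cast<double>(0x4530000000100000ULL)) + Lo;
//   }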
7586 auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
7587 auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
7588 auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
7589 auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84);
7590 auto HalfWidth = MIRBuilder.buildConstant(S64, 32);
7592 auto LowBits = MIRBuilder.buildTrunc(S32, Src);
7593 LowBits = MIRBuilder.buildZExt(S64, LowBits);
7594 auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits);
7595 auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth);
7596 auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits);
7597 auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP);
7598 MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);
7600 MI.eraseFromParent();
7601 return Legalized;
7602 }
7604 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
7605 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7607 if (SrcTy == LLT::scalar(1)) {
7608 auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
7609 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7610 MIRBuilder.buildSelect(Dst, Src, True, False);
7611 MI.eraseFromParent();
7612 return Legalized;
7613 }
7615 if (SrcTy != LLT::scalar(64))
7616 return UnableToLegalize;
7618 if (DstTy == LLT::scalar(32))
7619 // TODO: SelectionDAG has several alternative expansions to port which may
7620 // be more reasonable depending on the available instructions. We also need
7621 // a more advanced mechanism to choose an optimal version depending on
7622 // target features such as sitofp or CTLZ availability.
7623 return lowerU64ToF32WithSITOFP(MI);
7625 if (DstTy == LLT::scalar(64))
7626 return lowerU64ToF64BitFloatOps(MI);
7628 return UnableToLegalize;
7629 }
7631 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
7632 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7634 const LLT S64 = LLT::scalar(64);
7635 const LLT S32 = LLT::scalar(32);
7636 const LLT S1 = LLT::scalar(1);
7638 if (SrcTy == S1) {
7639 auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
7640 auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7641 MIRBuilder.buildSelect(Dst, Src, True, False);
7642 MI.eraseFromParent();
7643 return Legalized;
7644 }
7646 if (SrcTy != S64)
7647 return UnableToLegalize;
7649 if (DstTy == S32) {
7650 // signed cl2f(long l) {
7651 // long s = l >> 63;
7652 // float r = cul2f((l + s) ^ s);
7653 // return s ? -r : r;
7654 // }
7655 Register L = Src;
7656 auto SignBit = MIRBuilder.buildConstant(S64, 63);
7657 auto S = MIRBuilder.buildAShr(S64, L, SignBit);
7659 auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
7660 auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
7661 auto R = MIRBuilder.buildUITOFP(S32, Xor);
7663 auto RNeg = MIRBuilder.buildFNeg(S32, R);
7664 auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
7665 MIRBuilder.buildConstant(S64, 0));
7666 MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
7667 MI.eraseFromParent();
7668 return Legalized;
7669 }
7671 return UnableToLegalize;
7672 }
7674 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
7675 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7676 const LLT S64 = LLT::scalar(64);
7677 const LLT S32 = LLT::scalar(32);
7679 if (SrcTy != S64 && SrcTy != S32)
7680 return UnableToLegalize;
7681 if (DstTy != S32 && DstTy != S64)
7682 return UnableToLegalize;
7684 // FPTOSI gives the same result as FPTOUI for positive signed integers.
7685 // FPTOUI additionally has to handle fp values that convert to unsigned
7686 // integers >= 2^31 for float or 2^63 for double; call this bound 2^Exp.
7688 APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
7689 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
7690 : APFloat::IEEEdouble(),
7691 APInt::getZero(SrcTy.getSizeInBits()));
7692 TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
7694 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
7696 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
7697 // For an fp Value greater than or equal to Threshold (2^Exp), use FPTOSI on
7698 // (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the result.
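// Scalar sketch of that strategy for the f32 -> u32 case (illustration only):
//
//   uint32_t F32ToU32(float F) {
//     if (!(F >= 0x1p31f))             // ULT compare: also taken for NaN
//       return (uint32_t)(int32_t)F;   // below 2^31, FPTOSI suffices
//     return (uint32_t)(int32_t)(F - 0x1p31f) ^ 0x80000000u; // add 2^31 back
//   }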
7699 MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
7700 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
7701 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
7702 MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
7704 const LLT S1 = LLT::scalar(1);
7706 MachineInstrBuilder FCMP =
7707 MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
7708 MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
7710 MI.eraseFromParent();
7711 return Legalized;
7712 }
7714 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
7715 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7716 const LLT S64 = LLT::scalar(64);
7717 const LLT S32 = LLT::scalar(32);
7719 // FIXME: Only f32 to i64 conversions are supported.
7720 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
7721 return UnableToLegalize;
7723 // Expand f32 -> i64 conversion
7724 // This algorithm comes from compiler-rt's implementation of fixsfdi:
7725 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
7727 unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
7729 auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
7730 auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
7732 auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
7733 auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
7735 auto SignMask = MIRBuilder.buildConstant(SrcTy,
7736 APInt::getSignMask(SrcEltBits));
7737 auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
7738 auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
7739 auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
7740 Sign = MIRBuilder.buildSExt(DstTy, Sign);
7742 auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
7743 auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
7744 auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
7746 auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
7747 R = MIRBuilder.buildZExt(DstTy, R);
7749 auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
7750 auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
7751 auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
7752 auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
7754 auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
7755 auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
7757 const LLT S1 = LLT::scalar(1);
7758 auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
7759 S1, Exponent, ExponentLoBit);
7761 R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
7763 auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
7764 auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
7766 auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
7768 auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
7769 S1, Exponent, ZeroSrcTy);
7771 auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
7772 MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
7774 MI.eraseFromParent();
7775 return Legalized;
7776 }
7778 LegalizerHelper::LegalizeResult
7779 LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
7780 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7782 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
7783 unsigned SatWidth = DstTy.getScalarSizeInBits();
7785 // Determine minimum and maximum integer values and their corresponding
7786 // floating-point values.
7787 APInt MinInt, MaxInt;
7788 if (IsSigned) {
7789 MinInt = APInt::getSignedMinValue(SatWidth);
7790 MaxInt = APInt::getSignedMaxValue(SatWidth);
7791 } else {
7792 MinInt = APInt::getMinValue(SatWidth);
7793 MaxInt = APInt::getMaxValue(SatWidth);
7794 }
7796 const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
7797 APFloat MinFloat(Semantics);
7798 APFloat MaxFloat(Semantics);
7800 APFloat::opStatus MinStatus =
7801 MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
7802 APFloat::opStatus MaxStatus =
7803 MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
7804 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
7805 !(MaxStatus & APFloat::opStatus::opInexact);
7807 // If the integer bounds are exactly representable as floats, emit a
7808 // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
7809 // and selects.
7810 if (AreExactFloatBounds) {
7811 // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
7812 auto MaxC = MIRBuilder.buildFConstant(SrcTy, MinFloat);
7813 auto MaxP = MIRBuilder.buildFCmp(CmpInst::FCMP_ULT,
7814 SrcTy.changeElementSize(1), Src, MaxC);
7815 auto Max = MIRBuilder.buildSelect(SrcTy, MaxP, Src, MaxC);
7816 // Clamp by MaxFloat from above. NaN cannot occur.
7817 auto MinC = MIRBuilder.buildFConstant(SrcTy, MaxFloat);
7818 auto MinP =
7819 MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Max,
7820 MinC, MachineInstr::FmNoNans);
7821 auto Min =
7822 MIRBuilder.buildSelect(SrcTy, MinP, Max, MinC, MachineInstr::FmNoNans);
7823 // Convert clamped value to integer. In the unsigned case we're done,
7824 // because we mapped NaN to MinFloat, which will cast to zero.
7825 if (!IsSigned) {
7826 MIRBuilder.buildFPTOUI(Dst, Min);
7827 MI.eraseFromParent();
7828 return Legalized;
7829 }
7831 // Otherwise, select 0 if Src is NaN.
7832 auto FpToInt = MIRBuilder.buildFPTOSI(DstTy, Min);
7833 auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
7834 DstTy.changeElementSize(1), Src, Src);
7835 MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0),
7836 FpToInt);
7837 MI.eraseFromParent();
7838 return Legalized;
7839 }
7841 // Result of direct conversion. The assumption here is that the operation is
7842 // non-trapping and it's fine to apply it to an out-of-range value if we
7843 // select it away later.
7844 auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(DstTy, Src)
7845 : MIRBuilder.buildFPTOUI(DstTy, Src);
7847 // If Src ULT MinFloat, select MinInt. In particular, this also selects
7848 // MinInt if Src is NaN.
7849 auto ULT =
7850 MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, SrcTy.changeElementSize(1), Src,
7851 MIRBuilder.buildFConstant(SrcTy, MinFloat));
7852 auto Max = MIRBuilder.buildSelect(
7853 DstTy, ULT, MIRBuilder.buildConstant(DstTy, MinInt), FpToInt);
7854 // If Src OGT MaxFloat, select MaxInt.
7855 auto OGT =
7856 MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Src,
7857 MIRBuilder.buildFConstant(SrcTy, MaxFloat));
7859 // In the unsigned case we are done, because we mapped NaN to MinInt, which
7860 // is already zero.
7861 if (!IsSigned) {
7862 MIRBuilder.buildSelect(Dst, OGT, MIRBuilder.buildConstant(DstTy, MaxInt),
7863 Max);
7864 MI.eraseFromParent();
7865 return Legalized;
7866 }
7868 // Otherwise, select 0 if Src is NaN.
7869 auto Min = MIRBuilder.buildSelect(
7870 DstTy, OGT, MIRBuilder.buildConstant(DstTy, MaxInt), Max);
7871 auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
7872 DstTy.changeElementSize(1), Src, Src);
7873 MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0), Min);
7874 MI.eraseFromParent();
7875 return Legalized;
7876 }
7878 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
7879 LegalizerHelper::LegalizeResult
7880 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
7881 const LLT S1 = LLT::scalar(1);
7882 const LLT S32 = LLT::scalar(32);
7884 auto [Dst, Src] = MI.getFirst2Regs();
7885 assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
7886 MRI.getType(Src).getScalarType() == LLT::scalar(64));
7888 if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
7889 return UnableToLegalize;
7891 if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
7892 unsigned Flags = MI.getFlags();
7893 auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
7894 MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
7895 MI.eraseFromParent();
7896 return Legalized;
7897 }
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
      S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // B = clamp(1-E, 0, 13);
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                               D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

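  // E == 1039 is the f64 exponent field 0x7ff rebased to the f16 bias
  // (2047 - 1023 + 15 = 1039), i.e. the source was Inf or NaN; in that case
  // select the Inf/NaN pattern I computed above.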
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit.
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
  auto [DstTy, SrcTy] = MI.getFirst2LLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S16 = LLT::scalar(16);

  if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
    return lowerFPTRUNC_F64_TO_F16(MI);

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

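  // Raising to an integer power is a special case of the general pow:
  // G_FPOWI x, n  ==>  G_FPOW x, sitofp(n).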
  auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
  MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::G_SMIN:
    return CmpInst::ICMP_SLT;
  case TargetOpcode::G_SMAX:
    return CmpInst::ICMP_SGT;
  case TargetOpcode::G_UMIN:
    return CmpInst::ICMP_ULT;
  case TargetOpcode::G_UMAX:
    return CmpInst::ICMP_UGT;
  default:
    llvm_unreachable("not in integer min/max");
  }
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
  auto [Dst, Src0, Src1] = MI.getFirst3Regs();

  const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
  LLT CmpType = MRI.getType(Dst).changeElementSize(1);
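
  // Min/max lowers to a compare plus a select, e.g.
  //   G_SMIN %dst, %a, %b  ==>  %c = G_ICMP slt %a, %b
  //                             %dst = G_SELECT %c, %a, %b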
  auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
  MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
  GSUCmp *Cmp = cast<GSUCmp>(&MI);

  Register Dst = Cmp->getReg(0);
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Cmp->getReg(1));
  LLT CmpTy = DstTy.changeElementSize(1);

  CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SLT
                                       : CmpInst::Predicate::ICMP_ULT;
  CmpInst::Predicate GTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SGT
                                       : CmpInst::Predicate::ICMP_UGT;

  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());
  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());

  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
  auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
  if (TLI.shouldExpandCmpUsingSelects(getApproximateEVTForLLT(SrcTy, Ctx)) ||
      BC == TargetLowering::UndefinedBooleanContent) {
    auto One = MIRBuilder.buildConstant(DstTy, 1);
    auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero);

    auto MinusOne = MIRBuilder.buildConstant(DstTy, -1);
    MIRBuilder.buildSelect(Dst, IsLT, MinusOne, SelectZeroOrOne);
  } else {
    if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
      std::swap(IsGT, IsLT);
    // Extend boolean results to DstTy, which is at least i2, before subtracting
    // them.
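    // With zero-or-one booleans this computes (LHS > RHS) - (LHS < RHS),
    // which is 1, 0 or -1 as required. With zero-or-negative-one booleans
    // each flag is 0 or -1, so the operands were swapped above to keep the
    // same sign convention.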
    unsigned BoolExtOp =
        MIRBuilder.getBoolExtOp(DstTy.isVector(), /*isFP=*/false);
    IsGT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsGT});
    IsLT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsLT});
    MIRBuilder.buildSub(Dst, IsGT, IsLT);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();
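
  // The result combines the magnitude bits of Src0 with the sign bit of Src1:
  //   Dst = (Src0 & ~SignMask) | (Src1 & SignMask)
  // When the operand widths differ, Src1's sign bit is first shifted into
  // Src0's sign-bit position.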
  auto SignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getSignMask(Src0Size));

  auto NotSignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();

  // We masked the sign bit and the not-sign bit, so these are disjoint.
  Flags |= MachineInstr::Disjoint;

  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
  unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
    TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;

  auto [Dst, Src0, Src1] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Dst);

  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // Insert canonicalizes if it's possible we need to quiet to get correct
    // sNaN behavior.

    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-snan instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
    if (!isKnownNeverSNaN(Src0, MRI))
      Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);

    if (!isKnownNeverSNaN(Src1, MRI))
      Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
  }

  // If there are no nans, it's safe to simply replace this with the non-IEEE
  // version.
  MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
  // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
                                  Flags);
  MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
  auto [DstReg, X] = MI.getFirst2Regs();
  const unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // round(x) =>
  //  t = trunc(x);
  //  d = fabs(x - t);
  //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
  //  return t + o;

  auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);

  auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
  auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);

  auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
  auto Cmp =
      MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);

  // Could emit G_UITOFP instead
  auto One = MIRBuilder.buildFConstant(Ty, 1.0);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
  auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
  auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);

  MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
  auto [DstReg, SrcReg] = MI.getFirst2Regs();
  unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);
  const LLT CondTy = Ty.changeElementSize(1);

  // result = trunc(src);
  // if (src < 0.0 && src != result)
  //   result += -1.0.

  auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
  auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);

  auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
                                  SrcReg, Zero, Flags);
  auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
                                      SrcReg, Trunc, Flags);
  auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
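  // The s1 condition sign-extends through G_SITOFP: true becomes -1.0 and
  // false becomes 0.0, which is exactly the adjustment floor() needs.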
  auto AddVal = MIRBuilder.buildSITOFP(Ty, And);

  MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
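
  // Accumulate the remaining parts with zext/shl/or; e.g. merging four s8
  // parts into an s32 computes
  //   p0 | (zext(p1) << 8) | (zext(p2) << 16) | (zext(p3) << 24).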
  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}

/// Lower a vector extract or insert by writing the vector to a stack temporary
/// and reloading the element or vector.
///
/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
///  =>
///  %stack_temp = G_FRAME_INDEX
///  G_STORE %vec, %stack_temp
///  %idx = clamp(%idx, %vec.getNumElements())
///  %element_ptr = G_PTR_ADD %stack_temp, %idx
///  %dst = G_LOAD %element_ptr
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcVec = MI.getOperand(1).getReg();
  Register InsertVal;
  if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
    InsertVal = MI.getOperand(2).getReg();

  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  LLT VecTy = MRI.getType(SrcVec);
  LLT EltTy = VecTy.getElementType();
  unsigned NumElts = VecTy.getNumElements();

  int64_t IdxVal;
  if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal >= 0 && IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);

    if (InsertVal) {
      SrcRegs[IdxVal] = MI.getOperand(2).getReg();
      MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
    } else {
      MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  if (!EltTy.isByteSized()) { // Not implemented.
    LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
    return UnableToLegalize;
  }

  unsigned EltBytes = EltTy.getSizeInBytes();
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  Align EltAlign;

  MachinePointerInfo PtrInfo;
  auto StackTemp = createStackTemporary(
      TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
  MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);

  // Get the pointer to the element, and be sure not to hit undefined behavior
  // if the index is out of bounds.
  Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);

  if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
    int64_t Offset = IdxVal * EltBytes;
    PtrInfo = PtrInfo.getWithOffset(Offset);
    EltAlign = commonAlignment(VecAlign, Offset);
  } else {
    // We lose information with a variable offset.
    EltAlign = getStackTemporaryAlignment(EltTy);
    PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
  }

  if (InsertVal) {
    // Write the inserted element
    MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);

    // Reload the whole vector.
    MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
  } else {
    MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
  auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
      MI.getFirst3RegLLTs();
  LLT IdxTy = LLT::scalar(32);

  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  Register Undef;
  SmallVector<Register, 32> BuildVec;
  LLT EltTy = DstTy.getScalarType();

  for (int Idx : Mask) {
    if (Idx < 0) {
      if (!Undef.isValid())
        Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
      BuildVec.push_back(Undef);
      continue;
    }

    if (Src0Ty.isScalar()) {
      BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
    } else {
      int NumElts = Src0Ty.getNumElements();
      Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
      int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
      auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
      auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
      BuildVec.push_back(Extract.getReg(0));
    }
  }

  if (DstTy.isScalar())
    MIRBuilder.buildCopy(DstReg, BuildVec[0]);
  else
    MIRBuilder.buildBuildVector(DstReg, BuildVec);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
      MI.getFirst4RegLLTs();

  if (VecTy.isScalableVector())
    report_fatal_error("Cannot expand masked_compress for scalable vectors.");

  Align VecAlign = getStackTemporaryAlignment(VecTy);
  MachinePointerInfo PtrInfo;
  Register StackPtr =
      createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign,
                           PtrInfo)
          .getReg(0);
  MachinePointerInfo ValPtrInfo =
      MachinePointerInfo::getUnknownStack(*MI.getMF());

  LLT IdxTy = LLT::scalar(32);
  LLT ValTy = VecTy.getElementType();
  Align ValAlign = getStackTemporaryAlignment(ValTy);

  auto OutPos = MIRBuilder.buildConstant(IdxTy, 0);

  bool HasPassthru =
      MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;

  if (HasPassthru)
    MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign);

  Register LastWriteVal;
  std::optional<APInt> PassthruSplatVal =
      isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);

  if (PassthruSplatVal.has_value()) {
    LastWriteVal =
        MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0);
  } else if (HasPassthru) {
    auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
    Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
                                     {LLT::scalar(32)}, {Popcount});

    Register LastElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
    LastWriteVal =
        MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign)
            .getReg(0);
  }

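  // Main loop: walk the source vector lane by lane. Every element is stored
  // at the current output position in the stack temporary, and the
  // zero-extended mask bit then advances the position, so unselected elements
  // are overwritten and selected ones pack towards the front.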
  unsigned NumElmts = VecTy.getNumElements();
  for (unsigned I = 0; I < NumElmts; ++I) {
    auto Idx = MIRBuilder.buildConstant(IdxTy, I);
    auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx);
    Register ElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
    MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign);

    LLT MaskITy = MaskTy.getElementType();
    auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx);
    if (MaskITy.getSizeInBits() > 1)
      MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI);

    MaskI = MIRBuilder.buildZExt(IdxTy, MaskI);
    OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI);

    if (HasPassthru && I == NumElmts - 1) {
      auto EndOfVector =
          MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
      auto AllLanesSelected = MIRBuilder.buildICmp(
          CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector);
      OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy},
                                     {OutPos, EndOfVector});
      ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));

      LastWriteVal =
          MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal)
              .getReg(0);
      MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign);
    }
  }

  // TODO: Use StackPtr's FrameIndex alignment.
  MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign);

  MI.eraseFromParent();
  return Legalized;
}

Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
                                                    Register AllocSize,
                                                    Align Alignment,
                                                    LLT PtrTy) {
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
  SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);

  // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
  // have to generate an extra instruction to negate the alloc and then use
  // G_PTR_ADD to add the negative offset.
  auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
  if (Alignment > Align(1)) {
    APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
    AlignMask.negate();
    auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
    Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
  }

  return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
  const auto &MF = *MI.getMF();
  const auto &TFI = *MF.getSubtarget().getFrameLowering();
  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
    return UnableToLegalize;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);

  MIRBuilder.buildCopy(SPReg, SPTmp);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerStackSave(MachineInstr &MI) {
  Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
  if (!StackPtr)
    return UnableToLegalize;

  MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
  Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
  if (!StackPtr)
    return UnableToLegalize;

  MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(DstReg, SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(DstReg, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  uint64_t Offset = MI.getOperand(3).getImm();

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Insert sub-vector or one element
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(UnmergeInsertSrc.getReg(i));
        }
      } else {
        DstElts.push_back(InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
  auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
      MI.getFirst4RegLLTs();
  const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;

  LLT Ty = Dst0Ty;
  LLT BoolTy = Dst1Ty;

  Register NewDst0 = MRI.cloneVirtualRegister(Dst0);

  if (IsAdd)
    MIRBuilder.buildAdd(NewDst0, LHS, RHS);
  else
    MIRBuilder.buildSub(NewDst0, LHS, RHS);

  // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.

  auto Zero = MIRBuilder.buildConstant(Ty, 0);

  // For an addition, the result should be less than one of the operands (LHS)
  // if and only if the other operand (RHS) is negative, otherwise there will
  // be overflow.
  // For a subtraction, the result should be less than one of the operands
  // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
  // otherwise there will be overflow.
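  // e.g. for s8: 100 + 100 wraps to -56, which is less than LHS even though
  // RHS is positive, so the two compares below disagree and the XOR reports
  // overflow; for 100 + (-100) = 0 both compares are true and no overflow is
  // reported.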
  auto ResultLowerThanLHS =
      MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
  auto ConditionRHS = MIRBuilder.buildICmp(
      IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);

  MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);

  MIRBuilder.buildCopy(Dst0, NewDst0);
  MI.eraseFromParent();

  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
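    // e.g. for s8 sadd.sat(100, 100): hi = 127 - smax(100, 0) = 27 and
    // lo = -128 - smin(100, 0) = -128, so b is clamped to 27 and the result
    // is 100 + 27 = 127, the saturated value.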
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0);
  Register Ov = OverflowRes.getReg(1);
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
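    // On overflow the wrapped result has the wrong sign, so its sign bit
    // selects the saturation value: tmp >>s 31 is 0 or -1, and adding
    // 0x80000000 turns that into INT_MIN or INT_MAX respectively.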
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerShlSat(MachineInstr &MI) {
  assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
          MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
         "Expected shlsat opcode!");
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);

  unsigned BW = Ty.getScalarSizeInBits();
  auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
  auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
                       : MIRBuilder.buildLShr(Ty, Result, RHS);

  MachineInstrBuilder SatVal;
  if (IsSigned) {
    auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
    auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
    auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
                                    MIRBuilder.buildConstant(Ty, 0));
    SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
  } else {
    SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
  }
  auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
  MIRBuilder.buildSelect(Res, Ov, SatVal, Result);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT Ty = MRI.getType(Src);
  unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
  unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
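
  // e.g. for an s32 value 0xAABBCCDD: the code below first swaps the outer
  // bytes (giving 0xDD....AA), then each loop iteration swaps one inner byte
  // pair (here i = 1 swaps BB and CC), producing 0xDDCCBBAA.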
  // Swap most and least significant byte, set remaining bytes in Res to zero.
  auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
  auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
  auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
  auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);

  // Set i-th high/low byte in Res to i-th low/high byte from Src.
  for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
    // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
    APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
    auto Mask = MIRBuilder.buildConstant(Ty, APMask);
    auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
    // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
    auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
    auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
    Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
    // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
    auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
    auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
    Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
  }
  Res.getInstr()->getOperand(0).setReg(Dst);

  MI.eraseFromParent();
  return Legalized;
}

//{ (Src & Mask) >> N } | { (Src << N) & Mask }
static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
                                 MachineInstrBuilder Src, const APInt &Mask) {
  const LLT Ty = Dst.getLLTTy(*B.getMRI());
  MachineInstrBuilder C_N = B.buildConstant(Ty, N);
  MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
  auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
  auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
  return B.buildOr(Dst, LHS, RHS);
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT Ty = MRI.getType(Src);
  unsigned Size = Ty.getScalarSizeInBits();

  if (Size >= 8) {
    MachineInstrBuilder BSWAP =
        MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});

    // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
    //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
    // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
    MachineInstrBuilder Swap4 =
        SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));

    // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
    //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
    // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
    MachineInstrBuilder Swap2 =
        SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));

    // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
    // 6|7
    //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
    // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
    SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
  } else {
    // Expand bitreverse for types smaller than 8 bits.
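    // Bit I of Src is moved to bit position Size-1-I by shifting left or
    // right as needed, masking out everything else, and OR-ing the pieces
    // together.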
    MachineInstrBuilder Tmp;
    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
      MachineInstrBuilder Tmp2;
      if (I < J) {
        auto ShAmt = MIRBuilder.buildConstant(Ty, J - I);
        Tmp2 = MIRBuilder.buildShl(Ty, Src, ShAmt);
      } else {
        auto ShAmt = MIRBuilder.buildConstant(Ty, I - J);
        Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt);
      }

      auto Mask = MIRBuilder.buildConstant(Ty, 1ULL << J);
      Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask);
      if (I == 0)
        Tmp = Tmp2;
      else
        Tmp = MIRBuilder.buildOr(Ty, Tmp, Tmp2);
    }
    MIRBuilder.buildCopy(Dst, Tmp);
  }

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  const MDString *RegStr = cast<MDString>(
      cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg.isValid())
    return UnableToLegalize;

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  Register Result = MI.getOperand(0).getReg();
  LLT OrigTy = MRI.getType(Result);
  auto SizeInBits = OrigTy.getScalarSizeInBits();
  LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
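
  // mulh returns the high half of the 2N-bit product:
  //   trunc((ext(a) * ext(b)) >> N)
  // with sign or zero extension chosen by the opcode.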
  auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
  auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
  auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
  unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;

  auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
  auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
  MIRBuilder.buildTrunc(Result, Shifted);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());

  if (Mask == fcNone) {
    MIRBuilder.buildConstant(DstReg, 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(DstReg, 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());

  LLT IntTy = LLT::scalar(BitSize);
  if (SrcTy.isVector())
    IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
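  // Reinterpret the value as a same-sized integer; every class test then
  // becomes an integer comparison against the IEEE-754 bit patterns, e.g.
  // infinities have all exponent bits set and a zero mantissa.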
  auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
  auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);

  auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);

  auto Res = MIRBuilder.buildConstant(DstTy, 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                     ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
                                     ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                    ExpMaskC);
    auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       ExpBits, ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(IntTy, 1);
    auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
                             MIRBuilder.buildConstant(IntTy, AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
                                       InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
                                            Abs, InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                             MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
    else if (PartialCheck == fcPosNormal) {
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(DstReg, Res);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement G_SELECT in terms of XOR, AND, OR.
  auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
      MI.getFirst4RegLLTs();

  bool IsEltPtr = DstTy.isPointerOrPointerVector();
  if (IsEltPtr) {
    LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
    LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
    Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
    Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
    DstTy = NewTy;
  }

  if (MaskTy.isScalar()) {
    // Turn the scalar condition into a vector condition mask if needed.

    Register MaskElt = MaskReg;

    // The condition was potentially zero extended before, but we want a sign
    // extended boolean.
    if (MaskTy != LLT::scalar(1))
      MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);

    // Continue the sign extension (or truncate) to match the data type.
    MaskElt =
        MIRBuilder.buildSExtOrTrunc(DstTy.getScalarType(), MaskElt).getReg(0);

    if (DstTy.isVector()) {
      // Generate a vector splat idiom.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
      MaskReg = ShufSplat.getReg(0);
    } else {
      MaskReg = MaskElt;
    }
    MaskTy = DstTy;
  } else if (!DstTy.isVector()) {
    // Cannot handle the case that mask is a vector and dst is a scalar.
    return UnableToLegalize;
  }

  if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  if (IsEltPtr) {
    auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
    MIRBuilder.buildIntToPtr(DstReg, Or);
  } else {
    MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  }
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
  // Split DIVREM into individual instructions.
  unsigned Opcode = MI.getOpcode();

  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
                                        : TargetOpcode::G_UDIV,
      {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MIRBuilder.buildInstr(
      Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
                                        : TargetOpcode::G_UREM,
      {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_ASHR %a, scalar_size-1
  // %v2 = G_ADD %a, %v1
  // %res = G_XOR %v2, %v1
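  // The arithmetic shift yields 0 for non-negative %a (the add and xor are
  // then no-ops) and -1 for negative %a, where (%a + -1) ^ -1 == ~(%a - 1)
  // == -%a.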
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  Register OpReg = MI.getOperand(1).getReg();
  auto ShiftAmt =
      MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
  auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
  auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
  MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
  // Expand %res = G_ABS %a into:
  // %v1 = G_CONSTANT 0
  // %v2 = G_SUB %v1, %a
  // %res = G_SMAX %a, %v2
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(SrcReg);
  auto Zero = MIRBuilder.buildConstant(Ty, 0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg);
  MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  Register DestReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(SrcReg), IType = LLT::scalar(1);
  auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
  auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
  auto ICmp = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, IType, SrcReg, Zero);
  MIRBuilder.buildSelect(DestReg, ICmp, SrcReg, Sub);
  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  Register DstReg = MI.getOperand(0).getReg();

  LLT Ty = MRI.getType(DstReg);

  // Reset sign bit
  MIRBuilder.buildAnd(
      DstReg, SrcReg,
      MIRBuilder.buildConstant(
          Ty, APInt::getSignedMaxValue(Ty.getScalarSizeInBits())));

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
  Register SrcReg = MI.getOperand(1).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

  // The source could be a scalar if the IR type was <1 x sN>.
  if (SrcTy.isScalar()) {
    if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
      return UnableToLegalize; // FIXME: handle extension.
    // This can be just a plain copy.
    Observer.changingInstr(MI);
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
    Observer.changedInstr(MI);
    return Legalized;
  }
  return UnableToLegalize;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  Register ListPtr = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(ListPtr);

  // ListPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);

  const Align A(MI.getOperand(2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
    auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
    VAList = AndDst.getReg(0);
  }

  // Increment the pointer, VAList, to the next vaarg.
  // The list should be bumped by the size of the element at the current head
  // of the list.
  Register Dst = MI.getOperand(0).getReg();
  LLT LLTTy = MRI.getType(Dst);
  Type *Ty = getTypeForLLT(LLTTy, Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);

  // Store the incremented VAList to the legalized pointer.
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
  MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
  // Load the actual argument out of the pointer VAList.
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
  MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}

static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
  // On Darwin, -Os means optimize for size without hurting performance, so
  // only really optimize for size when -Oz (MinSize) is used.
  if (MF.getTarget().getTargetTriple().isOSDarwin())
    return MF.getFunction().hasMinSize();
  return MF.getFunction().hasOptSize();
}

9464 // Returns a list of types to use for memory op lowering in MemOps. A partial
9465 // port of findOptimalMemOpLowering in TargetLowering.
9466 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
9467 unsigned Limit, const MemOp &Op,
9468 unsigned DstAS, unsigned SrcAS,
9469 const AttributeList &FuncAttributes,
9470 const TargetLowering &TLI) {
9471 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
9472 return false;
9474 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
9476 if (Ty == LLT()) {
9477 // Use the largest scalar type whose alignment constraints are satisfied.
9478 // We only need to check DstAlign here as SrcAlign is always greater than or
9479 // equal to DstAlign (or zero).
9480 Ty = LLT::scalar(64);
9481 if (Op.isFixedDstAlign())
9482 while (Op.getDstAlign() < Ty.getSizeInBytes() &&
9483 !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
9484 Ty = LLT::scalar(Ty.getSizeInBytes());
9485 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
9486 // FIXME: check for the largest legal type we can load/store to.
9487 }
9489 unsigned NumMemOps = 0;
9490 uint64_t Size = Op.size();
9491 while (Size) {
9492 unsigned TySize = Ty.getSizeInBytes();
9493 while (TySize > Size) {
9494 // For now, only use non-vector loads / stores for the left-over pieces.
9495 LLT NewTy = Ty;
9496 // FIXME: check for mem op safety and legality of the types. Not all of
9497 // SDAGisms map cleanly to GISel concepts.
9498 if (NewTy.isVector())
9499 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
9500 NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
9501 unsigned NewTySize = NewTy.getSizeInBytes();
9502 assert(NewTySize > 0 && "Could not find appropriate type");
9504 // If the new LLT cannot cover all of the remaining bits, then consider
9505 // issuing an unaligned and overlapping load / store (or a pair of them).
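// Illustrative example: with 5 bytes remaining and Ty == s64, a single
// overlapping s64 access can replace an s32 + s8 pair, provided the target
// reports the misaligned access as fast.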
9506 unsigned Fast;
9507 // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
9508 MVT VT = getMVTForLLT(Ty);
9509 if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
9510 TLI.allowsMisalignedMemoryAccesses(
9511 VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
9512 MachineMemOperand::MONone, &Fast) &&
9513 Fast)
9514 TySize = Size;
9515 else {
9516 Ty = NewTy;
9517 TySize = NewTySize;
9521 if (++NumMemOps > Limit)
9522 return false;
9524 MemOps.push_back(Ty);
9525 Size -= TySize;
9526 }
9528 return true;
9529 }
9531 // Get a vectorized representation of the memset value operand, GISel edition.
9532 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
9533 MachineRegisterInfo &MRI = *MIB.getMRI();
9534 unsigned NumBits = Ty.getScalarSizeInBits();
9535 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
9536 if (!Ty.isVector() && ValVRegAndVal) {
9537 APInt Scalar = ValVRegAndVal->Value.trunc(8);
9538 APInt SplatVal = APInt::getSplat(NumBits, Scalar);
9539 return MIB.buildConstant(Ty, SplatVal).getReg(0);
9540 }
9542 // Extend the byte value to the larger type, and then multiply by a magic
9543 // value 0x010101... in order to replicate it across every byte.
9544 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
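// Illustrative example for an s32 store type and byte value 0xAB: zext gives
// 0x000000AB, and 0x000000AB * 0x01010101 == 0xABABABAB.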
9545 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
9546 return MIB.buildConstant(Ty, 0).getReg(0);
9547 }
9549 LLT ExtType = Ty.getScalarType();
9550 auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
9551 if (NumBits > 8) {
9552 APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
9553 auto MagicMI = MIB.buildConstant(ExtType, Magic);
9554 Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
9555 }
9557 // For vector types create a G_BUILD_VECTOR.
9558 if (Ty.isVector())
9559 Val = MIB.buildSplatBuildVector(Ty, Val).getReg(0);
9561 return Val;
9562 }
9564 LegalizerHelper::LegalizeResult
9565 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
9566 uint64_t KnownLen, Align Alignment,
9567 bool IsVolatile) {
9568 auto &MF = *MI.getParent()->getParent();
9569 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9570 auto &DL = MF.getDataLayout();
9571 LLVMContext &C = MF.getFunction().getContext();
9573 assert(KnownLen != 0 && "Have a zero length memset length!");
9575 bool DstAlignCanChange = false;
9576 MachineFrameInfo &MFI = MF.getFrameInfo();
9577 bool OptSize = shouldLowerMemFuncForSize(MF);
9579 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
9580 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
9581 DstAlignCanChange = true;
9583 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
9584 std::vector<LLT> MemOps;
9586 const auto &DstMMO = **MI.memoperands_begin();
9587 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
9589 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
9590 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
9592 if (!findGISelOptimalMemOpLowering(MemOps, Limit,
9593 MemOp::Set(KnownLen, DstAlignCanChange,
9594 Alignment,
9595 /*IsZeroMemset=*/IsZeroVal,
9596 /*IsVolatile=*/IsVolatile),
9597 DstPtrInfo.getAddrSpace(), ~0u,
9598 MF.getFunction().getAttributes(), TLI))
9599 return UnableToLegalize;
9601 if (DstAlignCanChange) {
9602 // Get an estimate of the type from the LLT.
9603 Type *IRTy = getTypeForLLT(MemOps[0], C);
9604 Align NewAlign = DL.getABITypeAlign(IRTy);
9605 if (NewAlign > Alignment) {
9606 Alignment = NewAlign;
9607 unsigned FI = FIDef->getOperand(1).getIndex();
9608 // Give the stack frame object a larger alignment if needed.
9609 if (MFI.getObjectAlign(FI) < Alignment)
9610 MFI.setObjectAlignment(FI, Alignment);
9611 }
9612 }
9614 MachineIRBuilder MIB(MI);
9615 // Find the largest store and generate the bit pattern for it.
9616 LLT LargestTy = MemOps[0];
9617 for (unsigned i = 1; i < MemOps.size(); i++)
9618 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
9619 LargestTy = MemOps[i];
9621 // The memset stored value is always defined as an s8, so in order to make it
9622 // work with larger store types we need to repeat the bit pattern across the
9623 // wider type.
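// Illustrative example: for a memset value of 0xAB with LargestTy == s64, the
// materialized pattern is 0xABABABABABABABAB; narrower stores can reuse it
// through a free truncate (see the loop below).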
9624 Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
9626 if (!MemSetValue)
9627 return UnableToLegalize;
9629 // Generate the stores. For each store type in the list, we generate the
9630 // matching store of that type to the destination address.
9631 LLT PtrTy = MRI.getType(Dst);
9632 unsigned DstOff = 0;
9633 unsigned Size = KnownLen;
9634 for (unsigned I = 0; I < MemOps.size(); I++) {
9635 LLT Ty = MemOps[I];
9636 unsigned TySize = Ty.getSizeInBytes();
9637 if (TySize > Size) {
9638 // Issuing an unaligned load / store pair that overlaps with the previous
9639 // pair. Adjust the offset accordingly.
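// Illustrative example: for KnownLen == 13 with MemOps == {s64, s64}, the
// second store is moved back to offset 5 so it overlaps the first by 3 bytes.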
9640 assert(I == MemOps.size() - 1 && I != 0);
9641 DstOff -= TySize - Size;
9642 }
9644 // If this store is smaller than the largest store, see whether we can get
9645 // the smaller value for free with a truncate.
9646 Register Value = MemSetValue;
9647 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
9648 MVT VT = getMVTForLLT(Ty);
9649 MVT LargestVT = getMVTForLLT(LargestTy);
9650 if (!LargestTy.isVector() && !Ty.isVector() &&
9651 TLI.isTruncateFree(LargestVT, VT))
9652 Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
9653 else
9654 Value = getMemsetValue(Val, Ty, MIB);
9655 if (!Value)
9656 return UnableToLegalize;
9657 }
9659 auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
9661 Register Ptr = Dst;
9662 if (DstOff != 0) {
9663 auto Offset =
9664 MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
9665 Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
9666 }
9668 MIB.buildStore(Value, Ptr, *StoreMMO);
9669 DstOff += Ty.getSizeInBytes();
9670 Size -= TySize;
9671 }
9673 MI.eraseFromParent();
9674 return Legalized;
9675 }
9677 LegalizerHelper::LegalizeResult
9678 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
9679 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9681 auto [Dst, Src, Len] = MI.getFirst3Regs();
9683 const auto *MMOIt = MI.memoperands_begin();
9684 const MachineMemOperand *MemOp = *MMOIt;
9685 bool IsVolatile = MemOp->isVolatile();
9687 // See if this is a constant length copy
9688 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
9689 // FIXME: support dynamically sized G_MEMCPY_INLINE
9690 assert(LenVRegAndVal &&
9691 "inline memcpy with dynamic size is not yet supported");
9692 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
9693 if (KnownLen == 0) {
9694 MI.eraseFromParent();
9695 return Legalized;
9696 }
9698 const auto &DstMMO = **MI.memoperands_begin();
9699 const auto &SrcMMO = **std::next(MI.memoperands_begin());
9700 Align DstAlign = DstMMO.getBaseAlign();
9701 Align SrcAlign = SrcMMO.getBaseAlign();
9703 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
9704 IsVolatile);
9705 }
9707 LegalizerHelper::LegalizeResult
9708 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
9709 uint64_t KnownLen, Align DstAlign,
9710 Align SrcAlign, bool IsVolatile) {
9711 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9712 return lowerMemcpy(MI, Dst, Src, KnownLen,
9713 std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
9714 IsVolatile);
9715 }
9717 LegalizerHelper::LegalizeResult
9718 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
9719 uint64_t KnownLen, uint64_t Limit, Align DstAlign,
9720 Align SrcAlign, bool IsVolatile) {
9721 auto &MF = *MI.getParent()->getParent();
9722 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9723 auto &DL = MF.getDataLayout();
9724 LLVMContext &C = MF.getFunction().getContext();
9726 assert(KnownLen != 0 && "Have a zero length memcpy length!");
9728 bool DstAlignCanChange = false;
9729 MachineFrameInfo &MFI = MF.getFrameInfo();
9730 Align Alignment = std::min(DstAlign, SrcAlign);
9732 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
9733 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
9734 DstAlignCanChange = true;
9736 // FIXME: infer better src pointer alignment like SelectionDAG does here.
9737 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
9738 // if the memcpy is in a tail call position.
9740 std::vector<LLT> MemOps;
9742 const auto &DstMMO = **MI.memoperands_begin();
9743 const auto &SrcMMO = **std::next(MI.memoperands_begin());
9744 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
9745 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
9747 if (!findGISelOptimalMemOpLowering(
9748 MemOps, Limit,
9749 MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
9750 IsVolatile),
9751 DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
9752 MF.getFunction().getAttributes(), TLI))
9753 return UnableToLegalize;
9755 if (DstAlignCanChange) {
9756 // Get an estimate of the type from the LLT.
9757 Type *IRTy = getTypeForLLT(MemOps[0], C);
9758 Align NewAlign = DL.getABITypeAlign(IRTy);
9760 // Don't promote to an alignment that would require dynamic stack
9761 // realignment.
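// (Illustrative: if MemOps[0] implies a 32-byte ABI alignment but the stack
// is only 16-byte aligned, NewAlign is clamped to 16 here.)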
9762 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
9763 if (!TRI->hasStackRealignment(MF))
9764 if (MaybeAlign StackAlign = DL.getStackAlignment())
9765 NewAlign = std::min(NewAlign, *StackAlign);
9767 if (NewAlign > Alignment) {
9768 Alignment = NewAlign;
9769 unsigned FI = FIDef->getOperand(1).getIndex();
9770 // Give the stack frame object a larger alignment if needed.
9771 if (MFI.getObjectAlign(FI) < Alignment)
9772 MFI.setObjectAlignment(FI, Alignment);
9773 }
9774 }
9776 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
9778 MachineIRBuilder MIB(MI);
9779 // Now we need to emit a load and store pair for each of the types we've
9780 // collected. I.e. for each type, generate a load of that type's width from
9781 // the source pointer, and then generate a corresponding store of the loaded
9782 // value to the dest buffer. This can result in a sequence of loads and stores
9783 // of mixed types, depending on what the target specifies as good types to use.
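// Illustrative example: a 13-byte memcpy with MemOps == {s64, s32, s8} emits
// load/store pairs at offsets 0, 8 and 12.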
9784 unsigned CurrOffset = 0;
9785 unsigned Size = KnownLen;
9786 for (auto CopyTy : MemOps) {
9787 // Issuing an unaligned load / store pair that overlaps with the previous
9788 // pair. Adjust the offset accordingly.
9789 if (CopyTy.getSizeInBytes() > Size)
9790 CurrOffset -= CopyTy.getSizeInBytes() - Size;
9792 // Construct MMOs for the accesses.
9793 auto *LoadMMO =
9794 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
9795 auto *StoreMMO =
9796 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
9798 // Create the load.
9799 Register LoadPtr = Src;
9800 Register Offset;
9801 if (CurrOffset != 0) {
9802 LLT SrcTy = MRI.getType(Src);
9803 Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
9804 .getReg(0);
9805 LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
9806 }
9807 auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
9809 // Create the store.
9810 Register StorePtr = Dst;
9811 if (CurrOffset != 0) {
9812 LLT DstTy = MRI.getType(Dst);
9813 StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
9814 }
9815 MIB.buildStore(LdVal, StorePtr, *StoreMMO);
9816 CurrOffset += CopyTy.getSizeInBytes();
9817 Size -= CopyTy.getSizeInBytes();
9818 }
9820 MI.eraseFromParent();
9821 return Legalized;
9822 }
9824 LegalizerHelper::LegalizeResult
9825 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
9826 uint64_t KnownLen, Align DstAlign, Align SrcAlign,
9827 bool IsVolatile) {
9828 auto &MF = *MI.getParent()->getParent();
9829 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9830 auto &DL = MF.getDataLayout();
9831 LLVMContext &C = MF.getFunction().getContext();
9833 assert(KnownLen != 0 && "Have a zero length memmove length!");
9835 bool DstAlignCanChange = false;
9836 MachineFrameInfo &MFI = MF.getFrameInfo();
9837 bool OptSize = shouldLowerMemFuncForSize(MF);
9838 Align Alignment = std::min(DstAlign, SrcAlign);
9840 MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
9841 if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
9842 DstAlignCanChange = true;
9844 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
9845 std::vector<LLT> MemOps;
9847 const auto &DstMMO = **MI.memoperands_begin();
9848 const auto &SrcMMO = **std::next(MI.memoperands_begin());
9849 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
9850 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
9852 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
9853 // to a bug in its findOptimalMemOpLowering implementation. For now do the
9854 // same thing here.
9855 if (!findGISelOptimalMemOpLowering(
9856 MemOps, Limit,
9857 MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
9858 /*IsVolatile*/ true),
9859 DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
9860 MF.getFunction().getAttributes(), TLI))
9861 return UnableToLegalize;
9863 if (DstAlignCanChange) {
9864 // Get an estimate of the type from the LLT.
9865 Type *IRTy = getTypeForLLT(MemOps[0], C);
9866 Align NewAlign = DL.getABITypeAlign(IRTy);
9868 // Don't promote to an alignment that would require dynamic stack
9869 // realignment.
9870 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
9871 if (!TRI->hasStackRealignment(MF))
9872 if (MaybeAlign StackAlign = DL.getStackAlignment())
9873 NewAlign = std::min(NewAlign, *StackAlign);
9875 if (NewAlign > Alignment) {
9876 Alignment = NewAlign;
9877 unsigned FI = FIDef->getOperand(1).getIndex();
9878 // Give the stack frame object a larger alignment if needed.
9879 if (MFI.getObjectAlign(FI) < Alignment)
9880 MFI.setObjectAlignment(FI, Alignment);
9881 }
9882 }
9884 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
9886 MachineIRBuilder MIB(MI);
9887 // Memmove requires that we perform the loads first before issuing the stores.
9888 // Apart from that, this loop is pretty much doing the same thing as the
9889 // memcpy codegen function.
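// Illustrative example: for memmove(p + 1, p, 8), storing the first chunk
// before all loads have completed would clobber source bytes that are still
// to be read; collecting every load into LoadVals first avoids that.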
9890 unsigned CurrOffset = 0;
9891 SmallVector<Register, 16> LoadVals;
9892 for (auto CopyTy : MemOps) {
9893 // Construct MMO for the load.
9894 auto *LoadMMO =
9895 MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
9897 // Create the load.
9898 Register LoadPtr = Src;
9899 if (CurrOffset != 0) {
9900 LLT SrcTy = MRI.getType(Src);
9901 auto Offset =
9902 MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
9903 LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
9904 }
9905 LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
9906 CurrOffset += CopyTy.getSizeInBytes();
9907 }
9909 CurrOffset = 0;
9910 for (unsigned I = 0; I < MemOps.size(); ++I) {
9911 LLT CopyTy = MemOps[I];
9912 // Now store the values loaded.
9913 auto *StoreMMO =
9914 MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
9916 Register StorePtr = Dst;
9917 if (CurrOffset != 0) {
9918 LLT DstTy = MRI.getType(Dst);
9919 auto Offset =
9920 MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
9921 StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
9922 }
9923 MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
9924 CurrOffset += CopyTy.getSizeInBytes();
9925 }
9926 MI.eraseFromParent();
9927 return Legalized;
9928 }
9930 LegalizerHelper::LegalizeResult
9931 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
9932 const unsigned Opc = MI.getOpcode();
9933 // This combine is fairly complex so it's not written with a separate
9934 // matcher function.
9935 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
9936 Opc == TargetOpcode::G_MEMSET) &&
9937 "Expected memcpy like instruction");
9939 auto MMOIt = MI.memoperands_begin();
9940 const MachineMemOperand *MemOp = *MMOIt;
9942 Align DstAlign = MemOp->getBaseAlign();
9943 Align SrcAlign;
9944 auto [Dst, Src, Len] = MI.getFirst3Regs();
9946 if (Opc != TargetOpcode::G_MEMSET) {
9947 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
9948 MemOp = *(++MMOIt);
9949 SrcAlign = MemOp->getBaseAlign();
9950 }
9952 // See if this is a constant length copy
9953 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
9954 if (!LenVRegAndVal)
9955 return UnableToLegalize;
9956 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
9958 if (KnownLen == 0) {
9959 MI.eraseFromParent();
9960 return Legalized;
9961 }
9963 bool IsVolatile = MemOp->isVolatile();
9964 if (Opc == TargetOpcode::G_MEMCPY_INLINE)
9965 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
9966 IsVolatile);
9968 // Don't try to optimize volatile.
9969 if (IsVolatile)
9970 return UnableToLegalize;
9972 if (MaxLen && KnownLen > MaxLen)
9973 return UnableToLegalize;
9975 if (Opc == TargetOpcode::G_MEMCPY) {
9976 auto &MF = *MI.getParent()->getParent();
9977 const auto &TLI = *MF.getSubtarget().getTargetLowering();
9978 bool OptSize = shouldLowerMemFuncForSize(MF);
9979 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
9980 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
9981 IsVolatile);
9982 }
9983 if (Opc == TargetOpcode::G_MEMMOVE)
9984 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
9985 if (Opc == TargetOpcode::G_MEMSET)
9986 return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
9987 return UnableToLegalize;