//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64LegalizerInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s1 = LLT::scalar(1);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        v16s8, v8s16, v4s32,
                                                        v2s64, v2p0,
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
                                                        v8s8, v4s16, v2s32};

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;
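
  // Note: within each rule set below, the rules are tried in the order they
  // are listed; the first rule whose predicate matches a query decides how
  // that type combination is legalized.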
  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalFor({p0, s1, s8, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .fewerElementsIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[0].isVector() &&
                   (Query.Types[0].getElementType() != s64 ||
                    Query.Types[0].getNumElements() != 2);
          },
          [=](const LegalityQuery &Query) {
            LLT EltTy = Query.Types[0].getElementType();
            if (EltTy == s64)
              return std::make_pair(0, LLT::fixed_vector(2, 64));
            return std::make_pair(0, EltTy);
          });
  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .clampScalar(0, s16, s64)
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s32, v2s32, v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64);
  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
      .scalarizeIf(
          [=](const LegalityQuery &Query) {
            return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
          },
          0)
      .legalFor({v2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);
  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      })
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0);
  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalar(1, s64, s64);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s1, s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .clampScalarOrElt(0, s32, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)
      .lowerIf(typeIs(1, s1));

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})
      .lower();
  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)
      .lower();

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s1}, {s64, s1}})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
      .legalFor({MinFPScalar, s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(0, MinFPScalar, s64)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_FREM).libcallFor({s32, s64});
  getActionDefinitionsBuilder({G_FCEIL, G_FABS, G_FSQRT, G_FFLOOR, G_FRINT,
                               G_FMA, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND,
                               G_FNEARBYINT, G_INTRINSIC_LRINT})
      // If we don't have full FP16 support, then scalarize the elements of
      // vectors containing fp16 types.
      .fewerElementsIf(
          [=, &ST](const LegalityQuery &Query) {
            const auto &Ty = Query.Types[0];
            return Ty.isVector() && Ty.getElementType() == s16 &&
                   !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
      // If we don't have full FP16 support, then widen s16 to s32 if we
      // encounter it.
      .widenScalarIf(
          [=, &ST](const LegalityQuery &Query) {
            return Query.Types[0] == s16 && !ST.hasFullFP16();
          },
          [=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
      .legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
  getActionDefinitionsBuilder(
      {G_FCOS, G_FSIN, G_FLOG10, G_FLOG, G_FLOG2, G_FEXP, G_FEXP2, G_FPOW})
      // We need a call for these, so we always need to scalarize.
      .scalarize(0)
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .minScalar(0, s32)
      .libcallFor({s32, s64, v2s32, v4s32, v2s64});
  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s1, s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);
  getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
      .lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                 {v2s32, p0, s64, 8}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      // TODO: We could support sum-of-pow2's but the lowering code doesn't know
      // how to do that yet.
      .unsupportedIfMemSizeNotPow2()
      // Lower anything left over into G_*EXT and G_LOAD
      .lower();

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    if (!ValTy.isVector())
      return false;
    const LLT EltTy = ValTy.getElementType();
    return EltTy.isPointer() && EltTy.getAddressSpace() == 0;
  };
  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {v16s8, p0, s128, 8},
                                 {v8s16, p0, s128, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}})
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .lowerIfMemSizeNotPow2()
      .clampScalar(0, s8, s64)
      .narrowScalarIf(
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
          },
          changeTo(0, s32))
      // Lower any any-extending loads left into G_ANYEXT and G_LOAD
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);
  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s8, 8},  // truncstorei8 from s16
                                 {s32, p0, s8, 8},  // truncstorei8 from s32
                                 {s64, p0, s8, 8},  // truncstorei8 from s64
                                 {s32, p0, s16, 8}, // truncstorei16 from s32
                                 {s64, p0, s16, 8}, // truncstorei16 from s64
                                 {s64, p0, s32, 8}, // truncstorei32 from s64
                                 {v16s8, p0, s128, 8},
                                 {v8s16, p0, s128, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      })
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeIs(0, v2s16), 0);
  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalIf([=](const LegalityQuery &Query) {
        const auto &Ty = Query.Types[0];
        if (HasFP16 && Ty == s16)
          return true;
        return Ty == s32 || Ty == s64 || Ty == s128;
      })
      .clampScalar(0, MinFPScalar, s128);
  getActionDefinitionsBuilder({G_ICMP, G_FCMP})
      .legalFor({{s32, s32}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.getElementType().isPointer() &&
                   Ty.getElementType() != SrcTy.getElementType();
          },
          0, 1)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          1, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
          s64)
      .clampNumElements(0, v2s32, v4s32);
  // Extensions
  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    if (DstSize == 128 && !Query.Types[0].isVector())
      return false; // Extending to a scalar s128 needs narrowing.

    // Make sure that we have something that will fit in a register, and
    // make sure it's a power of 2.
    if (DstSize < 8 || DstSize > 128 || !isPowerOf2_32(DstSize))
      return false;

    const LLT &SrcTy = Query.Types[1];

    // Special case for s1.
    if (SrcTy == s1)
      return true;

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
      return false;

    return true;
  };
  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .clampScalar(0, s64, s64); // Just for s128, others are handled above.
  getActionDefinitionsBuilder(G_TRUNC)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
          0, s8)
      .customIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
      })
      .alwaysLegal();

  getActionDefinitionsBuilder(G_SEXT_INREG).legalFor({s32, s64}).lower();
  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .clampMaxNumElements(0, s32, 2);
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s64);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalForCartesianProduct({s32, s64, v2s64, v4s32, v2s32})
      .clampScalar(1, s32, s64)
      .minScalarSameAs(1, 0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s8, s16, s32});
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s1}, {s64, s1}, {p0, s1}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
      .legalFor({{v2s64, v2p0}})
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      })
      .legalFor({{p0, s64}, {v2p0, v2s64}});

  // Casts for 32 and 64-bit width type are just copies.
  // Same for 128-bit width type, except they are on the FPR bank.
  getActionDefinitionsBuilder(G_BITCAST)
      // FIXME: This is wrong since G_BITCAST is not allowed to change the
      // number of bits but it's what the previous code described and fixing
      // it should be done as a separate patch.
      .legalForCartesianProduct({s1, s8, s16, s32, s64, s128, v16s8, v8s8, v4s8,
                                 v8s16, v4s16, v2s16, v4s32, v2s32, v2s64,
                                 v2p0});

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      .lowerIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, s1),
                   typeIs(2, p0)));

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customIf([](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() == 128;
      })
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .clampScalar(0, s32, s64)
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 32:
          case 64:
          case 128:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
            return true;
          default:
            return false;
          }
        });
  }
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      })
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2s32 ||
               VecTy == v2p0;
      })
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            // We want to promote to <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].getNumElements() <= 2;
          },
          0, s64)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 4;
          },
          0, s32)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 8;
          },
          0, s16)
      .minScalarOrEltIf(
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getNumElements() <= 16;
          },
          0, s8)
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
      .legalIf(typeInSet(0, {v8s16, v2s32, v4s32, v2s64}));

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  // TODO: Handle vector types.
  getActionDefinitionsBuilder(G_CTTZ)
      .clampScalar(0, s32, s64)
      .scalarSameSizeAs(1, 0)
      .customFor({s32, s64});
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        for (auto &Ty : {v2s32, v4s32, v2s64, v2p0, v16s8, v8s16}) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
      // just want those lowered into G_BUILD_VECTOR
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}});

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({{p0}, {s64}});

  getActionDefinitionsBuilder(G_BRJT).legalIf([=](const LegalityQuery &Query) {
    return Query.Types[0] == p0 && Query.Types[1] == s64;
  });

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
  if (ST.hasMOPS()) {
    // G_BZERO is not supported. Currently it is only emitted by
    // PreLegalizerCombiner for G_MEMSET with zero constant.
    getActionDefinitionsBuilder(G_BZERO).unsupported();

    getActionDefinitionsBuilder(G_MEMSET)
        .legalForCartesianProduct({p0}, {s64}, {s64})
        .customForCartesianProduct({p0}, {s8}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
        .legalForCartesianProduct({p0}, {p0}, {s64})
        .immIdx(0); // Inform verifier imm idx 0 is handled.

    // G_MEMCPY_INLINE does not have a tailcall immediate
    getActionDefinitionsBuilder(G_MEMCPY_INLINE)
        .legalForCartesianProduct({p0}, {p0}, {s64});
  } else {
    getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
        .libcall();
  }
  // FIXME: Legal types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .lowerIf(isScalar(0))
      .legalFor(PackedVectorAllTypeList);

  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      // We only have FADDP to do reduction-like operations. Lower the rest.
      .legalFor({{s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 2)
      .lower();

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor(
          {{s8, v16s8}, {s16, v8s16}, {s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .lower();
  getActionDefinitionsBuilder(
      {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
      // Try to break down into smaller vectors as long as they're at least 64
      // bits. This lets us use vector operations for some parts of the
      // reduction.
      .fewerElementsIf(
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            if (SrcTy.isScalar())
              return false;
            if (!isPowerOf2_32(SrcTy.getNumElements()))
              return false;
            // We can usually perform 64b vector operations.
            return SrcTy.getSizeInBits() > 64;
          },
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            return std::make_pair(1, SrcTy.divide(2));
          })
      .scalarize(1);

  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .lowerIf([=](const LegalityQuery &Q) { return Q.Types[0].isScalar(); });

  getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower();
  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
      })
      .lower();
  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  // TODO: Use generic lowering when custom lowering is not possible.
  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0)
      .customFor({{s32, s32}});

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}).lowerIf(isScalar(0));
  // TODO: Vector types.
  getActionDefinitionsBuilder({G_FMAXNUM, G_FMINNUM})
      .legalFor({MinFPScalar, s32, s64})
      .minScalar(0, MinFPScalar);

  // TODO: Vector types.
  getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM})
      .legalFor({MinFPScalar, s32, s64})
      .minScalar(0, MinFPScalar);

  // TODO: Libcall support for s128.
  // TODO: s16 should be legal with full FP16 support.
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .legalFor({{s64, s32}, {s64, s64}});

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}
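
// Dispatch custom legalization: each case forwards to the opcode-specific
// helper below; returning false reports that the instruction could not be
// legalized.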
bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_TRUNC:
    return legalizeVectorTrunc(MI, Helper);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET:
    return legalizeMemOps(MI, Helper);
  }

  llvm_unreachable("expected switch to return");
}
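
// G_ROTR with a rotate amount narrower than 64 bits: sign-extend the amount to
// s64 so the imported SelectionDAG patterns can match the instruction.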
bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildSExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}
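
// Helper: split Reg into NumParts registers of type Ty with a single
// G_UNMERGE_VALUES, appending the new registers to VRegs.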
static void extractParts(Register Reg, MachineRegisterInfo &MRI,
                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
                         SmallVectorImpl<Register> &VRegs) {
  for (int I = 0; I < NumParts; ++I)
    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
  MIRBuilder.buildUnmerge(VRegs, Reg);
}
bool AArch64LegalizerInfo::legalizeVectorTrunc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  assert(isPowerOf2_32(DstTy.getSizeInBits()) &&
         isPowerOf2_32(SrcTy.getSizeInBits()));

  // Split input type.
  LLT SplitSrcTy =
      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
  // First, split the source into two smaller vectors.
  SmallVector<Register, 2> SplitSrcs;
  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);

  // Truncate the splits into intermediate narrower elements.
  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);

  auto Concat = MIRBuilder.buildConcatVectors(
      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);

  Helper.Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Concat.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}
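
// Expand G_GLOBAL_VALUE in the small code model into ADRP + G_ADD_LOW so that
// later combines can fold the low part into load/store addressing; see the
// comments below for the tagged-globals (MO_TAGGED) special case.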
bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  //
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}
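
// Custom legalization for target intrinsics: va_copy, get_dynamic_area_offset
// and the MOPS memset-with-tag intrinsic.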
bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Zext the value to 64 bit
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ZExtValueReg);
    return true;
  }
  }

  return true;
}
bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}
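
// Try to fold a constant offset into an LDP/STP addressing mode: if Root is a
// G_PTR_ADD whose immediate fits the signed 7-bit, 8-byte-scaled field, return
// that base and offset, otherwise fall back to (Root, 0).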
static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires the instruction, if not deleted, must be fully
  // legalized. In order to allow further legalization of the inst, we create
  // a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {
    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Monotonic ||
           (*MI.memoperands_begin())->getSuccessOrdering() ==
               AtomicOrdering::Unordered);
    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
    LLT s64 = LLT::scalar(64);
    MachineInstrBuilder NewI;
    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
      MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    Register Base;
    int Offset;
    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
    NewI.addUse(Base);
    NewI.addImm(Offset / 8);

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}
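
// Expand G_VAARG assuming a simple pointer-bump va_list: load the current list
// pointer, realign it if the requested alignment is stricter than the slot
// alignment, load the value, then advance the pointer and store it back.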
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}
bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}
bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT V0.8B, V0.8B   // 8xbyte pop-counts
  //  ADDV B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32
  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");
  unsigned Size = Ty.getSizeInBits();

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32) {
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
    }
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}, /*HasSideEffects =*/false)
               .addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}
bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMerge(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}
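
// G_CTTZ is legalized as CTLZ(BITREVERSE(x)), which maps onto AArch64's
// RBIT + CLZ instruction pair.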
bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Zext the value operand to 64 bit
    auto &Value = MI.getOperand(1);
    Register ZExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ZExtValueReg
);