//===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the MachineLegalizer class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AArch64LegalizerInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/MathExtras.h"
#include <initializer_list>

#define DEBUG_TYPE "aarch64-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
    : ST(&ST) {
  using namespace TargetOpcode;
  const LLT p0 = LLT::pointer(0, 64);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s128 = LLT::scalar(128);
  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s8 = LLT::fixed_vector(8, 8);
  const LLT v4s8 = LLT::fixed_vector(4, 8);
  const LLT v2s8 = LLT::fixed_vector(2, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s16 = LLT::fixed_vector(4, 16);
  const LLT v2s16 = LLT::fixed_vector(2, 16);
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  const LLT nxv16s8 = LLT::scalable_vector(16, s8);
  const LLT nxv8s16 = LLT::scalable_vector(8, s16);
  const LLT nxv4s32 = LLT::scalable_vector(4, s32);
  const LLT nxv2s64 = LLT::scalable_vector(2, s64);
  std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
                                                        /* End 128bit types */
                                                        /* Begin 64bit types */
  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);

  const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();

  // FIXME: support subtargets which have neon/fp-armv8 disabled.
  if (!ST.hasNEON() || !ST.hasFPARMv8()) {
    getLegacyLegalizerInfo().computeTables();
    return;
  }

  // Some instructions only support s16 if the subtarget has full 16-bit FP
  // support.
  const bool HasFP16 = ST.hasFullFP16();
  const LLT &MinFPScalar = HasFP16 ? s16 : s32;

  const bool HasCSSC = ST.hasCSSC();
  const bool HasRCPC3 = ST.hasRCPC3();
  const bool HasSVE = ST.hasSVE();
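
  // HasCSSC gates the scalar integer forms (ABS, CTPOP, CTTZ, SMIN/SMAX,
  // UMIN/UMAX) used below; HasRCPC3 gates the s128 acquire/release
  // load/store rules, and HasSVE gates the scalable-vector rules.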

  getActionDefinitionsBuilder(
      {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
      .legalFor({p0, s8, s16, s32, s64})
      .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8,
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64)
      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);
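
  // Note that the rules in each of these LegalizeRuleSets are tried in the
  // order they were added, so the legalFor entries above take precedence over
  // the clamping and widening rules that follow them.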

  getActionDefinitionsBuilder(G_PHI)
      .legalFor({p0, s16, s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .widenScalarToNextPow2(0)
      .moreElementsToNextPow2(0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .clampScalar(0, s16, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
      .widenScalarOrEltToNextPow2(0, 16)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
      .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder(G_MUL)
      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 2;
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 4;
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getNumElements() <= 16;
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
      .customIf([=](const LegalityQuery &Query) {
        const auto &SrcTy = Query.Types[0];
        const auto &AmtTy = Query.Types[1];
        return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
               AmtTy.getSizeInBits() == 32;
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .minScalarSameAs(1, 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
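
  // The customIf above sends 32-bit scalar shifts with a 32-bit amount to
  // legalizeShlAshrLshr() below, which promotes constant shift amounts to s64
  // so the imported SelectionDAG patterns can match the immediate forms.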

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampScalarOrElt(1, s64, s64)
      .clampNumElements(0, v2p0, v2p0);

  getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});

  getActionDefinitionsBuilder({G_SDIV, G_UDIV})
      .legalFor({s32, s64})
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
      .widenScalarOrEltToNextPow2(0)
      .minScalarOrElt(0, s32)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64)

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s64, v8s16, v16s8, v4s32})

  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .legalFor(HasCSSC, {s32, s64})
      .minScalar(HasCSSC, 0, s32)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      // FIXME: This shouldn't be needed as v2s64 types are going to
      // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
      .clampNumElements(0, v2s64, v2s64)

  getActionDefinitionsBuilder(
      {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
      .legalFor({{s32, s32}, {s64, s32}})
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s64)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(
      {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
       G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
       G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
      .legalFor({s32, s64, v2s32, v4s32, v2s64})
      .legalFor(HasFP16, {s16, v4s16, v8s16})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .minScalarOrElt(0, MinFPScalar)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_FABS, G_FNEG})
      .legalFor({s32, s64, v2s32, v4s32, v2s64})
      .legalFor(HasFP16, {s16, v4s16, v8s16})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .lowerIf(scalarOrEltWiderThan(0, 64))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)
      .lowerFor({s16, v4s16, v8s16});

  getActionDefinitionsBuilder(G_FREM)
      .libcallFor({s32, s64, s128})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
      .libcallFor({{s64, s128}})
      .minScalarOrElt(1, MinFPScalar);

  getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
                               G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
                               G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
      // We need a call for these, so we always need to scalarize.
      // Regardless of FP16 support, widen 16-bit elements to 32-bits.
      .libcallFor({s32, s64, s128});
  getActionDefinitionsBuilder(G_FPOWI)
      .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});

  getActionDefinitionsBuilder(G_INSERT)
      .legalIf(all(typeInSet(0, {s32, s64, p0}),
                   typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .widenScalarToNextPow2(1)
      .maxScalarIf(typeInSet(0, {s32}), 1, s16)
      .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);

  getActionDefinitionsBuilder(G_EXTRACT)
      .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
                   typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
      .widenScalarToNextPow2(1)
      .clampScalar(1, s32, s128)
      .widenScalarToNextPow2(0)
      .maxScalarIf(typeInSet(1, {s32}), 0, s16)
      .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
      .maxScalarIf(typeInSet(1, {s128}), 0, s64);

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Actions = getActionDefinitionsBuilder(Op);

    if (Op == G_SEXTLOAD)
      Actions.lowerIf(
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));

    // Atomics have zero extending behavior.
        .legalForTypesWithMemDesc({{s32, p0, s8, 8},
                                   {v2s32, p0, s64, 8}})
        .widenScalarToNextPow2(0)
        .clampScalar(0, s32, s64)
        // TODO: We could support sum-of-pow2's but the lowering code doesn't know
        // how to do that yet.
        .unsupportedIfMemSizeNotPow2()
        // Lower anything left over into G_*EXT and G_LOAD

  auto IsPtrVecPred = [=](const LegalityQuery &Query) {
    const LLT &ValTy = Query.Types[0];
    return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
  };

  getActionDefinitionsBuilder(G_LOAD)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {v16s8, p0, s128, 8},
                                 {v8s16, p0, s128, 8},
                                 {v4s32, p0, s128, 8},
                                 {v2s64, p0, s128, 8}})
      // These extends are also legal
      .legalForTypesWithMemDesc(
          {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
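      // (i.e. loads that produce a wider s32/s64 result directly from a
      // narrower memory type, which the selector can fold into a single
      // extending load instruction.)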
      .legalForTypesWithMemDesc({
          // SVE vscale x 128 bit base sizes
          {nxv16s8, p0, nxv16s8, 8},
          {nxv8s16, p0, nxv8s16, 8},
          {nxv4s32, p0, nxv4s32, 8},
          {nxv2s64, p0, nxv2s64, 8},
      .widenScalarToNextPow2(0, /* MinSize = */ 8)
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotByteSizePow2()
      .clampScalar(0, s8, s64)
          [=](const LegalityQuery &Query) {
            // Clamp extending load results to 32-bits.
            return Query.Types[0].isScalar() &&
                   Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
                   Query.Types[0].getSizeInBits() > 32;
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(typeInSet(0, {v4s8}),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder(G_STORE)
      .customIf([=](const LegalityQuery &Query) {
        return HasRCPC3 && Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
      .customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] == s128 &&
               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
      .legalForTypesWithMemDesc(
          {{s8, p0, s8, 8},   {s16, p0, s8, 8},  // truncstorei8 from s16
           {s32, p0, s8, 8},                     // truncstorei8 from s32
           {s64, p0, s8, 8},                     // truncstorei8 from s64
           {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32
           {s64, p0, s16, 8},                    // truncstorei16 from s64
           {s32, p0, s8, 8},  {s32, p0, s16, 8}, {s32, p0, s32, 8},
           {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64
           {p0, p0, s64, 8},  {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
           {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
           {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
      .legalForTypesWithMemDesc({
          // SVE vscale x 128 bit base sizes
          // TODO: Add nxv2p0. Consider bitcastIf.
          // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
          {nxv16s8, p0, nxv16s8, 8},
          {nxv8s16, p0, nxv8s16, 8},
          {nxv4s32, p0, nxv4s32, 8},
          {nxv2s64, p0, nxv2s64, 8},
      .clampScalar(0, s8, s64)
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isScalar() &&
               Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      // Maximum: sN * k = 128
      .clampMaxNumElements(0, s8, 16)
      .clampMaxNumElements(0, s16, 8)
      .clampMaxNumElements(0, s32, 4)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2)
      .lowerIfMemSizeNotPow2()
      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
      .bitcastIf(typeInSet(0, {v4s8}),
                 [=](const LegalityQuery &Query) {
                   const LLT VecTy = Query.Types[0];
                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
      .customIf(IsPtrVecPred)
      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);

  getActionDefinitionsBuilder(G_INDEXED_STORE)
      // Idx 0 == Ptr, Idx 1 == Val
      // TODO: we can implement legalizations but as of now these are
      // generated in a very specific way.
      .legalForTypesWithMemDesc({
          {p0, v16s8, v16s8, 8},
          {p0, v4s16, v4s16, 8},
          {p0, v8s16, v8s16, 8},
          {p0, v2s32, v2s32, 8},
          {p0, v4s32, v4s32, 8},
          {p0, v2s64, v2s64, 8},

  auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
    LLT LdTy = Query.Types[0];
    LLT PtrTy = Query.Types[1];
    if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
        !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)

  getActionDefinitionsBuilder(G_INDEXED_LOAD)
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(IndexedLoadBasicPred)

  getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
      .legalIf(all(typeInSet(0, {s16, s32, s64}),
                   LegalityPredicate([=](const LegalityQuery &Q) {
                     LLT LdTy = Q.Types[0];
                     LLT PtrTy = Q.Types[1];
                     LLT MemTy = Q.MMODescrs[0].MemoryTy;
                       return MemTy == s8 || MemTy == s16;
                     return MemTy == s8 || MemTy == s16 || MemTy == s32;

  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32, s64})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s8, s64);
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({s32, s64, s128})
      .legalFor(HasFP16, {s16})
      .clampScalar(0, MinFPScalar, s128);

  // FIXME: fix moreElementsToNextPow2
  getActionDefinitionsBuilder(G_ICMP)
      .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(1, s32, s64)
      .clampScalar(0, s32, s32)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
          [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
      .moreElementsToNextPow2(1)
      .clampNumElements(1, v8s8, v16s8)
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampNumElements(1, v2s64, v2s64)
      .customIf(isVector(0));

  getActionDefinitionsBuilder(G_FCMP)
      .legalFor({{s32, s32},
      .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      .widenScalarOrEltToNextPow2(1)
      .clampScalar(0, s32, s32)
      .minScalarOrElt(1, MinFPScalar)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .minScalarEltSameAsIf(
          [=](const LegalityQuery &Query) {
            const LLT &Ty = Query.Types[0];
            const LLT &SrcTy = Query.Types[1];
            return Ty.isVector() && !SrcTy.isPointerVector() &&
                   Ty.getElementType() != SrcTy.getElementType();
      .clampNumElements(1, v4s16, v8s16)
      .clampNumElements(1, v2s32, v4s32)
      .clampMaxNumElements(1, s64, 2)
      .moreElementsToNextPow2(1)
      .libcallFor({{s32, s128}});

  auto ExtLegalFunc = [=](const LegalityQuery &Query) {
    unsigned DstSize = Query.Types[0].getSizeInBits();

    // Handle legal vectors using legalFor
    if (Query.Types[0].isVector())

    if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
      return false; // Extending to a scalar s128 needs narrowing.

    const LLT &SrcTy = Query.Types[1];

    // Make sure we fit in a register otherwise. Don't bother checking that
    // the source type is below 128 bits. We shouldn't be allowing anything
    // through which is wider than the destination in the first place.
    unsigned SrcSize = SrcTy.getSizeInBits();
    if (SrcSize < 8 || !isPowerOf2_32(SrcSize))

  getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
      .legalIf(ExtLegalFunc)
      .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
      .clampScalar(0, s64, s64) // Just for s128, others are handled above.
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(1, s8, 8)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s32, 2)
      // Tries to convert a large EXTEND into two smaller EXTENDs
      .lowerIf([=](const LegalityQuery &Query) {
        return (Query.Types[0].getScalarSizeInBits() >
                Query.Types[1].getScalarSizeInBits() * 2) &&
               Query.Types[0].isVector() &&
               (Query.Types[1].getScalarSizeInBits() == 8 ||
                Query.Types[1].getScalarSizeInBits() == 16);
      .clampMinNumElements(1, s8, 8)
      .clampMinNumElements(1, s16, 4);

  getActionDefinitionsBuilder(G_TRUNC)
      .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
      .moreElementsToNextPow2(0)
      .clampMaxNumElements(0, s8, 8)
      .clampMaxNumElements(0, s16, 4)
      .clampMaxNumElements(0, s32, 2)
          [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
      .lowerIf([=](const LegalityQuery &Query) {
        LLT DstTy = Query.Types[0];
        LLT SrcTy = Query.Types[1];
        return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
               DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
      .clampMinNumElements(0, s8, 8)
      .clampMinNumElements(0, s16, 4)

  getActionDefinitionsBuilder(G_SEXT_INREG)
      .legalFor({s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)

  getActionDefinitionsBuilder(G_FPTRUNC)
          {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
      .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
      .clampNumElements(0, v4s16, v4s16)
      .clampNumElements(0, v2s32, v2s32)

  getActionDefinitionsBuilder(G_FPEXT)
          {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
      .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
      .legalFor({{s32, s32},
          {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      // The range of a fp16 value fits into an i17, so we can lower the width
          [=](const LegalityQuery &Query) {
            return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
      .moreElementsToNextPow2(0)
      .widenScalarOrEltToNextPow2OrMinSize(0)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          LegalizeMutations::changeElementSizeTo(1, 0))
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          LegalizeMutations::changeElementSizeTo(0, 1))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
          {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});

  getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
      .legalFor({{s32, s32},
          {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
      // Handle types larger than i64 by scalarizing/lowering.
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      // The range of a fp16 value fits into an i17, so we can lower the width
          [=](const LegalityQuery &Query) {
            return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
      .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
      .moreElementsToNextPow2(0)
      .widenScalarToNextPow2(0, /*MinSize=*/32)
      .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
          [=](const LegalityQuery &Query) {
            unsigned ITySize = Query.Types[0].getScalarSizeInBits();
            return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
                   ITySize > Query.Types[1].getScalarSizeInBits();
          LegalizeMutations::changeElementSizeTo(1, 0))
          [=](const LegalityQuery &Query) {
            unsigned FTySize = Query.Types[1].getScalarSizeInBits();
            return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
                   Query.Types[0].getScalarSizeInBits() < FTySize;
          LegalizeMutations::changeElementSizeTo(0, 1))
      .widenScalarOrEltToNextPow2(0)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
      .legalFor({{s32, s32},
          {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(1)
      .widenScalarOrEltToNextPow2OrMinSize(1)
      .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
          [=](const LegalityQuery &Query) {
            return Query.Types[1].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() <
                       Query.Types[1].getScalarSizeInBits();
          LegalizeMutations::changeElementSizeTo(0, 1))
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getScalarSizeInBits() <= 64 &&
                   Query.Types[0].getScalarSizeInBits() >
                       Query.Types[1].getScalarSizeInBits();
          LegalizeMutations::changeElementSizeTo(1, 0))
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .libcallFor({{s16, s128},

  getActionDefinitionsBuilder(G_BRCOND)
      .clampScalar(0, s32, s32);
  getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});

  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
      .widenScalarToNextPow2(0)
      .clampScalar(0, s32, s64)
      .clampScalar(1, s32, s32)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
      .lowerIf(isVector(0));

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});

  if (TM.getCodeModel() == CodeModel::Small)
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
  else
    getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});

  getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
      .legalIf(all(typeIs(0, p0), typeIs(1, p0)));

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalFor({{s64, p0}, {v2s64, v2p0}})
      .widenScalarToNextPow2(0, 64)
      .clampScalar(0, s64, s64)
      .clampMaxNumElements(0, s64, 2);

  getActionDefinitionsBuilder(G_INTTOPTR)
      .unsupportedIf([&](const LegalityQuery &Query) {
        return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
      .legalFor({{p0, s64}, {v2p0, v2s64}})
      .clampMaxNumElements(1, s64, 2);

  // Casts for 32 and 64-bit width type are just copies.
  // Same for 128-bit width type, except they are on the FPR bank.
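  // For instance, a G_BITCAST between s64 and v2s32 only changes how the same
  // 64 bits are interpreted, so no instruction needs to be emitted for it.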
  getActionDefinitionsBuilder(G_BITCAST)
      // Keeping 32-bit instructions legal to prevent regression in some tests
      .legalForCartesianProduct({s32, v2s16, v4s8})
      .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
      .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
      .lowerIf([=](const LegalityQuery &Query) {
        return Query.Types[0].isVector() != Query.Types[1].isVector();
      .moreElementsToNextPow2(0)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)

  getActionDefinitionsBuilder(G_VASTART).legalFor({p0});

  // va_list must be a pointer, but most sized types are pretty easy to handle
  // as the destination.
  getActionDefinitionsBuilder(G_VAARG)
      .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
      .clampScalar(0, s8, s64)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
      all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));

  bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
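
  // With outlined atomics requested and no LSE, compare-and-swap and the RMW
  // operations below become libcalls into the __aarch64_* outline-atomics
  // helpers; otherwise the 32/64-bit forms are legal instructions and 128-bit
  // cmpxchg gets a custom expansion.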

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
      .customFor(!UseOutlineAtomics, {{s128, p0}})
      .libcallFor(UseOutlineAtomics,
                  {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
      .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
      .libcallFor(UseOutlineAtomics,
                  {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
      .clampScalar(0, s32, s64);

  // Do not outline these atomics operations, as per comment in
  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
  getActionDefinitionsBuilder(
      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, 8)
        .widenScalarToNextPow2(BigTyIdx, 32)
        .clampScalar(LitTyIdx, s8, s64)
        .clampScalar(BigTyIdx, s32, s128)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          switch (Q.Types[LitTyIdx].getSizeInBits()) {

  // TODO : nxv4s16, nxv2s16, nxv2s32
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
      .legalFor(HasSVE, {{s16, nxv16s8, s64},
                         {s64, nxv2s64, s64}})
      .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        if (Query.Types[1].isScalableVector())
        return Query.Types[0] != EltTy;
      .customIf([=](const LegalityQuery &Query) {
        const LLT &VecTy = Query.Types[1];
        return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
               VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
               VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
          [=](const LegalityQuery &Query) {
            // We want to promote <M x s1> to <M x s64> if that wouldn't
            // cause the total vec size to be > 128b.
            return Query.Types[1].isFixedVector() &&
                   Query.Types[1].getNumElements() <= 2;
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isFixedVector() &&
                   Query.Types[1].getNumElements() <= 4;
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isFixedVector() &&
                   Query.Types[1].getNumElements() <= 8;
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isFixedVector() &&
                   Query.Types[1].getNumElements() <= 16;
      .minScalarOrElt(0, s8) // Worst case, we need at least s8.
      .moreElementsToNextPow2(1)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)
      .clampMaxNumElements(1, p0, 2);

  getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
          typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
      .legalFor(HasSVE, {{nxv16s8, s32, s64},
                         {nxv2s64, s64, s64}})
      .moreElementsToNextPow2(0)
      .widenVectorEltsToVectorMinSize(0, 64)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .clampMaxNumElements(0, p0, 2);

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .legalFor({{v8s8, s8},
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .minScalarOrElt(0, s8)
      .widenVectorEltsToVectorMinSize(0, 64)
      .widenScalarOrEltToNextPow2(0)
      .minScalarSameAs(1, 0);

  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();

  getActionDefinitionsBuilder(G_CTLZ)
      .legalForCartesianProduct(
          {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1);
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();

  // TODO: Custom lowering for v2s32, v4s32, v2s64.
  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({s32, s64, v8s8, v16s8})
      .widenScalarToNextPow2(0, /*Min = */ 32)
      .clampScalar(0, s32, s64);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();

  getActionDefinitionsBuilder(G_CTTZ)
      .lowerIf(isVector(0))
      .widenScalarToNextPow2(1, /*Min=*/32)
      .clampScalar(1, s32, s64)
      .scalarSameSizeAs(0, 1)
      .legalFor(HasCSSC, {s32, s64})
      .customFor(!HasCSSC, {s32, s64});

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        return llvm::is_contained(
            {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
      // just want those lowered into G_BUILD_VECTOR
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() >
                       Query.Types[1].getNumElements();
      .moreElementsToNextPow2(0)
          [](const LegalityQuery &Query) {
            return Query.Types[0].isVector() && Query.Types[1].isVector() &&
                   Query.Types[0].getNumElements() <
                       Query.Types[1].getNumElements();
      .widenScalarOrEltToNextPow2OrMinSize(0, 8)
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) {
        // Bitcast pointers vector to i64.
        const LLT DstTy = Query.Types[0];
        return std::pair(0, LLT::vector(DstTy.getElementCount(), 64));

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
          [=](const LegalityQuery &Query) {
            return Query.Types[0].getSizeInBits() <= 128 &&
                   Query.Types[1].getSizeInBits() <= 64;
          [=](const LegalityQuery &Query) {
            const LLT DstTy = Query.Types[0];
            const LLT SrcTy = Query.Types[1];
                0, DstTy.changeElementSize(SrcTy.getSizeInBits())
                       .changeElementCount(
                           DstTy.getElementCount().divideCoefficientBy(
                               SrcTy.getNumElements())));

  getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});

  getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});

  getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();

  getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();

  // G_BZERO is not supported. Currently it is only emitted by
  // PreLegalizerCombiner for G_MEMSET with zero constant.
  getActionDefinitionsBuilder(G_BZERO).unsupported();

  getActionDefinitionsBuilder(G_MEMSET)
      .legalForCartesianProduct({p0}, {s64}, {s64})
      .customForCartesianProduct({p0}, {s8}, {s64})
      .immIdx(0); // Inform verifier imm idx 0 is handled.

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
      .legalForCartesianProduct({p0}, {p0}, {s64})
      .immIdx(0); // Inform verifier imm idx 0 is handled.

  // G_MEMCPY_INLINE does not have a tailcall immediate
  getActionDefinitionsBuilder(G_MEMCPY_INLINE)
      .legalForCartesianProduct({p0}, {p0}, {s64});

  getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})

  // FIXME: Legal vector types are only legal with NEON.
  getActionDefinitionsBuilder(G_ABS)
      .legalFor(HasCSSC, {s32, s64})
      .legalFor(PackedVectorAllTypeList)
      .customIf([=](const LegalityQuery &Q) {
        // TODO: Fix suboptimal codegen for 128+ bit types.
        LLT SrcTy = Q.Types[0];
        return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
          [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
          [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
          [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampNumElements(0, v2s64, v2s64)
      .moreElementsToNextPow2(0)

  // For fadd reductions we have pairwise operations available. We treat the
  // usual legal types as legal and handle the lowering to pairwise instructions
  // later.
  getActionDefinitionsBuilder(G_VECREDUCE_FADD)
      .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
      .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
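  // A v4s32 G_VECREDUCE_FADD, for example, can then be expanded with two FADDP
  // pairwise adds instead of a chain of three dependent scalar adds.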

  // For fmul reductions we need to split up into individual operations. We
  // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
  // smaller types, followed by scalarizing what remains.
  getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)

  getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})

  getActionDefinitionsBuilder(G_VECREDUCE_ADD)
      .legalFor({{s8, v16s8},
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)

  getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
                               G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
      .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
      .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
      .minScalarOrElt(0, MinFPScalar)
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)

  getActionDefinitionsBuilder(G_VECREDUCE_MUL)
      .clampMaxNumElements(1, s32, 2)
      .clampMaxNumElements(1, s16, 4)
      .clampMaxNumElements(1, s8, 8)

  getActionDefinitionsBuilder(
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
      .legalFor({{s8, v8s8},
          [=](const LegalityQuery &Query) {
            return Query.Types[1].isVector() &&
                   Query.Types[1].getElementType() != s8 &&
                   Query.Types[1].getNumElements() & 1;
          LegalizeMutations::moreElementsToNextPow2(1))
      .clampMaxNumElements(1, s64, 2)
      .clampMaxNumElements(1, s32, 4)
      .clampMaxNumElements(1, s16, 8)
      .clampMaxNumElements(1, s8, 16)

  getActionDefinitionsBuilder(
      {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
      // Try to break down into smaller vectors as long as they're at least 64
      // bits. This lets us use vector operations for some parts of the
      // reduction.
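      // For example, a v4s32 OR-reduction can first OR the two v2s32 halves
      // with a single vector ORR, leaving only a v2s32 reduction to scalarize.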
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            if (SrcTy.isScalar())
            if (!isPowerOf2_32(SrcTy.getNumElements()))
            // We can usually perform 64b vector operations.
            return SrcTy.getSizeInBits() > 64;
          [=](const LegalityQuery &Q) {
            LLT SrcTy = Q.Types[1];
            return std::make_pair(1, SrcTy.divide(2));

  // TODO: Update this to correct handling when adding AArch64/SVE support.
  getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();

  getActionDefinitionsBuilder({G_FSHL, G_FSHR})
      .customFor({{s32, s32}, {s32, s64}, {s64, s64}})

  getActionDefinitionsBuilder(G_ROTR)
      .legalFor({{s32, s64}, {s64, s64}})
      .customIf([=](const LegalityQuery &Q) {
        return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;

  getActionDefinitionsBuilder(G_ROTL).lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .customFor({{s32, s32}, {s64, s64}});

  auto always = [=](const LegalityQuery &Q) { return true; };
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
      .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
      .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
      .customFor({{s128, s128},
      .clampScalar(0, s32, s128)
      .widenScalarToNextPow2(0)
      .minScalarEltSameAsIf(always, 1, 0)
      .maxScalarEltSameAsIf(always, 1, 0);

  getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
      .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
      .legalFor(HasSVE, {nxv2s64, nxv4s32, nxv8s16, nxv16s8})
      .clampNumElements(0, v8s8, v16s8)
      .clampNumElements(0, v4s16, v8s16)
      .clampNumElements(0, v2s32, v4s32)
      .clampMaxNumElements(0, s64, 2)
      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
      .moreElementsToNextPow2(0)

  // TODO: Libcall support for s128.
  // TODO: s16 should be legal with full FP16 support.
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .legalFor({{s64, s32}, {s64, s64}});

  // TODO: Custom legalization for mismatched types.
  getActionDefinitionsBuilder(G_FCOPYSIGN)
          [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
          [=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));

  getActionDefinitionsBuilder(G_FMAD).lower();

  // Access to floating-point environment.
  getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
                               G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})

  getActionDefinitionsBuilder(G_IS_FPCLASS).lower();

  getActionDefinitionsBuilder(G_PREFETCH).custom();

  getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();

  getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
      .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
      .widenScalarOrEltToNextPow2(0)
      .immIdx(0); // Inform verifier imm idx 0 is handled.

  // TODO: {nxv16s8, s8}, {nxv8s16, s16}
  getActionDefinitionsBuilder(G_SPLAT_VECTOR)
      .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}});

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AArch64LegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_VAARG:
    return legalizeVaArg(MI, MRI, MIRBuilder);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_STORE:
    return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR:
    return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
  case TargetOpcode::G_SBFX:
  case TargetOpcode::G_UBFX:
    return legalizeBitfieldExtract(MI, MRI, Helper);
  case TargetOpcode::G_FSHL:
  case TargetOpcode::G_FSHR:
    return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
  case TargetOpcode::G_ROTR:
    return legalizeRotate(MI, MRI, Helper);
  case TargetOpcode::G_CTPOP:
    return legalizeCTPOP(MI, MRI, Helper);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpxchg128(MI, MRI, Helper);
  case TargetOpcode::G_CTTZ:
    return legalizeCTTZ(MI, Helper);
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET:
    return legalizeMemOps(MI, Helper);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, Helper);
  case TargetOpcode::G_DYN_STACKALLOC:
    return legalizeDynStackAlloc(MI, Helper);
  case TargetOpcode::G_PREFETCH:
    return legalizePrefetch(MI, Helper);
  case TargetOpcode::G_ABS:
    return Helper.lowerAbsToCNeg(MI);
  case TargetOpcode::G_ICMP:
    return legalizeICMP(MI, MRI, MIRBuilder);
  }

  llvm_unreachable("expected switch to return");
}

bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
                                               MachineRegisterInfo &MRI,
                                               MachineIRBuilder &MIRBuilder,
                                               GISelChangeObserver &Observer,
                                               LegalizerHelper &Helper) const {
  assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
         MI.getOpcode() == TargetOpcode::G_FSHR);

  // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
  // lowering.
  Register ShiftNo = MI.getOperand(3).getReg();
  LLT ShiftTy = MRI.getType(ShiftNo);
  auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);

  // Adjust shift amount according to Opcode (FSHL/FSHR)
  // Convert FSHL to FSHR
  LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
  APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);

  // Lower non-constant shifts and leave zero shifts to the optimizer.
  if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
    return (Helper.lowerFunnelShiftAsShifts(MI) ==
            LegalizerHelper::LegalizeResult::Legalized);

  APInt Amount = VRegAndVal->Value.urem(BitWidth);

  Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
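
  // For example, a G_FSHL of s32 values by a constant 3 is rewritten as a
  // G_FSHR by 32 - 3 = 29, since fshl(x, y, n) == fshr(x, y, BitWidth - n)
  // for any non-zero shift amount n.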

  // If the instruction is G_FSHR, has a 64-bit G_CONSTANT for shift amount
  // in the range of 0 <-> BitWidth, it is legal
  if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
      VRegAndVal->Value.ult(BitWidth))
    return true;

  // Cast the ShiftNumber to a 64-bit type
  auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));

  if (MI.getOpcode() == TargetOpcode::G_FSHR) {
    Observer.changingInstr(MI);
    MI.getOperand(3).setReg(Cast64.getReg(0));
    Observer.changedInstr(MI);
  }
  // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
  // instruction
  else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
    MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
                          {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
                           Cast64.getReg(0)});
    MI.eraseFromParent();
  }
  return true;
}

bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &MIRBuilder) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg1 = MI.getOperand(2).getReg();
  Register SrcReg2 = MI.getOperand(3).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg1);

  // Check the vector types are legal
  if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
      DstTy.getNumElements() != SrcTy.getNumElements() ||
      (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
    return false;

  // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
  // following passes.
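  // i.e. a lane-wise (a != b) is emitted as NOT(a == b), which the selector
  // can then match as a CMEQ followed by a vector NOT (MVN).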
  CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (Pred != CmpInst::ICMP_NE)
    return true;
  Register CmpReg =
      MIRBuilder
          .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
          .getReg(0);
  MIRBuilder.buildNot(DstReg, CmpReg);

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
                                          MachineRegisterInfo &MRI,
                                          LegalizerHelper &Helper) const {
  // To allow for imported patterns to match, we ensure that the rotate amount
  // is 64b with an extension.
  Register AmtReg = MI.getOperand(2).getReg();
  LLT AmtTy = MRI.getType(AmtReg);
  (void)AmtTy;
  assert(AmtTy.isScalar() && "Expected a scalar rotate");
  assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
  auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
  Helper.Observer.changingInstr(MI);
  MI.getOperand(2).setReg(NewAmt.getReg(0));
  Helper.Observer.changedInstr(MI);
  return true;
}

bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
  // G_ADD_LOW instructions.
  // By splitting this here, we can optimize accesses in the small code model by
  // folding in the G_ADD_LOW into the load/store offset.
  auto &GlobalOp = MI.getOperand(1);
  // Don't modify an intrinsic call.
  if (GlobalOp.isSymbol())
    return true;
  const auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return true; // Don't want to modify TLS vars.

  auto &TM = ST->getTargetLowering()->getTargetMachine();
  unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);

  if (OpFlags & AArch64II::MO_GOT)
    return true;

  auto Offset = GlobalOp.getOffset();
  Register DstReg = MI.getOperand(0).getReg();
  auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
                  .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
  // Set the regclass on the dest reg too.
  MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);

  // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
  // by creating a MOVK that sets bits 48-63 of the register to (global address
  // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
  // prevent an incorrect tag being generated during relocation when the
  // global appears before the code section. Without the offset, a global at
  // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
  // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
  // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
  // instead of `0xf`.
  // This assumes that we're in the small code model so we can assume a binary
  // size of <= 4GB, which makes the untagged PC relative offset positive. The
  // binary must also be loaded into address range [0, 2^48). Both of these
  // properties need to be ensured at runtime when using tagged addresses.
  if (OpFlags & AArch64II::MO_TAGGED) {
    assert(!Offset &&
           "Should not have folded in an offset for a tagged global!");
    ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
               .addGlobalAddress(GV, 0x100000000,
                                 AArch64II::MO_PREL | AArch64II::MO_G3)
               .addImm(48);
    MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
  }

  MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
      .addGlobalAddress(GV, Offset,
                        OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                             MachineInstr &MI) const {
  auto LowerBinOp = [&MI](unsigned Opcode) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opcode, {MI.getOperand(0)},
                   {MI.getOperand(2), MI.getOperand(3)});
    MI.eraseFromParent();
    return true;
  };

  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::vacopy: {
    unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
    unsigned VaListSize =
        (ST->isTargetDarwin() || ST->isTargetWindows())
            ? PtrSize
            : ST->isTargetILP32() ? 20 : 32;
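    // Darwin and Windows define va_list as a single pointer, so only PtrSize
    // bytes are copied; the AAPCS64 va_list is a 32-byte struct (20 bytes on
    // ILP32), hence the larger copy sizes below.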

    MachineFunction &MF = *MI.getMF();
    auto Val = MF.getRegInfo().createGenericVirtualRegister(
        LLT::scalar(VaListSize * 8));
    MachineIRBuilder MIB(MI);
    MIB.buildLoad(Val, MI.getOperand(2),
                  *MF.getMachineMemOperand(MachinePointerInfo(),
                                           MachineMemOperand::MOLoad,
                                           VaListSize, Align(PtrSize)));
    MIB.buildStore(Val, MI.getOperand(1),
                   *MF.getMachineMemOperand(MachinePointerInfo(),
                                            MachineMemOperand::MOStore,
                                            VaListSize, Align(PtrSize)));
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::get_dynamic_area_offset: {
    MachineIRBuilder &MIB = Helper.MIRBuilder;
    MIB.buildConstant(MI.getOperand(0).getReg(), 0);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    MachineIRBuilder MIB(MI);
    auto &Value = MI.getOperand(3);
    Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }
  case Intrinsic::aarch64_prefetch: {
    MachineIRBuilder MIB(MI);
    auto &AddrVal = MI.getOperand(1);

    int64_t IsWrite = MI.getOperand(2).getImm();
    int64_t Target = MI.getOperand(3).getImm();
    int64_t IsStream = MI.getOperand(4).getImm();
    int64_t IsData = MI.getOperand(5).getImm();

    unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                     (!IsData << 3) |    // IsDataCache bit
                     (Target << 1) |     // Cache level bits
                     (unsigned)IsStream; // Stream bit
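
    // For example, a prefetch-for-write of data into L1 with the "keep"
    // policy (IsWrite=1, IsData=1, Target=0, IsStream=0) encodes as 0b10000,
    // i.e. the PSTL1KEEP prfop.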

    MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::aarch64_neon_uaddv:
  case Intrinsic::aarch64_neon_saddv:
  case Intrinsic::aarch64_neon_umaxv:
  case Intrinsic::aarch64_neon_smaxv:
  case Intrinsic::aarch64_neon_uminv:
  case Intrinsic::aarch64_neon_sminv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();
    bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
                    IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
                    IntrinsicID == Intrinsic::aarch64_neon_sminv;

    auto OldDst = MI.getOperand(0).getReg();
    auto OldDstTy = MRI.getType(OldDst);
    LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
    if (OldDstTy == NewDstTy)
      return true;

    auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);

    Helper.Observer.changingInstr(MI);
    MI.getOperand(0).setReg(NewDst);
    Helper.Observer.changedInstr(MI);

    MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
    MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
                        OldDst, NewDst);

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlp:
  case Intrinsic::aarch64_neon_saddlp: {
    MachineIRBuilder MIB(MI);

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
                       ? AArch64::G_UADDLP
                       : AArch64::G_SADDLP;
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_uaddlv:
  case Intrinsic::aarch64_neon_saddlv: {
    MachineIRBuilder MIB(MI);
    MachineRegisterInfo &MRI = *MIB.getMRI();

    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
                       ? AArch64::G_UADDLV
                       : AArch64::G_SADDLV;
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(2).getReg();
    LLT DstTy = MRI.getType(DstReg);

    LLT MidTy, ExtTy;
    if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
      MidTy = LLT::fixed_vector(4, 32);
      ExtTy = LLT::scalar(32);
    } else {
      MidTy = LLT::fixed_vector(2, 64);
      ExtTy = LLT::scalar(64);
    }

        MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
        MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
    Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},

    if (DstTy.getScalarSizeInBits() < 32)
      MIB.buildTrunc(DstReg, ExtReg);
    else
      MIB.buildCopy(DstReg, ExtReg);

    MI.eraseFromParent();

    return true;
  }
  case Intrinsic::aarch64_neon_smax:
    return LowerBinOp(TargetOpcode::G_SMAX);
  case Intrinsic::aarch64_neon_smin:
    return LowerBinOp(TargetOpcode::G_SMIN);
  case Intrinsic::aarch64_neon_umax:
    return LowerBinOp(TargetOpcode::G_UMAX);
  case Intrinsic::aarch64_neon_umin:
    return LowerBinOp(TargetOpcode::G_UMIN);
  case Intrinsic::aarch64_neon_fmax:
    return LowerBinOp(TargetOpcode::G_FMAXIMUM);
  case Intrinsic::aarch64_neon_fmin:
    return LowerBinOp(TargetOpcode::G_FMINIMUM);
  case Intrinsic::aarch64_neon_fmaxnm:
    return LowerBinOp(TargetOpcode::G_FMAXNUM);
  case Intrinsic::aarch64_neon_fminnm:
    return LowerBinOp(TargetOpcode::G_FMINNUM);
  case Intrinsic::aarch64_neon_smull:
    return LowerBinOp(AArch64::G_SMULL);
  case Intrinsic::aarch64_neon_umull:
    return LowerBinOp(AArch64::G_UMULL);
  case Intrinsic::aarch64_neon_abs: {
    // Lower the intrinsic to G_ABS.
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::vector_reverse:
    // TODO: Add support for vector_reverse
    return false;
  }

  return true;
}
bool AArch64LegalizerInfo::legalizeShlAshrLshr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
         MI.getOpcode() == TargetOpcode::G_LSHR ||
         MI.getOpcode() == TargetOpcode::G_SHL);
  // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
  // imported patterns can select it later. Either way, it will be legal.
  Register AmtReg = MI.getOperand(2).getReg();
  auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
  if (!VRegAndVal)
    return true;
  // Check the shift amount is in range for an immediate form.
  int64_t Amount = VRegAndVal->Value.getSExtValue();
  if (Amount > 31)
    return true; // This will have to remain a register variant.
  auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(ExtCst.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

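// Match a G_PTR_ADD of Root with a constant that fits the scaled signed 7-bit
// LDP/STP immediate; otherwise fall back to Base = Root with a zero offset.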
static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
                                MachineRegisterInfo &MRI) {
  Base = Root;
  Offset = 0;

  Register NewBase;
  int64_t NewOffset;
  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
      isShiftedInt<7, 3>(NewOffset)) {
    Base = NewBase;
    Offset = NewOffset;
  }
}

// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
    GISelChangeObserver &Observer) const {
  assert(MI.getOpcode() == TargetOpcode::G_STORE ||
         MI.getOpcode() == TargetOpcode::G_LOAD);
  // Here we just try to handle vector loads/stores where our value type might
  // have pointer elements, which the SelectionDAG importer can't handle. To
  // allow the existing patterns for s64 to fire for p0, we just try to bitcast
  // the value to use s64 types.

  // Custom legalization requires the instruction, if not deleted, to be fully
  // legalized. In order to allow further legalization of the inst, we create
  // a new instruction and erase the existing one.

  Register ValReg = MI.getOperand(0).getReg();
  const LLT ValTy = MRI.getType(ValReg);

  if (ValTy == LLT::scalar(128)) {
    AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
    bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
    bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
    bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
    bool IsRcpC3 =
        ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);

    LLT s64 = LLT::scalar(64);

    unsigned Opcode;
    if (IsRcpC3) {
      Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
    } else {
      // For LSE2, loads/stores should have been converted to monotonic and had
      // a fence inserted after them.
      assert(Ordering == AtomicOrdering::Monotonic ||
             Ordering == AtomicOrdering::Unordered);
      assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");

      Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
    }

    MachineInstrBuilder NewI;
    if (IsLoad) {
      NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
      MIRBuilder.buildMergeLikeInstr(
          ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
    } else {
      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
      NewI = MIRBuilder.buildInstr(
          Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
    }

    if (IsRcpC3) {
      NewI.addUse(MI.getOperand(1).getReg());
    } else {
      Register Base;
      int Offset;
      matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
      NewI.addUse(Base);
      NewI.addImm(Offset / 8);
    }

    NewI.cloneMemRefs(MI);
    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
                                     *MRI.getTargetRegisterInfo(),
                                     *ST->getRegBankInfo());
    MI.eraseFromParent();
    return true;
  }

  if (!ValTy.isPointerVector() ||
      ValTy.getElementType().getAddressSpace() != 0) {
    LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
    return false;
  }

  unsigned PtrSize = ValTy.getElementType().getSizeInBits();
  const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
  auto &MMO = **MI.memoperands_begin();
  MMO.setType(NewTy);

  if (MI.getOpcode() == TargetOpcode::G_STORE) {
    auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
    MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
  } else {
    auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
    MIRBuilder.buildBitcast(ValReg, NewLoad);
  }
  MI.eraseFromParent();
  return true;
}

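// G_VAARG is lowered here for the pointer-style va_list: load the current list
// pointer, realign it if the requested alignment is stricter than the slot
// alignment, load the value, then store back the pointer advanced by the
// slot-aligned size.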
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  Align Alignment(MI.getOperand(2).getImm());
  Register Dst = MI.getOperand(0).getReg();
  Register ListPtr = MI.getOperand(1).getReg();

  LLT PtrTy = MRI.getType(ListPtr);
  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());

  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
  const Align PtrAlign = Align(PtrSize);
  auto List = MIRBuilder.buildLoad(
      PtrTy, ListPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               PtrTy, PtrAlign));

  MachineInstrBuilder DstPtr;
  if (Alignment > PtrAlign) {
    // Realign the list to the actual required alignment.
    auto AlignMinus1 =
        MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
    auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
    DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
  } else
    DstPtr = List;

  LLT ValTy = MRI.getType(Dst);
  uint64_t ValSize = ValTy.getSizeInBits() / 8;
  MIRBuilder.buildLoad(
      Dst, DstPtr,
      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
                               ValTy, std::max(Alignment, PtrAlign)));

  auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));

  auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));

  MIRBuilder.buildStore(NewList, ListPtr,
                        *MF.getMachineMemOperand(MachinePointerInfo(),
                                                 MachineMemOperand::MOStore,
                                                 PtrTy, PtrAlign));

  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeBitfieldExtract(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  // Only legal if we can select immediate forms.
  // TODO: Lower this otherwise.
  return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
         getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
}

bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // When there is no integer popcount instruction (FEAT_CSSC isn't available),
  // it can be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
  // registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  //
  // For 128 bit vector popcounts, we lower to the following sequence:
  //  cnt.16b   v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.8h v0, v0  // v8s16, v4s32, v2s64
  //  uaddlp.4s v0, v0  //        v4s32, v2s64
  //  uaddlp.2d v0, v0  //               v2s64
  //
  // For 64 bit vector popcounts, we lower to the following sequence:
  //  cnt.8b    v0, v0  // v4s16, v2s32
  //  uaddlp.4h v0, v0  // v4s16, v2s32
  //  uaddlp.2s v0, v0  //        v2s32
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Val);
  unsigned Size = Ty.getSizeInBits();

  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");

  if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
    LLT s64 = LLT::scalar(64);

    auto Split = MIRBuilder.buildUnmerge(s64, Val);
    auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
    auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
    auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);

    MIRBuilder.buildZExt(Dst, Add);
    MI.eraseFromParent();
    return true;
  }

  if (!ST->hasNEON() ||
      MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
    // Use generic lowering when custom lowering is not possible.
    return Ty.isScalar() && (Size == 32 || Size == 64) &&
           Helper.lowerBitCount(MI) ==
               LegalizerHelper::LegalizeResult::Legalized;
  }

  // Pre-conditioning: widen Val up to the nearest vector type.
  // s32,s64,v4s16,v2s32 -> v8i8
  // v8s16,v4s32,v2s64 -> v16i8
  LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
  if (Ty.isScalar()) {
    assert((Size == 32 || Size == 64 || Size == 128) &&
           "Expected only 32, 64, or 128 bit scalars!");
    if (Size == 32)
      Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
  }
  Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);

  // Count bits in each byte-sized lane.
  auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);

  // Sum across lanes.
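  // If FEAT_DotProd is available, a UDOT of the per-byte counts against an
  // all-ones vector accumulates the four byte counts of each 32-bit lane in a
  // single instruction; a 2x64-bit result needs one extra pairwise widening
  // add on top of that.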
  if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
      Ty.getScalarSizeInBits() != 16) {
    LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
    auto Zeros = MIRBuilder.buildConstant(Dt, 0);
    auto Ones = MIRBuilder.buildConstant(VTy, 1);
    MachineInstrBuilder Sum;

    if (Ty == LLT::fixed_vector(2, 64)) {
      auto UDOT =
          MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
      Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
    } else if (Ty == LLT::fixed_vector(4, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else if (Ty == LLT::fixed_vector(2, 32)) {
      Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
    } else {
      llvm_unreachable("unexpected vector shape");
    }

    Sum->getOperand(0).setReg(Dst);
    MI.eraseFromParent();
    return true;
  }

  Register HSum = CTPOP.getReg(0);
  unsigned Opc;
  SmallVector<LLT> HAddTys;
  if (Ty.isScalar()) {
    Opc = Intrinsic::aarch64_neon_uaddlv;
    HAddTys.push_back(LLT::scalar(32));
  } else if (Ty == LLT::fixed_vector(8, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
  } else if (Ty == LLT::fixed_vector(4, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
  } else if (Ty == LLT::fixed_vector(2, 64)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(8, 16));
    HAddTys.push_back(LLT::fixed_vector(4, 32));
    HAddTys.push_back(LLT::fixed_vector(2, 64));
  } else if (Ty == LLT::fixed_vector(4, 16)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
  } else if (Ty == LLT::fixed_vector(2, 32)) {
    Opc = Intrinsic::aarch64_neon_uaddlp;
    HAddTys.push_back(LLT::fixed_vector(4, 16));
    HAddTys.push_back(LLT::fixed_vector(2, 32));
  } else
    llvm_unreachable("unexpected vector shape");
  MachineInstrBuilder UADD;
  for (LLT HTy : HAddTys) {
    UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
    HSum = UADD.getReg(0);
  }

  // Post-conditioning.
  if (Ty.isScalar() && (Size == 64 || Size == 128))
    MIRBuilder.buildZExt(Dst, UADD);
  else
    UADD->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}

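// 128-bit cmpxchg is expanded to CASP when LSE is available, and otherwise to
// the CMP_SWAP_128 pseudos that are later expanded to an LDXP/STXP loop.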
bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  LLT s64 = LLT::scalar(64);
  auto Addr = MI.getOperand(1).getReg();
  auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
  auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
  auto DstLo = MRI.createGenericVirtualRegister(s64);
  auto DstHi = MRI.createGenericVirtualRegister(s64);

  MachineInstrBuilder CAS;
  if (ST->hasLSE()) {
    // We have 128-bit CASP instructions taking XSeqPair registers, which are
    // s128. We need the merge/unmerge to bracket the expansion and pair up with
    // the rest of the MIR so we must reassemble the extracted registers into a
    // 128-bit known-regclass one with code like this:
    //
    //     %in1 = REG_SEQUENCE Lo, Hi    ; One for each input
    //     %out = CASP %in1, ...
    //     %OldLo = G_EXTRACT %out, 0
    //     %OldHi = G_EXTRACT %out, 64
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      Opcode = AArch64::CASPX;
      break;
    }

    LLT s128 = LLT::scalar(128);
    auto CASDst = MRI.createGenericVirtualRegister(s128);
    auto CASDesired = MRI.createGenericVirtualRegister(s128);
    auto CASNew = MRI.createGenericVirtualRegister(s128);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
        .addUse(DesiredI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(DesiredI->getOperand(1).getReg())
        .addImm(AArch64::subo64);
    MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
        .addUse(NewI->getOperand(0).getReg())
        .addImm(AArch64::sube64)
        .addUse(NewI->getOperand(1).getReg())
        .addImm(AArch64::subo64);

    CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});

    MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
    MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
  } else {
    // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
    // can take arbitrary registers so it just has the normal GPR64 operands the
    // rest of AArch64 is expecting.
    auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
    unsigned Opcode;
    switch (Ordering) {
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CMP_SWAP_128_RELEASE;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CMP_SWAP_128;
      break;
    default:
      Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
      break;
    }

    auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
                                {Addr, DesiredI->getOperand(0),
                                 DesiredI->getOperand(1), NewI->getOperand(0),
                                 NewI->getOperand(1)});
  }

  CAS.cloneMemRefs(MI);
  constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
                                   *MRI.getTargetRegisterInfo(),
                                   *ST->getRegBankInfo());

  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
  MI.eraseFromParent();
  return true;
}

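// There is no count-trailing-zeros instruction; CTTZ x is implemented as
// CTLZ(BITREVERSE x), which selects to the RBIT + CLZ pair.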
bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
                                        LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
  auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
  MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
  MI.eraseFromParent();
  return true;
}

bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
                                          LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;

  // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic.
  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
    // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
    // the instruction).
    auto &Value = MI.getOperand(1);
    Register ExtValueReg =
        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
    Value.setReg(ExtValueReg);
    return true;
  }

  return false;
}

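// Constant-index extracts and extracts from scalable vectors are left as-is;
// only variable-index extracts from fixed-length vectors are handed to the
// generic helper, which lowers them through a stack temporary.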
bool AArch64LegalizerInfo::legalizeExtractVectorElt(
    MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
  const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
  auto VRegAndVal =
      getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
  if (VRegAndVal)
    return true;
  LLT VecTy = MRI.getType(Element->getVectorReg());
  if (VecTy.isScalableVector())
    return true;
  return Helper.lowerExtractInsertVectorElt(MI) !=
         LegalizerHelper::LegalizeResult::UnableToLegalize;
}

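// Dynamic allocas only need custom handling when inline stack probing is
// requested: each page of the variable-sized allocation must be probed, so the
// SP adjustment is emitted through the PROBED_STACKALLOC_DYN pseudo.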
bool AArch64LegalizerInfo::legalizeDynStackAlloc(
    MachineInstr &MI, LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // If stack probing is not enabled for this function, use the default
  // lowering.
  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
          "inline-asm") {
    Helper.lowerDynStackAlloc(MI);
    return true;
  }

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
         "Unexpected type for dynamic alloca");
  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
         "Unexpected type for dynamic alloca");

  LLT PtrTy = MRI.getType(Dst);
  Register SPReg =
      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
  Register SPTmp =
      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
  auto NewMI =
      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
  MIRBuilder.buildCopy(Dst, SPTmp);

  MI.eraseFromParent();
  return true;
}

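// The generic prefetch intrinsic encodes locality as 0 (none) through 3
// (high), whereas the PRFM immediate encodes the target cache level starting
// at 0 for L1; locality 0 becomes a streaming prefetch and the remaining
// values are inverted to select the level.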
bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
                                            LegalizerHelper &Helper) const {
  MachineIRBuilder &MIB = Helper.MIRBuilder;
  auto &AddrVal = MI.getOperand(0);

  int64_t IsWrite = MI.getOperand(1).getImm();
  int64_t Locality = MI.getOperand(2).getImm();
  int64_t IsData = MI.getOperand(3).getImm();

  bool IsStream = Locality == 0;
  if (Locality != 0) {
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1, so subtract from 3.
    Locality = 3 - Locality;
  }

  unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;

  MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
  MI.eraseFromParent();
  return true;
}