1 //===- AArch64LegalizerInfo.cpp ----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
14 #include "AArch64LegalizerInfo.h"
15 #include "AArch64Subtarget.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/GlobalISel/Utils.h"
23 #include "llvm/CodeGen/MachineInstr.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/TargetOpcodes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/Intrinsics.h"
28 #include "llvm/IR/IntrinsicsAArch64.h"
29 #include "llvm/IR/Type.h"
30 #include "llvm/Support/MathExtras.h"
31 #include <initializer_list>
33 #define DEBUG_TYPE "aarch64-legalinfo"
35 using namespace llvm;
36 using namespace LegalizeActions;
37 using namespace LegalizeMutations;
38 using namespace LegalityPredicates;
39 using namespace MIPatternMatch;
41 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
42 : ST(&ST) {
43 using namespace TargetOpcode;
44 const LLT p0 = LLT::pointer(0, 64);
45 const LLT s8 = LLT::scalar(8);
46 const LLT s16 = LLT::scalar(16);
47 const LLT s32 = LLT::scalar(32);
48 const LLT s64 = LLT::scalar(64);
49 const LLT s128 = LLT::scalar(128);
50 const LLT v16s8 = LLT::fixed_vector(16, 8);
51 const LLT v8s8 = LLT::fixed_vector(8, 8);
52 const LLT v4s8 = LLT::fixed_vector(4, 8);
53 const LLT v2s8 = LLT::fixed_vector(2, 8);
54 const LLT v8s16 = LLT::fixed_vector(8, 16);
55 const LLT v4s16 = LLT::fixed_vector(4, 16);
56 const LLT v2s16 = LLT::fixed_vector(2, 16);
57 const LLT v2s32 = LLT::fixed_vector(2, 32);
58 const LLT v4s32 = LLT::fixed_vector(4, 32);
59 const LLT v2s64 = LLT::fixed_vector(2, 64);
60 const LLT v2p0 = LLT::fixed_vector(2, p0);
62 const LLT nxv16s8 = LLT::scalable_vector(16, s8);
63 const LLT nxv8s16 = LLT::scalable_vector(8, s16);
64 const LLT nxv4s32 = LLT::scalable_vector(4, s32);
65 const LLT nxv2s64 = LLT::scalable_vector(2, s64);
67 std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
68 v16s8, v8s16, v4s32,
69 v2s64, v2p0,
70 /* End 128bit types */
71 /* Begin 64bit types */
72 v8s8, v4s16, v2s32};
73 std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
74 SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
75 SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
77 const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
79 // FIXME: support subtargets which have neon/fp-armv8 disabled.
80 if (!ST.hasNEON() || !ST.hasFPARMv8()) {
81 getLegacyLegalizerInfo().computeTables();
82 return;
83 }
85 // Some instructions only support s16 if the subtarget has full 16-bit FP
86 // support.
87 const bool HasFP16 = ST.hasFullFP16();
88 const LLT &MinFPScalar = HasFP16 ? s16 : s32;
90 const bool HasCSSC = ST.hasCSSC();
91 const bool HasRCPC3 = ST.hasRCPC3();
92 const bool HasSVE = ST.hasSVE();
94 getActionDefinitionsBuilder(
95 {G_IMPLICIT_DEF, G_FREEZE, G_CONSTANT_FOLD_BARRIER})
96 .legalFor({p0, s8, s16, s32, s64})
97 .legalFor({v16s8, v8s16, v4s32, v2s64, v2p0, v8s8, v4s16, v2s32, v4s8,
98 v2s16, v2s8})
99 .widenScalarToNextPow2(0)
100 .clampScalar(0, s8, s64)
101 .moreElementsToNextPow2(0)
102 .widenVectorEltsToVectorMinSize(0, 64)
103 .clampNumElements(0, v8s8, v16s8)
104 .clampNumElements(0, v4s16, v8s16)
105 .clampNumElements(0, v2s32, v4s32)
106 .clampMaxNumElements(0, s64, 2)
107 .clampMaxNumElements(0, p0, 2);
109 getActionDefinitionsBuilder(G_PHI)
110 .legalFor({p0, s16, s32, s64})
111 .legalFor(PackedVectorAllTypeList)
112 .widenScalarToNextPow2(0)
113 .moreElementsToNextPow2(0)
114 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
115 .clampScalar(0, s16, s64)
116 .clampNumElements(0, v8s8, v16s8)
117 .clampNumElements(0, v4s16, v8s16)
118 .clampNumElements(0, v2s32, v4s32)
119 .clampMaxNumElements(0, s64, 2)
120 .clampMaxNumElements(0, p0, 2);
122 getActionDefinitionsBuilder(G_BSWAP)
123 .legalFor({s32, s64, v4s16, v8s16, v2s32, v4s32, v2s64})
124 .widenScalarOrEltToNextPow2(0, 16)
125 .clampScalar(0, s32, s64)
126 .clampNumElements(0, v4s16, v8s16)
127 .clampNumElements(0, v2s32, v4s32)
128 .clampNumElements(0, v2s64, v2s64)
129 .moreElementsToNextPow2(0);
131 getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR})
132 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
133 .legalFor(HasSVE, {nxv16s8, nxv8s16, nxv4s32, nxv2s64})
134 .widenScalarToNextPow2(0)
135 .clampScalar(0, s32, s64)
136 .clampMaxNumElements(0, s8, 16)
137 .clampMaxNumElements(0, s16, 8)
138 .clampNumElements(0, v2s32, v4s32)
139 .clampNumElements(0, v2s64, v2s64)
140 .minScalarOrEltIf(
141 [=](const LegalityQuery &Query) {
142 return Query.Types[0].getNumElements() <= 2;
144 0, s32)
145 .minScalarOrEltIf(
146 [=](const LegalityQuery &Query) {
147 return Query.Types[0].getNumElements() <= 4;
149 0, s16)
150 .minScalarOrEltIf(
151 [=](const LegalityQuery &Query) {
152 return Query.Types[0].getNumElements() <= 16;
154 0, s8)
155 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
156 .moreElementsToNextPow2(0);
158 getActionDefinitionsBuilder(G_MUL)
159 .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
160 .widenScalarToNextPow2(0)
161 .clampScalar(0, s32, s64)
162 .clampMaxNumElements(0, s8, 16)
163 .clampMaxNumElements(0, s16, 8)
164 .clampNumElements(0, v2s32, v4s32)
165 .clampNumElements(0, v2s64, v2s64)
166 .minScalarOrEltIf(
167 [=](const LegalityQuery &Query) {
168 return Query.Types[0].getNumElements() <= 2;
170 0, s32)
171 .minScalarOrEltIf(
172 [=](const LegalityQuery &Query) {
173 return Query.Types[0].getNumElements() <= 4;
175 0, s16)
176 .minScalarOrEltIf(
177 [=](const LegalityQuery &Query) {
178 return Query.Types[0].getNumElements() <= 16;
180 0, s8)
181 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
182 .moreElementsToNextPow2(0);
184 getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR})
185 .customIf([=](const LegalityQuery &Query) {
186 const auto &SrcTy = Query.Types[0];
187 const auto &AmtTy = Query.Types[1];
188 return !SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
189 AmtTy.getSizeInBits() == 32;
191 .legalFor({
192 {s32, s32},
193 {s32, s64},
194 {s64, s64},
195 {v8s8, v8s8},
196 {v16s8, v16s8},
197 {v4s16, v4s16},
198 {v8s16, v8s16},
199 {v2s32, v2s32},
200 {v4s32, v4s32},
201 {v2s64, v2s64},
203 .widenScalarToNextPow2(0)
204 .clampScalar(1, s32, s64)
205 .clampScalar(0, s32, s64)
206 .clampNumElements(0, v8s8, v16s8)
207 .clampNumElements(0, v4s16, v8s16)
208 .clampNumElements(0, v2s32, v4s32)
209 .clampNumElements(0, v2s64, v2s64)
210 .moreElementsToNextPow2(0)
211 .minScalarSameAs(1, 0)
212 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
214 getActionDefinitionsBuilder(G_PTR_ADD)
215 .legalFor({{p0, s64}, {v2p0, v2s64}})
216 .clampScalarOrElt(1, s64, s64)
217 .clampNumElements(0, v2p0, v2p0);
219 getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
221 getActionDefinitionsBuilder({G_SDIV, G_UDIV})
222 .legalFor({s32, s64})
223 .libcallFor({s128})
224 .clampScalar(0, s32, s64)
225 .widenScalarToNextPow2(0)
226 .scalarize(0);
228 getActionDefinitionsBuilder({G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
229 .lowerFor({s8, s16, s32, s64, v2s64, v4s32, v2s32})
230 .libcallFor({s128})
231 .widenScalarOrEltToNextPow2(0)
232 .minScalarOrElt(0, s32)
233 .clampNumElements(0, v2s32, v4s32)
234 .clampNumElements(0, v2s64, v2s64)
235 .scalarize(0);
237 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
238 .widenScalarToNextPow2(0, /*Min = */ 32)
239 .clampScalar(0, s32, s64)
240 .lower();
242 getActionDefinitionsBuilder({G_SMULH, G_UMULH})
243 .legalFor({s64, v8s16, v16s8, v4s32})
244 .lower();
246 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
247 .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
248 .legalFor(HasCSSC, {s32, s64})
249 .minScalar(HasCSSC, 0, s32)
250 .clampNumElements(0, v8s8, v16s8)
251 .clampNumElements(0, v4s16, v8s16)
252 .clampNumElements(0, v2s32, v4s32)
253 // FIXME: This shouldn't be needed as v2s64 types are going to
254 // be expanded anyway, but G_ICMP doesn't support splitting vectors yet
255 .clampNumElements(0, v2s64, v2s64)
256 .lower();
258 getActionDefinitionsBuilder(
259 {G_SADDE, G_SSUBE, G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO, G_USUBO})
260 .legalFor({{s32, s32}, {s64, s32}})
261 .clampScalar(0, s32, s64)
262 .clampScalar(1, s32, s64)
263 .widenScalarToNextPow2(0);
265 getActionDefinitionsBuilder(
266 {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM,
267 G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT,
268 G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
269 .legalFor({s32, s64, v2s32, v4s32, v2s64})
270 .legalFor(HasFP16, {s16, v4s16, v8s16})
271 .libcallFor({s128})
272 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
273 .minScalarOrElt(0, MinFPScalar)
274 .clampNumElements(0, v4s16, v8s16)
275 .clampNumElements(0, v2s32, v4s32)
276 .clampNumElements(0, v2s64, v2s64)
277 .moreElementsToNextPow2(0);
279 getActionDefinitionsBuilder({G_FABS, G_FNEG})
280 .legalFor({s32, s64, v2s32, v4s32, v2s64})
281 .legalFor(HasFP16, {s16, v4s16, v8s16})
282 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
283 .lowerIf(scalarOrEltWiderThan(0, 64))
284 .clampNumElements(0, v4s16, v8s16)
285 .clampNumElements(0, v2s32, v4s32)
286 .clampNumElements(0, v2s64, v2s64)
287 .moreElementsToNextPow2(0)
288 .lowerFor({s16, v4s16, v8s16});
290 getActionDefinitionsBuilder(G_FREM)
291 .libcallFor({s32, s64, s128})
292 .minScalar(0, s32)
293 .scalarize(0);
295 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
296 .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}})
297 .libcallFor({{s64, s128}})
298 .minScalarOrElt(1, MinFPScalar);
300 getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2,
301 G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10,
302 G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH,
303 G_FSINH, G_FTANH})
304 // We need a call for these, so we always need to scalarize.
305 .scalarize(0)
306 // Regardless of FP16 support, widen 16-bit elements to 32-bits.
307 .minScalar(0, s32)
308 .libcallFor({s32, s64, s128});
309 getActionDefinitionsBuilder(G_FPOWI)
310 .scalarize(0)
311 .minScalar(0, s32)
312 .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}});
314 getActionDefinitionsBuilder(G_INSERT)
315 .legalIf(all(typeInSet(0, {s32, s64, p0}),
316 typeInSet(1, {s8, s16, s32}), smallerThan(1, 0)))
317 .widenScalarToNextPow2(0)
318 .clampScalar(0, s32, s64)
319 .widenScalarToNextPow2(1)
320 .minScalar(1, s8)
321 .maxScalarIf(typeInSet(0, {s32}), 1, s16)
322 .maxScalarIf(typeInSet(0, {s64, p0}), 1, s32);
324 getActionDefinitionsBuilder(G_EXTRACT)
325 .legalIf(all(typeInSet(0, {s16, s32, s64, p0}),
326 typeInSet(1, {s32, s64, s128, p0}), smallerThan(0, 1)))
327 .widenScalarToNextPow2(1)
328 .clampScalar(1, s32, s128)
329 .widenScalarToNextPow2(0)
330 .minScalar(0, s16)
331 .maxScalarIf(typeInSet(1, {s32}), 0, s16)
332 .maxScalarIf(typeInSet(1, {s64, p0}), 0, s32)
333 .maxScalarIf(typeInSet(1, {s128}), 0, s64);
336 for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
337 auto &Actions = getActionDefinitionsBuilder(Op);
339 if (Op == G_SEXTLOAD)
340 Actions.lowerIf(atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered));
342 // Atomics have zero extending behavior.
343 Actions
344 .legalForTypesWithMemDesc({{s32, p0, s8, 8},
345 {s32, p0, s16, 8},
346 {s32, p0, s32, 8},
347 {s64, p0, s8, 2},
348 {s64, p0, s16, 2},
349 {s64, p0, s32, 4},
350 {s64, p0, s64, 8},
351 {p0, p0, s64, 8},
352 {v2s32, p0, s64, 8}})
353 .widenScalarToNextPow2(0)
354 .clampScalar(0, s32, s64)
355 // TODO: We could support sum-of-pow2's but the lowering code doesn't know
356 // how to do that yet.
357 .unsupportedIfMemSizeNotPow2()
358 // Lower anything left over into G_*EXT and G_LOAD
359 .lower();
360 }
362 auto IsPtrVecPred = [=](const LegalityQuery &Query) {
363 const LLT &ValTy = Query.Types[0];
364 return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0;
365 };
367 getActionDefinitionsBuilder(G_LOAD)
368 .customIf([=](const LegalityQuery &Query) {
369 return HasRCPC3 && Query.Types[0] == s128 &&
370 Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire;
372 .customIf([=](const LegalityQuery &Query) {
373 return Query.Types[0] == s128 &&
374 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
376 .legalForTypesWithMemDesc({{s8, p0, s8, 8},
377 {s16, p0, s16, 8},
378 {s32, p0, s32, 8},
379 {s64, p0, s64, 8},
380 {p0, p0, s64, 8},
381 {s128, p0, s128, 8},
382 {v8s8, p0, s64, 8},
383 {v16s8, p0, s128, 8},
384 {v4s16, p0, s64, 8},
385 {v8s16, p0, s128, 8},
386 {v2s32, p0, s64, 8},
387 {v4s32, p0, s128, 8},
388 {v2s64, p0, s128, 8}})
389 // These extends are also legal
390 .legalForTypesWithMemDesc(
391 {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})
392 .legalForTypesWithMemDesc({
393 // SVE vscale x 128 bit base sizes
394 {nxv16s8, p0, nxv16s8, 8},
395 {nxv8s16, p0, nxv8s16, 8},
396 {nxv4s32, p0, nxv4s32, 8},
397 {nxv2s64, p0, nxv2s64, 8},
399 .widenScalarToNextPow2(0, /* MinSize = */ 8)
400 .clampMaxNumElements(0, s8, 16)
401 .clampMaxNumElements(0, s16, 8)
402 .clampMaxNumElements(0, s32, 4)
403 .clampMaxNumElements(0, s64, 2)
404 .clampMaxNumElements(0, p0, 2)
405 .lowerIfMemSizeNotByteSizePow2()
406 .clampScalar(0, s8, s64)
407 .narrowScalarIf(
408 [=](const LegalityQuery &Query) {
409 // Clamp extending load results to 32-bits.
410 return Query.Types[0].isScalar() &&
411 Query.Types[0] != Query.MMODescrs[0].MemoryTy &&
412 Query.Types[0].getSizeInBits() > 32;
414 changeTo(0, s32))
415 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
416 .bitcastIf(typeInSet(0, {v4s8}),
417 [=](const LegalityQuery &Query) {
418 const LLT VecTy = Query.Types[0];
419 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
421 .customIf(IsPtrVecPred)
422 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
423 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
425 getActionDefinitionsBuilder(G_STORE)
426 .customIf([=](const LegalityQuery &Query) {
427 return HasRCPC3 && Query.Types[0] == s128 &&
428 Query.MMODescrs[0].Ordering == AtomicOrdering::Release;
430 .customIf([=](const LegalityQuery &Query) {
431 return Query.Types[0] == s128 &&
432 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
434 .legalForTypesWithMemDesc(
435 {{s8, p0, s8, 8}, {s16, p0, s8, 8}, // truncstorei8 from s16
436 {s32, p0, s8, 8}, // truncstorei8 from s32
437 {s64, p0, s8, 8}, // truncstorei8 from s64
438 {s16, p0, s16, 8}, {s32, p0, s16, 8}, // truncstorei16 from s32
439 {s64, p0, s16, 8}, // truncstorei16 from s64
440 {s32, p0, s8, 8}, {s32, p0, s16, 8}, {s32, p0, s32, 8},
441 {s64, p0, s64, 8}, {s64, p0, s32, 8}, // truncstorei32 from s64
442 {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
443 {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
444 {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
445 .legalForTypesWithMemDesc({
446 // SVE vscale x 128 bit base sizes
447 // TODO: Add nxv2p0. Consider bitcastIf.
448 // See #92130
449 // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
450 {nxv16s8, p0, nxv16s8, 8},
451 {nxv8s16, p0, nxv8s16, 8},
452 {nxv4s32, p0, nxv4s32, 8},
453 {nxv2s64, p0, nxv2s64, 8},
455 .clampScalar(0, s8, s64)
456 .lowerIf([=](const LegalityQuery &Query) {
457 return Query.Types[0].isScalar() &&
458 Query.Types[0] != Query.MMODescrs[0].MemoryTy;
460 // Maximum: sN * k = 128
461 .clampMaxNumElements(0, s8, 16)
462 .clampMaxNumElements(0, s16, 8)
463 .clampMaxNumElements(0, s32, 4)
464 .clampMaxNumElements(0, s64, 2)
465 .clampMaxNumElements(0, p0, 2)
466 .lowerIfMemSizeNotPow2()
467 // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
468 .bitcastIf(typeInSet(0, {v4s8}),
469 [=](const LegalityQuery &Query) {
470 const LLT VecTy = Query.Types[0];
471 return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
473 .customIf(IsPtrVecPred)
474 .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0)
475 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0);
477 getActionDefinitionsBuilder(G_INDEXED_STORE)
478 // Idx 0 == Ptr, Idx 1 == Val
479 // TODO: we can implement legalizations but as of now these are
480 // generated in a very specific way.
481 .legalForTypesWithMemDesc({
482 {p0, s8, s8, 8},
483 {p0, s16, s16, 8},
484 {p0, s32, s8, 8},
485 {p0, s32, s16, 8},
486 {p0, s32, s32, 8},
487 {p0, s64, s64, 8},
488 {p0, p0, p0, 8},
489 {p0, v8s8, v8s8, 8},
490 {p0, v16s8, v16s8, 8},
491 {p0, v4s16, v4s16, 8},
492 {p0, v8s16, v8s16, 8},
493 {p0, v2s32, v2s32, 8},
494 {p0, v4s32, v4s32, 8},
495 {p0, v2s64, v2s64, 8},
496 {p0, v2p0, v2p0, 8},
497 {p0, s128, s128, 8},
499 .unsupported();
501 auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
502 LLT LdTy = Query.Types[0];
503 LLT PtrTy = Query.Types[1];
504 if (!llvm::is_contained(PackedVectorAllTypesVec, LdTy) &&
505 !llvm::is_contained(ScalarAndPtrTypesVec, LdTy) && LdTy != s128)
506 return false;
507 if (PtrTy != p0)
508 return false;
509 return true;
510 };
511 getActionDefinitionsBuilder(G_INDEXED_LOAD)
512 .unsupportedIf(
513 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
514 .legalIf(IndexedLoadBasicPred)
515 .unsupported();
516 getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
517 .unsupportedIf(
518 atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
519 .legalIf(all(typeInSet(0, {s16, s32, s64}),
520 LegalityPredicate([=](const LegalityQuery &Q) {
521 LLT LdTy = Q.Types[0];
522 LLT PtrTy = Q.Types[1];
523 LLT MemTy = Q.MMODescrs[0].MemoryTy;
524 if (PtrTy != p0)
525 return false;
526 if (LdTy == s16)
527 return MemTy == s8;
528 if (LdTy == s32)
529 return MemTy == s8 || MemTy == s16;
530 if (LdTy == s64)
531 return MemTy == s8 || MemTy == s16 || MemTy == s32;
532 return false;
533 })))
534 .unsupported();
536 // Constants
537 getActionDefinitionsBuilder(G_CONSTANT)
538 .legalFor({p0, s8, s16, s32, s64})
539 .widenScalarToNextPow2(0)
540 .clampScalar(0, s8, s64);
541 getActionDefinitionsBuilder(G_FCONSTANT)
542 .legalFor({s32, s64, s128})
543 .legalFor(HasFP16, {s16})
544 .clampScalar(0, MinFPScalar, s128);
546 // FIXME: fix moreElementsToNextPow2
547 getActionDefinitionsBuilder(G_ICMP)
548 .legalFor({{s32, s32}, {s32, s64}, {s32, p0}})
549 .widenScalarOrEltToNextPow2(1)
550 .clampScalar(1, s32, s64)
551 .clampScalar(0, s32, s32)
552 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
553 .minScalarEltSameAsIf(
554 [=](const LegalityQuery &Query) {
555 const LLT &Ty = Query.Types[0];
556 const LLT &SrcTy = Query.Types[1];
557 return Ty.isVector() && !SrcTy.isPointerVector() &&
558 Ty.getElementType() != SrcTy.getElementType();
560 0, 1)
561 .minScalarOrEltIf(
562 [=](const LegalityQuery &Query) { return Query.Types[1] == v2s16; },
563 1, s32)
564 .minScalarOrEltIf(
565 [=](const LegalityQuery &Query) { return Query.Types[1] == v2p0; }, 0,
566 s64)
567 .moreElementsToNextPow2(1)
568 .clampNumElements(1, v8s8, v16s8)
569 .clampNumElements(1, v4s16, v8s16)
570 .clampNumElements(1, v2s32, v4s32)
571 .clampNumElements(1, v2s64, v2s64)
572 .customIf(isVector(0));
574 getActionDefinitionsBuilder(G_FCMP)
575 .legalFor({{s32, s32},
576 {s32, s64},
577 {v4s32, v4s32},
578 {v2s32, v2s32},
579 {v2s64, v2s64}})
580 .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
581 .widenScalarOrEltToNextPow2(1)
582 .clampScalar(0, s32, s32)
583 .minScalarOrElt(1, MinFPScalar)
584 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
585 .minScalarEltSameAsIf(
586 [=](const LegalityQuery &Query) {
587 const LLT &Ty = Query.Types[0];
588 const LLT &SrcTy = Query.Types[1];
589 return Ty.isVector() && !SrcTy.isPointerVector() &&
590 Ty.getElementType() != SrcTy.getElementType();
592 0, 1)
593 .clampNumElements(1, v4s16, v8s16)
594 .clampNumElements(1, v2s32, v4s32)
595 .clampMaxNumElements(1, s64, 2)
596 .moreElementsToNextPow2(1)
597 .libcallFor({{s32, s128}});
599 // Extensions
600 auto ExtLegalFunc = [=](const LegalityQuery &Query) {
601 unsigned DstSize = Query.Types[0].getSizeInBits();
603 // Handle legal vectors using legalFor
604 if (Query.Types[0].isVector())
605 return false;
607 if (DstSize < 8 || DstSize >= 128 || !isPowerOf2_32(DstSize))
608 return false; // Extending to a scalar s128 needs narrowing.
610 const LLT &SrcTy = Query.Types[1];
612 // Make sure we fit in a register otherwise. Don't bother checking that
613 // the source type is below 128 bits. We shouldn't be allowing anything
614 // through which is wider than the destination in the first place.
615 unsigned SrcSize = SrcTy.getSizeInBits();
616 if (SrcSize < 8 || !isPowerOf2_32(SrcSize))
617 return false;
619 return true;
620 };
621 getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
622 .legalIf(ExtLegalFunc)
623 .legalFor({{v2s64, v2s32}, {v4s32, v4s16}, {v8s16, v8s8}})
624 .clampScalar(0, s64, s64) // Just for s128, others are handled above.
625 .moreElementsToNextPow2(0)
626 .clampMaxNumElements(1, s8, 8)
627 .clampMaxNumElements(1, s16, 4)
628 .clampMaxNumElements(1, s32, 2)
629 // Tries to convert a large EXTEND into two smaller EXTENDs
630 .lowerIf([=](const LegalityQuery &Query) {
631 return (Query.Types[0].getScalarSizeInBits() >
632 Query.Types[1].getScalarSizeInBits() * 2) &&
633 Query.Types[0].isVector() &&
634 (Query.Types[1].getScalarSizeInBits() == 8 ||
635 Query.Types[1].getScalarSizeInBits() == 16);
637 .clampMinNumElements(1, s8, 8)
638 .clampMinNumElements(1, s16, 4);
640 getActionDefinitionsBuilder(G_TRUNC)
641 .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
642 .moreElementsToNextPow2(0)
643 .clampMaxNumElements(0, s8, 8)
644 .clampMaxNumElements(0, s16, 4)
645 .clampMaxNumElements(0, s32, 2)
646 .minScalarOrEltIf(
647 [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
648 0, s8)
649 .lowerIf([=](const LegalityQuery &Query) {
650 LLT DstTy = Query.Types[0];
651 LLT SrcTy = Query.Types[1];
652 return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
653 DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
655 .clampMinNumElements(0, s8, 8)
656 .clampMinNumElements(0, s16, 4)
657 .alwaysLegal();
659 getActionDefinitionsBuilder(G_SEXT_INREG)
660 .legalFor({s32, s64})
661 .legalFor(PackedVectorAllTypeList)
662 .maxScalar(0, s64)
663 .clampNumElements(0, v8s8, v16s8)
664 .clampNumElements(0, v4s16, v8s16)
665 .clampNumElements(0, v2s32, v4s32)
666 .clampMaxNumElements(0, s64, 2)
667 .lower();
669 // FP conversions
670 getActionDefinitionsBuilder(G_FPTRUNC)
671 .legalFor(
672 {{s16, s32}, {s16, s64}, {s32, s64}, {v4s16, v4s32}, {v2s32, v2s64}})
673 .libcallFor({{s16, s128}, {s32, s128}, {s64, s128}})
674 .clampNumElements(0, v4s16, v4s16)
675 .clampNumElements(0, v2s32, v2s32)
676 .scalarize(0);
678 getActionDefinitionsBuilder(G_FPEXT)
679 .legalFor(
680 {{s32, s16}, {s64, s16}, {s64, s32}, {v4s32, v4s16}, {v2s64, v2s32}})
681 .libcallFor({{s128, s64}, {s128, s32}, {s128, s16}})
682 .clampNumElements(0, v4s32, v4s32)
683 .clampNumElements(0, v2s64, v2s64)
684 .scalarize(0);
686 // Conversions
687 getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
688 .legalFor({{s32, s32},
689 {s64, s32},
690 {s32, s64},
691 {s64, s64},
692 {v2s64, v2s64},
693 {v4s32, v4s32},
694 {v2s32, v2s32}})
695 .legalFor(HasFP16,
696 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
697 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
698 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
699 // The range of a fp16 value fits into an i17, so we can lower the width
700 // to i64.
701 .narrowScalarIf(
702 [=](const LegalityQuery &Query) {
703 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
705 changeTo(0, s64))
706 .moreElementsToNextPow2(0)
707 .widenScalarOrEltToNextPow2OrMinSize(0)
708 .minScalar(0, s32)
709 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
710 .widenScalarIf(
711 [=](const LegalityQuery &Query) {
712 return Query.Types[0].getScalarSizeInBits() <= 64 &&
713 Query.Types[0].getScalarSizeInBits() >
714 Query.Types[1].getScalarSizeInBits();
716 LegalizeMutations::changeElementSizeTo(1, 0))
717 .widenScalarIf(
718 [=](const LegalityQuery &Query) {
719 return Query.Types[1].getScalarSizeInBits() <= 64 &&
720 Query.Types[0].getScalarSizeInBits() <
721 Query.Types[1].getScalarSizeInBits();
723 LegalizeMutations::changeElementSizeTo(0, 1))
724 .clampNumElements(0, v4s16, v8s16)
725 .clampNumElements(0, v2s32, v4s32)
726 .clampMaxNumElements(0, s64, 2)
727 .libcallFor(
728 {{s32, s128}, {s64, s128}, {s128, s128}, {s128, s32}, {s128, s64}});
730 getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
731 .legalFor({{s32, s32},
732 {s64, s32},
733 {s32, s64},
734 {s64, s64},
735 {v2s64, v2s64},
736 {v4s32, v4s32},
737 {v2s32, v2s32}})
738 .legalFor(HasFP16,
739 {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}})
740 // Handle types larger than i64 by scalarizing/lowering.
741 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
742 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
743 // The range of a fp16 value fits into an i17, so we can lower the width
744 // to i64.
745 .narrowScalarIf(
746 [=](const LegalityQuery &Query) {
747 return Query.Types[1] == s16 && Query.Types[0].getSizeInBits() > 64;
749 changeTo(0, s64))
750 .lowerIf(::any(scalarWiderThan(0, 64), scalarWiderThan(1, 64)), 0)
751 .moreElementsToNextPow2(0)
752 .widenScalarToNextPow2(0, /*MinSize=*/32)
753 .minScalar(0, s32)
754 .widenScalarOrEltToNextPow2OrMinSize(1, /*MinSize=*/HasFP16 ? 16 : 32)
755 .widenScalarIf(
756 [=](const LegalityQuery &Query) {
757 unsigned ITySize = Query.Types[0].getScalarSizeInBits();
758 return (ITySize == 16 || ITySize == 32 || ITySize == 64) &&
759 ITySize > Query.Types[1].getScalarSizeInBits();
761 LegalizeMutations::changeElementSizeTo(1, 0))
762 .widenScalarIf(
763 [=](const LegalityQuery &Query) {
764 unsigned FTySize = Query.Types[1].getScalarSizeInBits();
765 return (FTySize == 16 || FTySize == 32 || FTySize == 64) &&
766 Query.Types[0].getScalarSizeInBits() < FTySize;
768 LegalizeMutations::changeElementSizeTo(0, 1))
769 .widenScalarOrEltToNextPow2(0)
770 .clampNumElements(0, v4s16, v8s16)
771 .clampNumElements(0, v2s32, v4s32)
772 .clampMaxNumElements(0, s64, 2);
774 getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
775 .legalFor({{s32, s32},
776 {s64, s32},
777 {s32, s64},
778 {s64, s64},
779 {v2s64, v2s64},
780 {v4s32, v4s32},
781 {v2s32, v2s32}})
782 .legalFor(HasFP16,
783 {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}})
784 .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
785 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
786 .moreElementsToNextPow2(1)
787 .widenScalarOrEltToNextPow2OrMinSize(1)
788 .minScalar(1, s32)
789 .widenScalarOrEltToNextPow2OrMinSize(0, /*MinSize=*/HasFP16 ? 16 : 32)
790 .widenScalarIf(
791 [=](const LegalityQuery &Query) {
792 return Query.Types[1].getScalarSizeInBits() <= 64 &&
793 Query.Types[0].getScalarSizeInBits() <
794 Query.Types[1].getScalarSizeInBits();
796 LegalizeMutations::changeElementSizeTo(0, 1))
797 .widenScalarIf(
798 [=](const LegalityQuery &Query) {
799 return Query.Types[0].getScalarSizeInBits() <= 64 &&
800 Query.Types[0].getScalarSizeInBits() >
801 Query.Types[1].getScalarSizeInBits();
803 LegalizeMutations::changeElementSizeTo(1, 0))
804 .clampNumElements(0, v4s16, v8s16)
805 .clampNumElements(0, v2s32, v4s32)
806 .clampMaxNumElements(0, s64, 2)
807 .libcallFor({{s16, s128},
808 {s32, s128},
809 {s64, s128},
810 {s128, s128},
811 {s128, s32},
812 {s128, s64}});
814 // Control-flow
815 getActionDefinitionsBuilder(G_BRCOND)
816 .legalFor({s32})
817 .clampScalar(0, s32, s32);
818 getActionDefinitionsBuilder(G_BRINDIRECT).legalFor({p0});
820 getActionDefinitionsBuilder(G_SELECT)
821 .legalFor({{s32, s32}, {s64, s32}, {p0, s32}})
822 .widenScalarToNextPow2(0)
823 .clampScalar(0, s32, s64)
824 .clampScalar(1, s32, s32)
825 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
826 .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
827 .lowerIf(isVector(0));
829 // Pointer-handling
830 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
832 if (TM.getCodeModel() == CodeModel::Small)
833 getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
834 else
835 getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
837 getActionDefinitionsBuilder(G_PTRAUTH_GLOBAL_VALUE)
838 .legalIf(all(typeIs(0, p0), typeIs(1, p0)));
840 getActionDefinitionsBuilder(G_PTRTOINT)
841 .legalFor({{s64, p0}, {v2s64, v2p0}})
842 .widenScalarToNextPow2(0, 64)
843 .clampScalar(0, s64, s64)
844 .clampMaxNumElements(0, s64, 2);
846 getActionDefinitionsBuilder(G_INTTOPTR)
847 .unsupportedIf([&](const LegalityQuery &Query) {
848 return Query.Types[0].getSizeInBits() != Query.Types[1].getSizeInBits();
850 .legalFor({{p0, s64}, {v2p0, v2s64}})
851 .clampMaxNumElements(1, s64, 2);
853 // Casts for 32 and 64-bit width type are just copies.
854 // Same for 128-bit width type, except they are on the FPR bank.
855 getActionDefinitionsBuilder(G_BITCAST)
856 // Keeping 32-bit instructions legal to prevent regression in some tests
857 .legalForCartesianProduct({s32, v2s16, v4s8})
858 .legalForCartesianProduct({s64, v8s8, v4s16, v2s32})
859 .legalForCartesianProduct({s128, v16s8, v8s16, v4s32, v2s64, v2p0})
860 .lowerIf([=](const LegalityQuery &Query) {
861 return Query.Types[0].isVector() != Query.Types[1].isVector();
863 .moreElementsToNextPow2(0)
864 .clampNumElements(0, v8s8, v16s8)
865 .clampNumElements(0, v4s16, v8s16)
866 .clampNumElements(0, v2s32, v4s32)
867 .lower();
869 getActionDefinitionsBuilder(G_VASTART).legalFor({p0});
871 // va_list must be a pointer, but most sized types are pretty easy to handle
872 // as the destination.
873 getActionDefinitionsBuilder(G_VAARG)
874 .customForCartesianProduct({s8, s16, s32, s64, p0}, {p0})
875 .clampScalar(0, s8, s64)
876 .widenScalarToNextPow2(0, /*Min*/ 8);
878 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
879 .lowerIf(
880 all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0)));
882 bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
884 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
885 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
886 .customFor(!UseOutlineAtomics, {{s128, p0}})
887 .libcallFor(UseOutlineAtomics,
888 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}})
889 .clampScalar(0, s32, s64);
891 getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD,
892 G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR,
893 G_ATOMICRMW_XOR})
894 .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
895 .libcallFor(UseOutlineAtomics,
896 {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}})
897 .clampScalar(0, s32, s64);
899 // Do not outline these atomic operations, as per the comment in
900 // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().
901 getActionDefinitionsBuilder(
902 {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})
903 .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))
904 .clampScalar(0, s32, s64);
906 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({p0});
908 // Merge/Unmerge
909 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
910 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
911 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
912 getActionDefinitionsBuilder(Op)
913 .widenScalarToNextPow2(LitTyIdx, 8)
914 .widenScalarToNextPow2(BigTyIdx, 32)
915 .clampScalar(LitTyIdx, s8, s64)
916 .clampScalar(BigTyIdx, s32, s128)
917 .legalIf([=](const LegalityQuery &Q) {
918 switch (Q.Types[BigTyIdx].getSizeInBits()) {
919 case 32:
920 case 64:
921 case 128:
922 break;
923 default:
924 return false;
925 }
926 switch (Q.Types[LitTyIdx].getSizeInBits()) {
927 case 8:
928 case 16:
929 case 32:
930 case 64:
931 return true;
932 default:
933 return false;
934 }
935 });
936 }
938 // TODO : nxv4s16, nxv2s16, nxv2s32
939 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
940 .legalFor(HasSVE, {{s16, nxv16s8, s64},
941 {s16, nxv8s16, s64},
942 {s32, nxv4s32, s64},
943 {s64, nxv2s64, s64}})
944 .unsupportedIf([=](const LegalityQuery &Query) {
945 const LLT &EltTy = Query.Types[1].getElementType();
946 if (Query.Types[1].isScalableVector())
947 return false;
948 return Query.Types[0] != EltTy;
950 .minScalar(2, s64)
951 .customIf([=](const LegalityQuery &Query) {
952 const LLT &VecTy = Query.Types[1];
953 return VecTy == v2s16 || VecTy == v4s16 || VecTy == v8s16 ||
954 VecTy == v4s32 || VecTy == v2s64 || VecTy == v2s32 ||
955 VecTy == v8s8 || VecTy == v16s8 || VecTy == v2p0;
957 .minScalarOrEltIf(
958 [=](const LegalityQuery &Query) {
959 // We want to promote <M x s1> to <M x s64> if that wouldn't
960 // cause the total vec size to be > 128b.
961 return Query.Types[1].isFixedVector() &&
962 Query.Types[1].getNumElements() <= 2;
964 0, s64)
965 .minScalarOrEltIf(
966 [=](const LegalityQuery &Query) {
967 return Query.Types[1].isFixedVector() &&
968 Query.Types[1].getNumElements() <= 4;
970 0, s32)
971 .minScalarOrEltIf(
972 [=](const LegalityQuery &Query) {
973 return Query.Types[1].isFixedVector() &&
974 Query.Types[1].getNumElements() <= 8;
976 0, s16)
977 .minScalarOrEltIf(
978 [=](const LegalityQuery &Query) {
979 return Query.Types[1].isFixedVector() &&
980 Query.Types[1].getNumElements() <= 16;
982 0, s8)
983 .minScalarOrElt(0, s8) // Worst case, we need at least s8.
984 .moreElementsToNextPow2(1)
985 .clampMaxNumElements(1, s64, 2)
986 .clampMaxNumElements(1, s32, 4)
987 .clampMaxNumElements(1, s16, 8)
988 .clampMaxNumElements(1, s8, 16)
989 .clampMaxNumElements(1, p0, 2);
991 getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT)
992 .legalIf(
993 typeInSet(0, {v16s8, v8s8, v8s16, v4s16, v4s32, v2s32, v2s64, v2p0}))
994 .legalFor(HasSVE, {{nxv16s8, s32, s64},
995 {nxv8s16, s32, s64},
996 {nxv4s32, s32, s64},
997 {nxv2s64, s64, s64}})
998 .moreElementsToNextPow2(0)
999 .widenVectorEltsToVectorMinSize(0, 64)
1000 .clampNumElements(0, v8s8, v16s8)
1001 .clampNumElements(0, v4s16, v8s16)
1002 .clampNumElements(0, v2s32, v4s32)
1003 .clampMaxNumElements(0, s64, 2)
1004 .clampMaxNumElements(0, p0, 2);
1006 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1007 .legalFor({{v8s8, s8},
1008 {v16s8, s8},
1009 {v4s16, s16},
1010 {v8s16, s16},
1011 {v2s32, s32},
1012 {v4s32, s32},
1013 {v2p0, p0},
1014 {v2s64, s64}})
1015 .clampNumElements(0, v4s32, v4s32)
1016 .clampNumElements(0, v2s64, v2s64)
1017 .minScalarOrElt(0, s8)
1018 .widenVectorEltsToVectorMinSize(0, 64)
1019 .widenScalarOrEltToNextPow2(0)
1020 .minScalarSameAs(1, 0);
1022 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
1024 getActionDefinitionsBuilder(G_CTLZ)
1025 .legalForCartesianProduct(
1026 {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
1027 .scalarize(1)
1028 .widenScalarToNextPow2(1, /*Min=*/32)
1029 .clampScalar(1, s32, s64)
1030 .scalarSameSizeAs(0, 1);
1031 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF).lower();
1033 // TODO: Custom lowering for v2s32, v4s32, v2s64.
1034 getActionDefinitionsBuilder(G_BITREVERSE)
1035 .legalFor({s32, s64, v8s8, v16s8})
1036 .widenScalarToNextPow2(0, /*Min = */ 32)
1037 .clampScalar(0, s32, s64);
1039 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
1041 getActionDefinitionsBuilder(G_CTTZ)
1042 .lowerIf(isVector(0))
1043 .widenScalarToNextPow2(1, /*Min=*/32)
1044 .clampScalar(1, s32, s64)
1045 .scalarSameSizeAs(0, 1)
1046 .legalFor(HasCSSC, {s32, s64})
1047 .customFor(!HasCSSC, {s32, s64});
1049 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1050 .legalIf([=](const LegalityQuery &Query) {
1051 const LLT &DstTy = Query.Types[0];
1052 const LLT &SrcTy = Query.Types[1];
1053 // For now just support the TBL2 variant which needs the source vectors
1054 // to be the same size as the dest.
1055 if (DstTy != SrcTy)
1056 return false;
1057 return llvm::is_contained(
1058 {v2s64, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
1060 // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
1061 // just want those lowered into G_BUILD_VECTOR
1062 .lowerIf([=](const LegalityQuery &Query) {
1063 return !Query.Types[1].isVector();
1065 .moreElementsIf(
1066 [](const LegalityQuery &Query) {
1067 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1068 Query.Types[0].getNumElements() >
1069 Query.Types[1].getNumElements();
1071 changeTo(1, 0))
1072 .moreElementsToNextPow2(0)
1073 .moreElementsIf(
1074 [](const LegalityQuery &Query) {
1075 return Query.Types[0].isVector() && Query.Types[1].isVector() &&
1076 Query.Types[0].getNumElements() <
1077 Query.Types[1].getNumElements();
1079 changeTo(0, 1))
1080 .widenScalarOrEltToNextPow2OrMinSize(0, 8)
1081 .clampNumElements(0, v8s8, v16s8)
1082 .clampNumElements(0, v4s16, v8s16)
1083 .clampNumElements(0, v4s32, v4s32)
1084 .clampNumElements(0, v2s64, v2s64)
1085 .bitcastIf(isPointerVector(0), [=](const LegalityQuery &Query) {
1086 // Bitcast pointer vectors to i64.
1087 const LLT DstTy = Query.Types[0];
1088 return std::pair(0, LLT::vector(DstTy.getElementCount(), 64));
1091 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1092 .legalFor({{v4s32, v2s32}, {v8s16, v4s16}, {v16s8, v8s8}})
1093 .bitcastIf(
1094 [=](const LegalityQuery &Query) {
1095 return Query.Types[0].getSizeInBits() <= 128 &&
1096 Query.Types[1].getSizeInBits() <= 64;
1098 [=](const LegalityQuery &Query) {
1099 const LLT DstTy = Query.Types[0];
1100 const LLT SrcTy = Query.Types[1];
1101 return std::pair(
1102 0, DstTy.changeElementSize(SrcTy.getSizeInBits())
1103 .changeElementCount(
1104 DstTy.getElementCount().divideCoefficientBy(
1105 SrcTy.getNumElements())));
1108 getActionDefinitionsBuilder(G_JUMP_TABLE).legalFor({p0});
1110 getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
1112 getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
1114 getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
1116 if (ST.hasMOPS()) {
1117 // G_BZERO is not supported. Currently it is only emitted by
1118 // PreLegalizerCombiner for G_MEMSET with zero constant.
1119 getActionDefinitionsBuilder(G_BZERO).unsupported();
1121 getActionDefinitionsBuilder(G_MEMSET)
1122 .legalForCartesianProduct({p0}, {s64}, {s64})
1123 .customForCartesianProduct({p0}, {s8}, {s64})
1124 .immIdx(0); // Inform verifier imm idx 0 is handled.
1126 getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
1127 .legalForCartesianProduct({p0}, {p0}, {s64})
1128 .immIdx(0); // Inform verifier imm idx 0 is handled.
1130 // G_MEMCPY_INLINE does not have a tailcall immediate
1131 getActionDefinitionsBuilder(G_MEMCPY_INLINE)
1132 .legalForCartesianProduct({p0}, {p0}, {s64});
1134 } else {
1135 getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
1136 .libcall();
1139 // FIXME: Legal vector types are only legal with NEON.
1140 getActionDefinitionsBuilder(G_ABS)
1141 .legalFor(HasCSSC, {s32, s64})
1142 .legalFor(PackedVectorAllTypeList)
1143 .customIf([=](const LegalityQuery &Q) {
1144 // TODO: Fix suboptimal codegen for 128+ bit types.
1145 LLT SrcTy = Q.Types[0];
1146 return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
1148 .widenScalarIf(
1149 [=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
1150 [=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
1151 .widenScalarIf(
1152 [=](const LegalityQuery &Query) { return Query.Types[0] == v2s16; },
1153 [=](const LegalityQuery &Query) { return std::make_pair(0, v2s32); })
1154 .clampNumElements(0, v8s8, v16s8)
1155 .clampNumElements(0, v4s16, v8s16)
1156 .clampNumElements(0, v2s32, v4s32)
1157 .clampNumElements(0, v2s64, v2s64)
1158 .moreElementsToNextPow2(0)
1159 .lower();
1161 // For fadd reductions we have pairwise operations available. We treat the
1162 // usual legal types as legal and handle the lowering to pairwise instructions
1163 // later.
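// (For example, a v4s32 fadd reduction can be lowered to a vector FADDP
// followed by a scalar FADDP of the remaining pair.)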
1164 getActionDefinitionsBuilder(G_VECREDUCE_FADD)
1165 .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
1166 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1167 .minScalarOrElt(0, MinFPScalar)
1168 .clampMaxNumElements(1, s64, 2)
1169 .clampMaxNumElements(1, s32, 4)
1170 .clampMaxNumElements(1, s16, 8)
1171 .lower();
1173 // For fmul reductions we need to split up into individual operations. We
1174 // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of
1175 // smaller types, followed by scalarizing what remains.
1176 getActionDefinitionsBuilder(G_VECREDUCE_FMUL)
1177 .minScalarOrElt(0, MinFPScalar)
1178 .clampMaxNumElements(1, s64, 2)
1179 .clampMaxNumElements(1, s32, 4)
1180 .clampMaxNumElements(1, s16, 8)
1181 .clampMaxNumElements(1, s32, 2)
1182 .clampMaxNumElements(1, s16, 4)
1183 .scalarize(1)
1184 .lower();
1186 getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL})
1187 .scalarize(2)
1188 .lower();
1190 getActionDefinitionsBuilder(G_VECREDUCE_ADD)
1191 .legalFor({{s8, v16s8},
1192 {s8, v8s8},
1193 {s16, v8s16},
1194 {s16, v4s16},
1195 {s32, v4s32},
1196 {s32, v2s32},
1197 {s64, v2s64}})
1198 .clampMaxNumElements(1, s64, 2)
1199 .clampMaxNumElements(1, s32, 4)
1200 .clampMaxNumElements(1, s16, 8)
1201 .clampMaxNumElements(1, s8, 16)
1202 .lower();
1204 getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
1205 G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
1206 .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}})
1207 .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}})
1208 .minScalarOrElt(0, MinFPScalar)
1209 .clampMaxNumElements(1, s64, 2)
1210 .clampMaxNumElements(1, s32, 4)
1211 .clampMaxNumElements(1, s16, 8)
1212 .lower();
1214 getActionDefinitionsBuilder(G_VECREDUCE_MUL)
1215 .clampMaxNumElements(1, s32, 2)
1216 .clampMaxNumElements(1, s16, 4)
1217 .clampMaxNumElements(1, s8, 8)
1218 .scalarize(1)
1219 .lower();
1221 getActionDefinitionsBuilder(
1222 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX})
1223 .legalFor({{s8, v8s8},
1224 {s8, v16s8},
1225 {s16, v4s16},
1226 {s16, v8s16},
1227 {s32, v2s32},
1228 {s32, v4s32}})
1229 .moreElementsIf(
1230 [=](const LegalityQuery &Query) {
1231 return Query.Types[1].isVector() &&
1232 Query.Types[1].getElementType() != s8 &&
1233 Query.Types[1].getNumElements() & 1;
1235 LegalizeMutations::moreElementsToNextPow2(1))
1236 .clampMaxNumElements(1, s64, 2)
1237 .clampMaxNumElements(1, s32, 4)
1238 .clampMaxNumElements(1, s16, 8)
1239 .clampMaxNumElements(1, s8, 16)
1240 .scalarize(1)
1241 .lower();
1243 getActionDefinitionsBuilder(
1244 {G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
1245 // Try to break down into smaller vectors as long as they're at least 64
1246 // bits. This lets us use vector operations for some parts of the
1247 // reduction.
1248 .fewerElementsIf(
1249 [=](const LegalityQuery &Q) {
1250 LLT SrcTy = Q.Types[1];
1251 if (SrcTy.isScalar())
1252 return false;
1253 if (!isPowerOf2_32(SrcTy.getNumElements()))
1254 return false;
1255 // We can usually perform 64b vector operations.
1256 return SrcTy.getSizeInBits() > 64;
1258 [=](const LegalityQuery &Q) {
1259 LLT SrcTy = Q.Types[1];
1260 return std::make_pair(1, SrcTy.divide(2));
1262 .scalarize(1)
1263 .lower();
1265 // TODO: Update this to correct handling when adding AArch64/SVE support.
1266 getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower();
1268 getActionDefinitionsBuilder({G_FSHL, G_FSHR})
1269 .customFor({{s32, s32}, {s32, s64}, {s64, s64}})
1270 .lower();
1272 getActionDefinitionsBuilder(G_ROTR)
1273 .legalFor({{s32, s64}, {s64, s64}})
1274 .customIf([=](const LegalityQuery &Q) {
1275 return Q.Types[0].isScalar() && Q.Types[1].getScalarSizeInBits() < 64;
1277 .lower();
1278 getActionDefinitionsBuilder(G_ROTL).lower();
1280 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1281 .customFor({{s32, s32}, {s64, s64}});
1283 auto always = [=](const LegalityQuery &Q) { return true; };
1284 getActionDefinitionsBuilder(G_CTPOP)
1285 .legalFor(HasCSSC, {{s32, s32}, {s64, s64}})
1286 .legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
1287 .customFor(!HasCSSC, {{s32, s32}, {s64, s64}})
1288 .customFor({{s128, s128},
1289 {v2s64, v2s64},
1290 {v2s32, v2s32},
1291 {v4s32, v4s32},
1292 {v4s16, v4s16},
1293 {v8s16, v8s16}})
1294 .clampScalar(0, s32, s128)
1295 .widenScalarToNextPow2(0)
1296 .minScalarEltSameAsIf(always, 1, 0)
1297 .maxScalarEltSameAsIf(always, 1, 0);
1299 getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
1300 .legalFor({v2s64, v2s32, v4s32, v4s16, v8s16, v8s8, v16s8})
1301 .legalFor(HasSVE, {nxv2s64, nxv4s32, nxv8s16, nxv16s8})
1302 .clampNumElements(0, v8s8, v16s8)
1303 .clampNumElements(0, v4s16, v8s16)
1304 .clampNumElements(0, v2s32, v4s32)
1305 .clampMaxNumElements(0, s64, 2)
1306 .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
1307 .moreElementsToNextPow2(0)
1308 .lower();
1310 // TODO: Libcall support for s128.
1311 // TODO: s16 should be legal with full FP16 support.
1312 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1313 .legalFor({{s64, s32}, {s64, s64}});
1315 // TODO: Custom legalization for mismatched types.
1316 getActionDefinitionsBuilder(G_FCOPYSIGN)
1317 .moreElementsIf(
1318 [](const LegalityQuery &Query) { return Query.Types[0].isScalar(); },
1319 [=](const LegalityQuery &Query) {
1320 const LLT Ty = Query.Types[0];
1321 return std::pair(0, LLT::fixed_vector(Ty == s16 ? 4 : 2, Ty));
1323 .lower();
1325 getActionDefinitionsBuilder(G_FMAD).lower();
1327 // Access to floating-point environment.
1328 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV, G_RESET_FPENV,
1329 G_GET_FPMODE, G_SET_FPMODE, G_RESET_FPMODE})
1330 .libcall();
1332 getActionDefinitionsBuilder(G_IS_FPCLASS).lower();
1334 getActionDefinitionsBuilder(G_PREFETCH).custom();
1336 getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
1338 getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
1339 .legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
1340 .widenScalarOrEltToNextPow2(0)
1341 .immIdx(0); // Inform verifier imm idx 0 is handled.
1343 // TODO: {nxv16s8, s8}, {nxv8s16, s16}
1344 getActionDefinitionsBuilder(G_SPLAT_VECTOR)
1345 .legalFor(HasSVE, {{nxv4s32, s32}, {nxv2s64, s64}});
1347 getLegacyLegalizerInfo().computeTables();
1348 verify(*ST.getInstrInfo());
1349 }
1351 bool AArch64LegalizerInfo::legalizeCustom(
1352 LegalizerHelper &Helper, MachineInstr &MI,
1353 LostDebugLocObserver &LocObserver) const {
1354 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1355 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
1356 GISelChangeObserver &Observer = Helper.Observer;
1357 switch (MI.getOpcode()) {
1358 default:
1359 // No idea what to do.
1360 return false;
1361 case TargetOpcode::G_VAARG:
1362 return legalizeVaArg(MI, MRI, MIRBuilder);
1363 case TargetOpcode::G_LOAD:
1364 case TargetOpcode::G_STORE:
1365 return legalizeLoadStore(MI, MRI, MIRBuilder, Observer);
1366 case TargetOpcode::G_SHL:
1367 case TargetOpcode::G_ASHR:
1368 case TargetOpcode::G_LSHR:
1369 return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
1370 case TargetOpcode::G_GLOBAL_VALUE:
1371 return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
1372 case TargetOpcode::G_SBFX:
1373 case TargetOpcode::G_UBFX:
1374 return legalizeBitfieldExtract(MI, MRI, Helper);
1375 case TargetOpcode::G_FSHL:
1376 case TargetOpcode::G_FSHR:
1377 return legalizeFunnelShift(MI, MRI, MIRBuilder, Observer, Helper);
1378 case TargetOpcode::G_ROTR:
1379 return legalizeRotate(MI, MRI, Helper);
1380 case TargetOpcode::G_CTPOP:
1381 return legalizeCTPOP(MI, MRI, Helper);
1382 case TargetOpcode::G_ATOMIC_CMPXCHG:
1383 return legalizeAtomicCmpxchg128(MI, MRI, Helper);
1384 case TargetOpcode::G_CTTZ:
1385 return legalizeCTTZ(MI, Helper);
1386 case TargetOpcode::G_BZERO:
1387 case TargetOpcode::G_MEMCPY:
1388 case TargetOpcode::G_MEMMOVE:
1389 case TargetOpcode::G_MEMSET:
1390 return legalizeMemOps(MI, Helper);
1391 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1392 return legalizeExtractVectorElt(MI, MRI, Helper);
1393 case TargetOpcode::G_DYN_STACKALLOC:
1394 return legalizeDynStackAlloc(MI, Helper);
1395 case TargetOpcode::G_PREFETCH:
1396 return legalizePrefetch(MI, Helper);
1397 case TargetOpcode::G_ABS:
1398 return Helper.lowerAbsToCNeg(MI);
1399 case TargetOpcode::G_ICMP:
1400 return legalizeICMP(MI, MRI, MIRBuilder);
1401 }
1403 llvm_unreachable("expected switch to return");
1404 }
1406 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
1407 MachineRegisterInfo &MRI,
1408 MachineIRBuilder &MIRBuilder,
1409 GISelChangeObserver &Observer,
1410 LegalizerHelper &Helper) const {
1411 assert(MI.getOpcode() == TargetOpcode::G_FSHL ||
1412 MI.getOpcode() == TargetOpcode::G_FSHR);
1414 // Keep as G_FSHR if shift amount is a G_CONSTANT, else use generic
1415 // lowering
1416 Register ShiftNo = MI.getOperand(3).getReg();
1417 LLT ShiftTy = MRI.getType(ShiftNo);
1418 auto VRegAndVal = getIConstantVRegValWithLookThrough(ShiftNo, MRI);
1420 // Adjust shift amount according to Opcode (FSHL/FSHR)
1421 // Convert FSHL to FSHR
1422 LLT OperationTy = MRI.getType(MI.getOperand(0).getReg());
1423 APInt BitWidth(ShiftTy.getSizeInBits(), OperationTy.getSizeInBits(), false);
1425 // Lower non-constant shifts and leave zero shifts to the optimizer.
1426 if (!VRegAndVal || VRegAndVal->Value.urem(BitWidth) == 0)
1427 return (Helper.lowerFunnelShiftAsShifts(MI) ==
1428 LegalizerHelper::LegalizeResult::Legalized);
1430 APInt Amount = VRegAndVal->Value.urem(BitWidth);
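// A left funnel shift by Amount is equivalent to a right funnel shift by
// (BitWidth - Amount); e.g. for s32, fshl(a, b, 8) == fshr(a, b, 24).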
1432 Amount = MI.getOpcode() == TargetOpcode::G_FSHL ? BitWidth - Amount : Amount;
1434 // If the instruction is a G_FSHR whose shift amount is a 64-bit G_CONSTANT
1435 // in the range [0, BitWidth), it is already legal
1436 if (ShiftTy.getSizeInBits() == 64 && MI.getOpcode() == TargetOpcode::G_FSHR &&
1437 VRegAndVal->Value.ult(BitWidth))
1438 return true;
1440 // Materialize the adjusted shift amount as a 64-bit constant
1441 auto Cast64 = MIRBuilder.buildConstant(LLT::scalar(64), Amount.zext(64));
1443 if (MI.getOpcode() == TargetOpcode::G_FSHR) {
1444 Observer.changingInstr(MI);
1445 MI.getOperand(3).setReg(Cast64.getReg(0));
1446 Observer.changedInstr(MI);
1447 }
1448 // If Opcode is FSHL, remove the FSHL instruction and create a FSHR
1449 // instruction
1450 else if (MI.getOpcode() == TargetOpcode::G_FSHL) {
1451 MIRBuilder.buildInstr(TargetOpcode::G_FSHR, {MI.getOperand(0).getReg()},
1452 {MI.getOperand(1).getReg(), MI.getOperand(2).getReg(),
1453 Cast64.getReg(0)});
1454 MI.eraseFromParent();
1455 }
1456 return true;
1457 }
1459 bool AArch64LegalizerInfo::legalizeICMP(MachineInstr &MI,
1460 MachineRegisterInfo &MRI,
1461 MachineIRBuilder &MIRBuilder) const {
1462 Register DstReg = MI.getOperand(0).getReg();
1463 Register SrcReg1 = MI.getOperand(2).getReg();
1464 Register SrcReg2 = MI.getOperand(3).getReg();
1465 LLT DstTy = MRI.getType(DstReg);
1466 LLT SrcTy = MRI.getType(SrcReg1);
1468 // Check the vector types are legal
1469 if (DstTy.getScalarSizeInBits() != SrcTy.getScalarSizeInBits() ||
1470 DstTy.getNumElements() != SrcTy.getNumElements() ||
1471 (DstTy.getSizeInBits() != 64 && DstTy.getSizeInBits() != 128))
1472 return false;
1474 // Lowers G_ICMP NE => G_ICMP EQ to allow better pattern matching for
1475 // following passes
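// For example, a vector (icmp ne x, y) becomes not(icmp eq x, y), which maps
// naturally onto CMEQ + NOT/MVN during selection.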
1476 CmpInst::Predicate Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
1477 if (Pred != CmpInst::ICMP_NE)
1478 return true;
1479 Register CmpReg =
1480 MIRBuilder
1481 .buildICmp(CmpInst::ICMP_EQ, MRI.getType(DstReg), SrcReg1, SrcReg2)
1482 .getReg(0);
1483 MIRBuilder.buildNot(DstReg, CmpReg);
1485 MI.eraseFromParent();
1486 return true;
1487 }
1489 bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
1490 MachineRegisterInfo &MRI,
1491 LegalizerHelper &Helper) const {
1492 // To allow for imported patterns to match, we ensure that the rotate amount
1493 // is 64b with an extension.
1494 Register AmtReg = MI.getOperand(2).getReg();
1495 LLT AmtTy = MRI.getType(AmtReg);
1496 (void)AmtTy;
1497 assert(AmtTy.isScalar() && "Expected a scalar rotate");
1498 assert(AmtTy.getSizeInBits() < 64 && "Expected this rotate to be legal");
1499 auto NewAmt = Helper.MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
1500 Helper.Observer.changingInstr(MI);
1501 MI.getOperand(2).setReg(NewAmt.getReg(0));
1502 Helper.Observer.changedInstr(MI);
1503 return true;
1504 }
1506 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
1507 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1508 GISelChangeObserver &Observer) const {
1509 assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
1510 // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
1511 // G_ADD_LOW instructions.
1512 // By splitting this here, we can optimize accesses in the small code model by
1513 // folding in the G_ADD_LOW into the load/store offset.
1514 auto &GlobalOp = MI.getOperand(1);
1515 // Don't modify an intrinsic call.
1516 if (GlobalOp.isSymbol())
1517 return true;
1518 const auto* GV = GlobalOp.getGlobal();
1519 if (GV->isThreadLocal())
1520 return true; // Don't want to modify TLS vars.
1522 auto &TM = ST->getTargetLowering()->getTargetMachine();
1523 unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
1525 if (OpFlags & AArch64II::MO_GOT)
1526 return true;
1528 auto Offset = GlobalOp.getOffset();
1529 Register DstReg = MI.getOperand(0).getReg();
1530 auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
1531 .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE);
1532 // Set the regclass on the dest reg too.
1533 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1535 // MO_TAGGED on the page indicates a tagged address. Set the tag now. We do so
1536 // by creating a MOVK that sets bits 48-63 of the register to (global address
1537 // + 0x100000000 - PC) >> 48. The additional 0x100000000 offset here is to
1538 // prevent an incorrect tag being generated during relocation when the
1539 // global appears before the code section. Without the offset, a global at
1540 // `0x0f00'0000'0000'1000` (i.e. at `0x1000` with tag `0xf`) that's referenced
1541 // by code at `0x2000` would result in `0x0f00'0000'0000'1000 - 0x2000 =
1542 // 0x0eff'ffff'ffff'f000`, meaning the tag would be incorrectly set to `0xe`
1543 // instead of `0xf`.
1544 // This assumes that we're in the small code model so we can assume a binary
1545 // size of <= 4GB, which makes the untagged PC relative offset positive. The
1546 // binary must also be loaded into address range [0, 2^48). Both of these
1547 // properties need to be ensured at runtime when using tagged addresses.
1548 if (OpFlags & AArch64II::MO_TAGGED) {
1549 assert(!Offset &&
1550 "Should not have folded in an offset for a tagged global!");
1551 ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP})
1552 .addGlobalAddress(GV, 0x100000000,
1553 AArch64II::MO_PREL | AArch64II::MO_G3)
1554 .addImm(48);
1555 MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
1556 }
1558 MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
1559 .addGlobalAddress(GV, Offset,
1560 OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
1561 MI.eraseFromParent();
1562 return true;
1563 }
1565 bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
1566 MachineInstr &MI) const {
1567 auto LowerBinOp = [&MI](unsigned Opcode) {
1568 MachineIRBuilder MIB(MI);
1569 MIB.buildInstr(Opcode, {MI.getOperand(0)},
1570 {MI.getOperand(2), MI.getOperand(3)});
1571 MI.eraseFromParent();
1572 return true;
1573 };
1575 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
1576 switch (IntrinsicID) {
1577 case Intrinsic::vacopy: {
1578 unsigned PtrSize = ST->isTargetILP32() ? 4 : 8;
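// On Darwin and Windows va_list is a single pointer; otherwise it is the
// AAPCS64 struct of three pointers plus two ints (32 bytes, 20 under ILP32).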
1579 unsigned VaListSize =
1580 (ST->isTargetDarwin() || ST->isTargetWindows())
1581 ? PtrSize
1582 : ST->isTargetILP32() ? 20 : 32;
1584 MachineFunction &MF = *MI.getMF();
1585 auto Val = MF.getRegInfo().createGenericVirtualRegister(
1586 LLT::scalar(VaListSize * 8));
1587 MachineIRBuilder MIB(MI);
1588 MIB.buildLoad(Val, MI.getOperand(2),
1589 *MF.getMachineMemOperand(MachinePointerInfo(),
1590 MachineMemOperand::MOLoad,
1591 VaListSize, Align(PtrSize)));
1592 MIB.buildStore(Val, MI.getOperand(1),
1593 *MF.getMachineMemOperand(MachinePointerInfo(),
1594 MachineMemOperand::MOStore,
1595 VaListSize, Align(PtrSize)));
1596 MI.eraseFromParent();
1597 return true;
1598 }
1599 case Intrinsic::get_dynamic_area_offset: {
1600 MachineIRBuilder &MIB = Helper.MIRBuilder;
1601 MIB.buildConstant(MI.getOperand(0).getReg(), 0);
1602 MI.eraseFromParent();
1603 return true;
1604 }
1605 case Intrinsic::aarch64_mops_memset_tag: {
1606 assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1607 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
1608 // the instruction).
1609 MachineIRBuilder MIB(MI);
1610 auto &Value = MI.getOperand(3);
1611 Register ExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
1612 Value.setReg(ExtValueReg);
1613 return true;
1614 }
1615 case Intrinsic::aarch64_prefetch: {
1616 MachineIRBuilder MIB(MI);
1617 auto &AddrVal = MI.getOperand(1);
1619 int64_t IsWrite = MI.getOperand(2).getImm();
1620 int64_t Target = MI.getOperand(3).getImm();
1621 int64_t IsStream = MI.getOperand(4).getImm();
1622 int64_t IsData = MI.getOperand(5).getImm();
1624 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
1625 (!IsData << 3) | // IsDataCache bit
1626 (Target << 1) | // Cache level bits
1627 (unsigned)IsStream; // Stream bit
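// Worked example (editorial, illustrative only): IsWrite=0, Target=1 (L2),
// IsStream=0, IsData=1 gives PrfOp = (0<<4) | (0<<3) | (1<<1) | 0 = 0b00010,
// i.e. the PLDL2KEEP prefetch operation.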
1629 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
1630 MI.eraseFromParent();
1631 return true;
1632 }
1633 case Intrinsic::aarch64_neon_uaddv:
1634 case Intrinsic::aarch64_neon_saddv:
1635 case Intrinsic::aarch64_neon_umaxv:
1636 case Intrinsic::aarch64_neon_smaxv:
1637 case Intrinsic::aarch64_neon_uminv:
1638 case Intrinsic::aarch64_neon_sminv: {
1639 MachineIRBuilder MIB(MI);
1640 MachineRegisterInfo &MRI = *MIB.getMRI();
1641 bool IsSigned = IntrinsicID == Intrinsic::aarch64_neon_saddv ||
1642 IntrinsicID == Intrinsic::aarch64_neon_smaxv ||
1643 IntrinsicID == Intrinsic::aarch64_neon_sminv;
1645 auto OldDst = MI.getOperand(0).getReg();
1646 auto OldDstTy = MRI.getType(OldDst);
1647 LLT NewDstTy = MRI.getType(MI.getOperand(2).getReg()).getElementType();
1648 if (OldDstTy == NewDstTy)
1649 return true;
1651 auto NewDst = MRI.createGenericVirtualRegister(NewDstTy);
1653 Helper.Observer.changingInstr(MI);
1654 MI.getOperand(0).setReg(NewDst);
1655 Helper.Observer.changedInstr(MI);
1657 MIB.setInsertPt(MIB.getMBB(), ++MIB.getInsertPt());
1658 MIB.buildExtOrTrunc(IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT,
1659 OldDst, NewDst);
1661 return true;
1662 }
1663 case Intrinsic::aarch64_neon_uaddlp:
1664 case Intrinsic::aarch64_neon_saddlp: {
1665 MachineIRBuilder MIB(MI);
1667 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
1668 ? AArch64::G_UADDLP
1669 : AArch64::G_SADDLP;
1670 MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
1671 MI.eraseFromParent();
1673 return true;
1674 }
1675 case Intrinsic::aarch64_neon_uaddlv:
1676 case Intrinsic::aarch64_neon_saddlv: {
1677 MachineIRBuilder MIB(MI);
1678 MachineRegisterInfo &MRI = *MIB.getMRI();
1680 unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
1681 ? AArch64::G_UADDLV
1682 : AArch64::G_SADDLV;
1683 Register DstReg = MI.getOperand(0).getReg();
1684 Register SrcReg = MI.getOperand(2).getReg();
1685 LLT DstTy = MRI.getType(DstReg);
1687 LLT MidTy, ExtTy;
1688 if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) {
1689 MidTy = LLT::fixed_vector(4, 32);
1690 ExtTy = LLT::scalar(32);
1691 } else {
1692 MidTy = LLT::fixed_vector(2, 64);
1693 ExtTy = LLT::scalar(64);
1694 }
1696 Register MidReg =
1697 MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg();
1698 Register ZeroReg =
1699 MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg();
1700 Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy},
1701 {MidReg, ZeroReg})
1702 .getReg(0);
1704 if (DstTy.getScalarSizeInBits() < 32)
1705 MIB.buildTrunc(DstReg, ExtReg);
1706 else
1707 MIB.buildCopy(DstReg, ExtReg);
1709 MI.eraseFromParent();
1711 return true;
1712 }
1713 case Intrinsic::aarch64_neon_smax:
1714 return LowerBinOp(TargetOpcode::G_SMAX);
1715 case Intrinsic::aarch64_neon_smin:
1716 return LowerBinOp(TargetOpcode::G_SMIN);
1717 case Intrinsic::aarch64_neon_umax:
1718 return LowerBinOp(TargetOpcode::G_UMAX);
1719 case Intrinsic::aarch64_neon_umin:
1720 return LowerBinOp(TargetOpcode::G_UMIN);
1721 case Intrinsic::aarch64_neon_fmax:
1722 return LowerBinOp(TargetOpcode::G_FMAXIMUM);
1723 case Intrinsic::aarch64_neon_fmin:
1724 return LowerBinOp(TargetOpcode::G_FMINIMUM);
1725 case Intrinsic::aarch64_neon_fmaxnm:
1726 return LowerBinOp(TargetOpcode::G_FMAXNUM);
1727 case Intrinsic::aarch64_neon_fminnm:
1728 return LowerBinOp(TargetOpcode::G_FMINNUM);
1729 case Intrinsic::aarch64_neon_smull:
1730 return LowerBinOp(AArch64::G_SMULL);
1731 case Intrinsic::aarch64_neon_umull:
1732 return LowerBinOp(AArch64::G_UMULL);
1733 case Intrinsic::aarch64_neon_abs: {
1734 // Lower the intrinsic to G_ABS.
1735 MachineIRBuilder MIB(MI);
1736 MIB.buildInstr(TargetOpcode::G_ABS, {MI.getOperand(0)}, {MI.getOperand(2)});
1737 MI.eraseFromParent();
1738 return true;
1739 }
1741 case Intrinsic::vector_reverse:
1742 // TODO: Add support for vector_reverse
1743 return false;
1744 }
1746 return true;
1747 }
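// Illustrative MIR (editorial sketch, not from the source) of the transform
// performed by legalizeShlAshrLshr below for a shift by a small constant:
//   %amt:_(s32) = G_CONSTANT i32 5
//   %res:_(s32) = G_SHL %x:_(s32), %amt:_(s32)
// becomes
//   %amt64:_(s64) = G_CONSTANT i64 5
//   %res:_(s32) = G_SHL %x:_(s32), %amt64:_(s64)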
1749 bool AArch64LegalizerInfo::legalizeShlAshrLshr(
1750 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1751 GISelChangeObserver &Observer) const {
1752 assert(MI.getOpcode() == TargetOpcode::G_ASHR ||
1753 MI.getOpcode() == TargetOpcode::G_LSHR ||
1754 MI.getOpcode() == TargetOpcode::G_SHL);
1755 // If the shift amount is a G_CONSTANT, promote it to a 64 bit type so the
1756 // imported patterns can select it later. Either way, it will be legal.
1757 Register AmtReg = MI.getOperand(2).getReg();
1758 auto VRegAndVal = getIConstantVRegValWithLookThrough(AmtReg, MRI);
1759 if (!VRegAndVal)
1760 return true;
1761 // Check the shift amount is in range for an immediate form.
1762 int64_t Amount = VRegAndVal->Value.getSExtValue();
1763 if (Amount > 31)
1764 return true; // This will have to remain a register variant.
1765 auto ExtCst = MIRBuilder.buildConstant(LLT::scalar(64), Amount);
1766 Observer.changingInstr(MI);
1767 MI.getOperand(2).setReg(ExtCst.getReg(0));
1768 Observer.changedInstr(MI);
1769 return true;
1770 }
1772 static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
1773 MachineRegisterInfo &MRI) {
1774 Base = Root;
1775 Offset = 0;
1777 Register NewBase;
1778 int64_t NewOffset;
1779 if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
1780 isShiftedInt<7, 3>(NewOffset)) {
1781 Base = NewBase;
1782 Offset = NewOffset;
1783 }
1784 }
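// Editorial note (illustrative): isShiftedInt<7, 3> accepts exactly the
// offsets representable by LDP/STP's scaled signed 7-bit immediate, i.e.
// multiples of 8 in the range [-512, 504].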
1786 // FIXME: This should be removed and replaced with the generic bitcast legalize
1787 // action.
1788 bool AArch64LegalizerInfo::legalizeLoadStore(
1789 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
1790 GISelChangeObserver &Observer) const {
1791 assert(MI.getOpcode() == TargetOpcode::G_STORE ||
1792 MI.getOpcode() == TargetOpcode::G_LOAD);
1793 // Here we just try to handle vector loads/stores where our value type might
1794 // have pointer elements, which the SelectionDAG importer can't handle. To
1795 // allow the existing patterns for s64 to fire for p0, we just try to bitcast
1796 // the value to use s64 types.
1798 // Custom legalization requires that the instruction, if not deleted, be fully
1799 // legalized. To allow further legalization of the instruction, we create a
1800 // new instruction and erase the existing one.
1802 Register ValReg = MI.getOperand(0).getReg();
1803 const LLT ValTy = MRI.getType(ValReg);
1805 if (ValTy == LLT::scalar(128)) {
1807 AtomicOrdering Ordering = (*MI.memoperands_begin())->getSuccessOrdering();
1808 bool IsLoad = MI.getOpcode() == TargetOpcode::G_LOAD;
1809 bool IsLoadAcquire = IsLoad && Ordering == AtomicOrdering::Acquire;
1810 bool IsStoreRelease = !IsLoad && Ordering == AtomicOrdering::Release;
1811 bool IsRcpC3 =
1812 ST->hasLSE2() && ST->hasRCPC3() && (IsLoadAcquire || IsStoreRelease);
1814 LLT s64 = LLT::scalar(64);
1816 unsigned Opcode;
1817 if (IsRcpC3) {
1818 Opcode = IsLoad ? AArch64::LDIAPPX : AArch64::STILPX;
1819 } else {
1820 // For LSE2, loads/stores should have been converted to monotonic and had
1821 // a fence inserted after them.
1822 assert(Ordering == AtomicOrdering::Monotonic ||
1823 Ordering == AtomicOrdering::Unordered);
1824 assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
1826 Opcode = IsLoad ? AArch64::LDPXi : AArch64::STPXi;
1827 }
1829 MachineInstrBuilder NewI;
1830 if (IsLoad) {
1831 NewI = MIRBuilder.buildInstr(Opcode, {s64, s64}, {});
1832 MIRBuilder.buildMergeLikeInstr(
1833 ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
1834 } else {
1835 auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
1836 NewI = MIRBuilder.buildInstr(
1837 Opcode, {}, {Split->getOperand(0), Split->getOperand(1)});
1838 }
1840 if (IsRcpC3) {
1841 NewI.addUse(MI.getOperand(1).getReg());
1842 } else {
1843 Register Base;
1844 int Offset;
1845 matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
1846 NewI.addUse(Base);
1847 NewI.addImm(Offset / 8);
1848 }
1850 NewI.cloneMemRefs(MI);
1851 constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
1852 *MRI.getTargetRegisterInfo(),
1853 *ST->getRegBankInfo());
1854 MI.eraseFromParent();
1855 return true;
1856 }
1858 if (!ValTy.isPointerVector() ||
1859 ValTy.getElementType().getAddressSpace() != 0) {
1860 LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
1861 return false;
1862 }
1864 unsigned PtrSize = ValTy.getElementType().getSizeInBits();
1865 const LLT NewTy = LLT::vector(ValTy.getElementCount(), PtrSize);
1866 auto &MMO = **MI.memoperands_begin();
1867 MMO.setType(NewTy);
1869 if (MI.getOpcode() == TargetOpcode::G_STORE) {
1870 auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
1871 MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
1872 } else {
1873 auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
1874 MIRBuilder.buildBitcast(ValReg, NewLoad);
1875 }
1876 MI.eraseFromParent();
1877 return true;
1878 }
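// Editorial sketch (assumed, not part of the source) of what legalizeVaArg
// below emits for the single-pointer va_list ABIs, in C-like pseudocode:
//   char *p = *(char **)list;
//   if (align > 8) p = (char *)(((uintptr_t)p + align - 1) & -(uintptr_t)align);
//   result = *(T *)p;
//   *(char **)list = p + alignTo(sizeof(T), 8);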
1880 bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
1881 MachineRegisterInfo &MRI,
1882 MachineIRBuilder &MIRBuilder) const {
1883 MachineFunction &MF = MIRBuilder.getMF();
1884 Align Alignment(MI.getOperand(2).getImm());
1885 Register Dst = MI.getOperand(0).getReg();
1886 Register ListPtr = MI.getOperand(1).getReg();
1888 LLT PtrTy = MRI.getType(ListPtr);
1889 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1891 const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
1892 const Align PtrAlign = Align(PtrSize);
1893 auto List = MIRBuilder.buildLoad(
1894 PtrTy, ListPtr,
1895 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1896 PtrTy, PtrAlign));
1898 MachineInstrBuilder DstPtr;
1899 if (Alignment > PtrAlign) {
1900 // Realign the list to the actual required alignment.
1901 auto AlignMinus1 =
1902 MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
1903 auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
1904 DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
1905 } else
1906 DstPtr = List;
1908 LLT ValTy = MRI.getType(Dst);
1909 uint64_t ValSize = ValTy.getSizeInBits() / 8;
1910 MIRBuilder.buildLoad(
1911 Dst, DstPtr,
1912 *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
1913 ValTy, std::max(Alignment, PtrAlign)));
1915 auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
1917 auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
1919 MIRBuilder.buildStore(NewList, ListPtr,
1920 *MF.getMachineMemOperand(MachinePointerInfo(),
1921 MachineMemOperand::MOStore,
1922 PtrTy, PtrAlign));
1924 MI.eraseFromParent();
1925 return true;
1926 }
1928 bool AArch64LegalizerInfo::legalizeBitfieldExtract(
1929 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
1930 // Only legal if we can select immediate forms.
1931 // TODO: Lower this otherwise.
1932 return getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
1933 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
1934 }
1936 bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
1937 MachineRegisterInfo &MRI,
1938 LegalizerHelper &Helper) const {
1939 // When there is no integer popcount instruction (FEAT_CSSC isn't available),
1940 // it can be more efficiently lowered to the following sequence that uses
1941 // AdvSIMD registers/instructions as long as the copies to/from the AdvSIMD
1942 // registers are cheap.
1943 // FMOV D0, X0 // copy 64-bit int to vector, high bits zeroed
1944 // CNT V0.8B, V0.8B // 8xbyte pop-counts
1945 // ADDV B0, V0.8B // sum 8xbyte pop-counts
1946 // UMOV X0, V0.B[0] // copy byte result back to integer reg
1948 // For 128 bit vector popcounts, we lower to the following sequence:
1949 // cnt.16b v0, v0 // v8s16, v4s32, v2s64
1950 // uaddlp.8h v0, v0 // v8s16, v4s32, v2s64
1951 // uaddlp.4s v0, v0 // v4s32, v2s64
1952 // uaddlp.2d v0, v0 // v2s64
1954 // For 64 bit vector popcounts, we lower to the following sequence:
1955 // cnt.8b v0, v0 // v4s16, v2s32
1956 // uaddlp.4h v0, v0 // v4s16, v2s32
1957 // uaddlp.2s v0, v0 // v2s32
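// Rough MIR shape (editorial sketch, not from the source) for a scalar s32
// G_CTPOP taking this path:
//   %ext:_(s64)      = G_ZEXT %val(s32)
//   %vec:_(<8 x s8>) = G_BITCAST %ext(s64)
//   %cnt:_(<8 x s8>) = G_CTPOP %vec
//   %dst:_(s32)      = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %cnt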
1959 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
1960 Register Dst = MI.getOperand(0).getReg();
1961 Register Val = MI.getOperand(1).getReg();
1962 LLT Ty = MRI.getType(Val);
1963 unsigned Size = Ty.getSizeInBits();
1965 assert(Ty == MRI.getType(Dst) &&
1966 "Expected src and dst to have the same type!");
1968 if (ST->hasCSSC() && Ty.isScalar() && Size == 128) {
1969 LLT s64 = LLT::scalar(64);
1971 auto Split = MIRBuilder.buildUnmerge(s64, Val);
1972 auto CTPOP1 = MIRBuilder.buildCTPOP(s64, Split->getOperand(0));
1973 auto CTPOP2 = MIRBuilder.buildCTPOP(s64, Split->getOperand(1));
1974 auto Add = MIRBuilder.buildAdd(s64, CTPOP1, CTPOP2);
1976 MIRBuilder.buildZExt(Dst, Add);
1977 MI.eraseFromParent();
1978 return true;
1979 }
1981 if (!ST->hasNEON() ||
1982 MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) {
1983 // Use generic lowering when custom lowering is not possible.
1984 return Ty.isScalar() && (Size == 32 || Size == 64) &&
1985 Helper.lowerBitCount(MI) ==
1986 LegalizerHelper::LegalizeResult::Legalized;
1987 }
1989 // Pre-conditioning: widen Val up to the nearest vector type.
1990 // s32,s64,v4s16,v2s32 -> v8i8
1991 // v8s16,v4s32,v2s64 -> v16i8
1992 LLT VTy = Size == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
1993 if (Ty.isScalar()) {
1994 assert((Size == 32 || Size == 64 || Size == 128) && "Expected only 32, 64, or 128 bit scalars!");
1995 if (Size == 32) {
1996 Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0);
1997 }
1998 }
1999 Val = MIRBuilder.buildBitcast(VTy, Val).getReg(0);
2001 // Count bits in each byte-sized lane.
2002 auto CTPOP = MIRBuilder.buildCTPOP(VTy, Val);
2004 // Sum across lanes.
2006 if (ST->hasDotProd() && Ty.isVector() && Ty.getNumElements() >= 2 &&
2007 Ty.getScalarSizeInBits() != 16) {
2008 LLT Dt = Ty == LLT::fixed_vector(2, 64) ? LLT::fixed_vector(4, 32) : Ty;
2009 auto Zeros = MIRBuilder.buildConstant(Dt, 0);
2010 auto Ones = MIRBuilder.buildConstant(VTy, 1);
2011 MachineInstrBuilder Sum;
2013 if (Ty == LLT::fixed_vector(2, 64)) {
2014 auto UDOT =
2015 MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2016 Sum = MIRBuilder.buildInstr(AArch64::G_UADDLP, {Ty}, {UDOT});
2017 } else if (Ty == LLT::fixed_vector(4, 32)) {
2018 Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2019 } else if (Ty == LLT::fixed_vector(2, 32)) {
2020 Sum = MIRBuilder.buildInstr(AArch64::G_UDOT, {Dt}, {Zeros, Ones, CTPOP});
2021 } else {
2022 llvm_unreachable("unexpected vector shape");
2023 }
2025 Sum->getOperand(0).setReg(Dst);
2026 MI.eraseFromParent();
2027 return true;
2028 }
2030 Register HSum = CTPOP.getReg(0);
2031 unsigned Opc;
2032 SmallVector<LLT> HAddTys;
2033 if (Ty.isScalar()) {
2034 Opc = Intrinsic::aarch64_neon_uaddlv;
2035 HAddTys.push_back(LLT::scalar(32));
2036 } else if (Ty == LLT::fixed_vector(8, 16)) {
2037 Opc = Intrinsic::aarch64_neon_uaddlp;
2038 HAddTys.push_back(LLT::fixed_vector(8, 16));
2039 } else if (Ty == LLT::fixed_vector(4, 32)) {
2040 Opc = Intrinsic::aarch64_neon_uaddlp;
2041 HAddTys.push_back(LLT::fixed_vector(8, 16));
2042 HAddTys.push_back(LLT::fixed_vector(4, 32));
2043 } else if (Ty == LLT::fixed_vector(2, 64)) {
2044 Opc = Intrinsic::aarch64_neon_uaddlp;
2045 HAddTys.push_back(LLT::fixed_vector(8, 16));
2046 HAddTys.push_back(LLT::fixed_vector(4, 32));
2047 HAddTys.push_back(LLT::fixed_vector(2, 64));
2048 } else if (Ty == LLT::fixed_vector(4, 16)) {
2049 Opc = Intrinsic::aarch64_neon_uaddlp;
2050 HAddTys.push_back(LLT::fixed_vector(4, 16));
2051 } else if (Ty == LLT::fixed_vector(2, 32)) {
2052 Opc = Intrinsic::aarch64_neon_uaddlp;
2053 HAddTys.push_back(LLT::fixed_vector(4, 16));
2054 HAddTys.push_back(LLT::fixed_vector(2, 32));
2055 } else
2056 llvm_unreachable("unexpected vector shape");
2057 MachineInstrBuilder UADD;
2058 for (LLT HTy : HAddTys) {
2059 UADD = MIRBuilder.buildIntrinsic(Opc, {HTy}).addUse(HSum);
2060 HSum = UADD.getReg(0);
2061 }
2063 // Post-conditioning.
2064 if (Ty.isScalar() && (Size == 64 || Size == 128))
2065 MIRBuilder.buildZExt(Dst, UADD);
2066 else
2067 UADD->getOperand(0).setReg(Dst);
2068 MI.eraseFromParent();
2069 return true;
2070 }
2072 bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
2073 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2074 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2075 LLT s64 = LLT::scalar(64);
2076 auto Addr = MI.getOperand(1).getReg();
2077 auto DesiredI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(2));
2078 auto NewI = MIRBuilder.buildUnmerge({s64, s64}, MI.getOperand(3));
2079 auto DstLo = MRI.createGenericVirtualRegister(s64);
2080 auto DstHi = MRI.createGenericVirtualRegister(s64);
2082 MachineInstrBuilder CAS;
2083 if (ST->hasLSE()) {
2084 // We have 128-bit CASP instructions taking XSeqPair registers, which are
2085 // s128. We need the merge/unmerge to bracket the expansion and pair up with
2086 // the rest of the MIR so we must reassemble the extracted registers into a
2087 // 128-bit known-regclass one with code like this:
2089 // %in1 = REG_SEQUENCE Lo, Hi ; One for each input
2090 // %out = CASP %in1, ...
2091 // %OldLo = G_EXTRACT %out, 0
2092 // %OldHi = G_EXTRACT %out, 64
2093 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2094 unsigned Opcode;
2095 switch (Ordering) {
2096 case AtomicOrdering::Acquire:
2097 Opcode = AArch64::CASPAX;
2098 break;
2099 case AtomicOrdering::Release:
2100 Opcode = AArch64::CASPLX;
2101 break;
2102 case AtomicOrdering::AcquireRelease:
2103 case AtomicOrdering::SequentiallyConsistent:
2104 Opcode = AArch64::CASPALX;
2105 break;
2106 default:
2107 Opcode = AArch64::CASPX;
2108 break;
2109 }
2111 LLT s128 = LLT::scalar(128);
2112 auto CASDst = MRI.createGenericVirtualRegister(s128);
2113 auto CASDesired = MRI.createGenericVirtualRegister(s128);
2114 auto CASNew = MRI.createGenericVirtualRegister(s128);
2115 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASDesired}, {})
2116 .addUse(DesiredI->getOperand(0).getReg())
2117 .addImm(AArch64::sube64)
2118 .addUse(DesiredI->getOperand(1).getReg())
2119 .addImm(AArch64::subo64);
2120 MIRBuilder.buildInstr(TargetOpcode::REG_SEQUENCE, {CASNew}, {})
2121 .addUse(NewI->getOperand(0).getReg())
2122 .addImm(AArch64::sube64)
2123 .addUse(NewI->getOperand(1).getReg())
2124 .addImm(AArch64::subo64);
2126 CAS = MIRBuilder.buildInstr(Opcode, {CASDst}, {CASDesired, CASNew, Addr});
2128 MIRBuilder.buildExtract({DstLo}, {CASDst}, 0);
2129 MIRBuilder.buildExtract({DstHi}, {CASDst}, 64);
2130 } else {
2131 // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
2132 // can take arbitrary registers so it just has the normal GPR64 operands the
2133 // rest of AArch64 is expecting.
2134 auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
2135 unsigned Opcode;
2136 switch (Ordering) {
2137 case AtomicOrdering::Acquire:
2138 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
2139 break;
2140 case AtomicOrdering::Release:
2141 Opcode = AArch64::CMP_SWAP_128_RELEASE;
2142 break;
2143 case AtomicOrdering::AcquireRelease:
2144 case AtomicOrdering::SequentiallyConsistent:
2145 Opcode = AArch64::CMP_SWAP_128;
2146 break;
2147 default:
2148 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
2149 break;
2150 }
2152 auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2153 CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
2154 {Addr, DesiredI->getOperand(0),
2155 DesiredI->getOperand(1), NewI->getOperand(0),
2156 NewI->getOperand(1)});
2157 }
2159 CAS.cloneMemRefs(MI);
2160 constrainSelectedInstRegOperands(*CAS, *ST->getInstrInfo(),
2161 *MRI.getTargetRegisterInfo(),
2162 *ST->getRegBankInfo());
2164 MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {DstLo, DstHi});
2165 MI.eraseFromParent();
2166 return true;
2167 }
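// Editorial note (illustrative): legalizeCTTZ below relies on the identity
// cttz(x) == ctlz(bitreverse(x)); e.g. for x = 0b...0100 the bit-reversed
// value has exactly two leading zeros, matching cttz(x) == 2.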
2169 bool AArch64LegalizerInfo::legalizeCTTZ(MachineInstr &MI,
2170 LegalizerHelper &Helper) const {
2171 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2172 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2173 LLT Ty = MRI.getType(MI.getOperand(1).getReg());
2174 auto BitReverse = MIRBuilder.buildBitReverse(Ty, MI.getOperand(1));
2175 MIRBuilder.buildCTLZ(MI.getOperand(0).getReg(), BitReverse);
2176 MI.eraseFromParent();
2177 return true;
2178 }
2180 bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
2181 LegalizerHelper &Helper) const {
2182 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2184 // Tagged version MOPSMemorySetTagged is legalised in legalizeIntrinsic
2185 if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
2186 // Anyext the value being set to 64 bit (only the bottom 8 bits are read by
2187 // the instruction).
2188 auto &Value = MI.getOperand(1);
2189 Register ExtValueReg =
2190 MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
2191 Value.setReg(ExtValueReg);
2192 return true;
2193 }
2195 return false;
2196 }
2198 bool AArch64LegalizerInfo::legalizeExtractVectorElt(
2199 MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const {
2200 const GExtractVectorElement *Element = cast<GExtractVectorElement>(&MI);
2201 auto VRegAndVal =
2202 getIConstantVRegValWithLookThrough(Element->getIndexReg(), MRI);
2203 if (VRegAndVal)
2204 return true;
2205 LLT VecTy = MRI.getType(Element->getVectorReg());
2206 if (VecTy.isScalableVector())
2207 return true;
2208 return Helper.lowerExtractInsertVectorElt(MI) !=
2209 LegalizerHelper::LegalizeResult::UnableToLegalize;
2210 }
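// Illustrative IR (editorial assumption, not taken from the source) of a
// function that reaches the probed path in legalizeDynStackAlloc below:
//   define void @f(i64 %n) "probe-stack"="inline-asm" {
//     %buf = alloca i8, i64 %n
//     ...
//   }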
2212 bool AArch64LegalizerInfo::legalizeDynStackAlloc(
2213 MachineInstr &MI, LegalizerHelper &Helper) const {
2214 MachineFunction &MF = *MI.getParent()->getParent();
2215 MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
2216 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
2218 // If stack probing is not enabled for this function, use the default
2219 // lowering.
2220 if (!MF.getFunction().hasFnAttribute("probe-stack") ||
2221 MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
2222 "inline-asm") {
2223 Helper.lowerDynStackAlloc(MI);
2224 return true;
2225 }
2227 Register Dst = MI.getOperand(0).getReg();
2228 Register AllocSize = MI.getOperand(1).getReg();
2229 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
2231 assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
2232 "Unexpected type for dynamic alloca");
2233 assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
2234 "Unexpected type for dynamic alloca");
2236 LLT PtrTy = MRI.getType(Dst);
2237 Register SPReg =
2238 Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
2239 Register SPTmp =
2240 Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
2241 auto NewMI =
2242 MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
2243 MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
2244 MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
2245 MIRBuilder.buildCopy(Dst, SPTmp);
2247 MI.eraseFromParent();
2248 return true;
2249 }
2251 bool AArch64LegalizerInfo::legalizePrefetch(MachineInstr &MI,
2252 LegalizerHelper &Helper) const {
2253 MachineIRBuilder &MIB = Helper.MIRBuilder;
2254 auto &AddrVal = MI.getOperand(0);
2256 int64_t IsWrite = MI.getOperand(1).getImm();
2257 int64_t Locality = MI.getOperand(2).getImm();
2258 int64_t IsData = MI.getOperand(3).getImm();
2260 bool IsStream = Locality == 0;
2261 if (Locality != 0) {
2262 assert(Locality <= 3 && "Prefetch locality out-of-range");
2263 // The IR locality argument runs from 1 (least local, L3) to 3 (most local,
2264 // L1), while the PRFM target-cache encoding starts at 0 for L1, so invert
2265 // the value.
2266 Locality = 3 - Locality;
2267 }
2269 unsigned PrfOp = (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
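// Worked example (editorial, illustrative only): __builtin_prefetch(p, /*rw=*/0,
// /*locality=*/3) arrives here as IsWrite=0, Locality=3, IsData=1; the inversion
// above yields Locality=0 (L1), so PrfOp = 0, i.e. PLDL1KEEP.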
2271 MIB.buildInstr(AArch64::G_AARCH64_PREFETCH).addImm(PrfOp).add(AddrVal);
2272 MI.eraseFromParent();
2273 return true;
2274 }