//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future when both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  // ...
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  // ...
  }
}

static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  // ...
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    // We guarantee i1 splat_vectors to zero the other lanes by
    // implementing it with ptrue and possibly a punpklo for nxv1i1.
    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
      return true;
    return false;
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
      return true;
    }
  }
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
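
  // LS64 (FEAT_LS64) provides the LD64B/ST64B 64-byte single-copy atomic
  // load/store instructions; the i64x8 register-tuple type below exists to
  // carry their eight-register operand, hence the custom LOAD/STORE lowering.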
  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);

    // Someone set us up the NEON.
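    // addDRTypeForNEON/addQRTypeForNEON (helpers defined later in this file)
    // register the type in the 64-bit D-register or 128-bit Q-register class
    // respectively and apply the operation actions common to NEON vectors.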
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);
    if (Subtarget->hasBF16())
      addDRTypeForNEON(MVT::v4bf16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
    if (Subtarget->hasBF16())
      addQRTypeForNEON(MVT::v8bf16);
  }

  if (Subtarget->hasSVE() || Subtarget->hasSME()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
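
    // Predicates (nxv*i1) live in the SVE predicate register file (PPR); the
    // data vector types added below live in the SVE vector registers (ZPR).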
    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    if (Subtarget->hasBF16()) {
      addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
      addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
    }

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }
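
  // The useSVEForFixedLengthVectorVT() checks above map fixed-length vector
  // types onto ZPR when the minimum SVE register width is known to be wide
  // enough, so such vectors can be lowered with SVE instructions instead of
  // being split up.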

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f16, Custom);
  setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
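
  // The LibCall entries above go straight to the fp128 soft-float routines
  // (typically __addtf3, __subtf3, __multf3, __divtf3); the Expand entries are
  // legalized generically, e.g. FNEG/FABS become integer bit operations.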

  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
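
  // Windows requires dynamic stack allocations to be probed (the __chkstk
  // path), so DYNAMIC_STACKALLOC gets custom lowering there; elsewhere the
  // generic expansion is sufficient.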
  if (Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
  setOperationAction(ISD::CTPOP, MVT::i128, Custom);

  setOperationAction(ISD::PARITY, MVT::i64, Custom);
  setOperationAction(ISD::PARITY, MVT::i128, Custom);

  setOperationAction(ISD::ABS, MVT::i32, Custom);
  setOperationAction(ISD::ABS, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
  setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
  setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16())
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
  else
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);

  for (auto Op : {ISD::FREM,        ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,        ISD::FSIN,         ISD::FSINCOS,
                  ISD::FEXP,        ISD::FEXP2,        ISD::FLOG,
                  ISD::FLOG2,       ISD::FLOG10,       ISD::STRICT_FREM,
                  ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
                  ISD::STRICT_FSIN, ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
  }

  if (!Subtarget->hasFullFP16()) {
    for (auto Op :
         {ISD::SETCC,          ISD::SELECT_CC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::f16, Promote);

    // Round-to-integer conversions need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, MVT::f16, Custom);

    // Promote v4f16 to v4f32 when that is known to be safe.
    setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);

    setOperationAction(ISD::FABS, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
    setOperationAction(ISD::FMA, MVT::v4f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);

    setOperationAction(ISD::FABS, MVT::v8f16, Expand);
    setOperationAction(ISD::FADD, MVT::v8f16, Expand);
    setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
    setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
    setOperationAction(ISD::FMA, MVT::v8f16, Expand);
    setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
    setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
    setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
    setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
    setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
    setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  }

  // AArch64 has implementations of a lot of rounding-like FP operations.
  for (auto Op :
       {ISD::FFLOOR,          ISD::FNEARBYINT,      ISD::FCEIL,
        ISD::FRINT,           ISD::FTRUNC,          ISD::FROUND,
        ISD::FROUNDEVEN,      ISD::FMINNUM,         ISD::FMAXNUM,
        ISD::FMINIMUM,        ISD::FMAXIMUM,        ISD::LROUND,
        ISD::LLROUND,         ISD::LRINT,           ISD::LLRINT,
        ISD::STRICT_FFLOOR,   ISD::STRICT_FCEIL,    ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT,    ISD::STRICT_FTRUNC,   ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND,   ISD::STRICT_FMINNUM,  ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND,  ISD::STRICT_LRINT,    ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  // Strict conversion to a larger type is legal
  for (auto VT : {MVT::f32, MVT::f64})
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
#define LCALLNAMES(A, B, N)                                                    \
  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
#define LCALLNAME4(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
#define LCALLNAME5(A, B)                                                       \
  LCALLNAMES(A, B, 1)                                                          \
  LCALLNAMES(A, B, 2)                                                          \
  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
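    // The macros above build the outline-atomic helper names: e.g.
    // LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) below registers
    // entries such as "__aarch64_cas4_acq" (4-byte compare-and-swap, acquire
    // ordering) for every supported size and memory ordering.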
    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
#undef LCALLNAMES
#undef LCALLNAME4
#undef LCALLNAME5
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
  // This requires the Performance Monitors extension.
  if (Subtarget->hasPerfMon())
    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }
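
  // __sincos_stret computes sine and cosine in one call and returns both
  // results; it is only available in some C libraries (notably Darwin's),
  // hence the getLibcallName() checks above.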

  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
    // MSVCRT doesn't have powi; fall back to pow
    setLibcallName(RTLIB::POWI_F32, nullptr);
    setLibcallName(RTLIB::POWI_F64, nullptr);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
  setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
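
  // Pre- and post-indexed (writeback) addressing, e.g. "ldr x0, [x1], #8",
  // is marked legal below for the basic scalar integer and FP types so that
  // address increments can be folded into the memory access.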
  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
                       ISD::UINT_TO_FP});

  setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                       ISD::FP_TO_UINT_SAT, ISD::FDIV});

  // Try and combine setcc with csel
  setTargetDAGCombine(ISD::SETCC);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
                       ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
                       ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
                       ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
  setTargetDAGCombine(ISD::LOAD);

  setTargetDAGCombine(ISD::MSTORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});

  setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                       ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
                       ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});

  setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});

  setTargetDAGCombine(ISD::FP_EXTEND);

  setTargetDAGCombine(ISD::GlobalAddress);
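
  // These limits bound how many scalar loads/stores SelectionDAG will emit
  // when expanding memset/memcpy/memmove/memcmp inline; beyond them the call
  // is left to the library routine.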
  // In case of strict alignment, avoid an excessive number of byte wide stores.
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemset =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;

  MaxGluedStoresPerMemcpy = 4;
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemcpy =
      Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;

  MaxStoresPerMemmoveOptSize = 4;
  MaxStoresPerMemmove = 4;

  MaxLoadsPerMemcmpOptSize = 4;
  MaxLoadsPerMemcmp =
      Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  EnableExtLdPromotion = true;

  // Set required alignment.
  setMinFunctionAlignment(Align(4));
  // Set preferred alignments.
  setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
  setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
  setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));

  // Only change the limit for entries in a jump table if specified by
  // the subtarget, but not at the command line.
  unsigned MaxJT = STI.getMaximumJumpTableSize();
  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
    setMaximumJumpTableSize(MaxJT);

  setHasExtractBitsInsn(true);

  setMaxDivRemBitWidthSupported(128);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    for (auto Op :
         {ISD::SELECT,         ISD::SELECT_CC,      ISD::SETCC,
          ISD::BR_CC,          ISD::FADD,           ISD::FSUB,
          ISD::FMUL,           ISD::FDIV,           ISD::FMA,
          ISD::FNEG,           ISD::FABS,           ISD::FCEIL,
          ISD::FSQRT,          ISD::FFLOOR,         ISD::FNEARBYINT,
          ISD::FRINT,          ISD::FROUND,         ISD::FROUNDEVEN,
          ISD::FTRUNC,         ISD::FMINNUM,        ISD::FMAXNUM,
          ISD::FMINIMUM,       ISD::FMAXIMUM,       ISD::STRICT_FADD,
          ISD::STRICT_FSUB,    ISD::STRICT_FMUL,    ISD::STRICT_FDIV,
          ISD::STRICT_FMA,     ISD::STRICT_FCEIL,   ISD::STRICT_FFLOOR,
          ISD::STRICT_FSQRT,   ISD::STRICT_FRINT,   ISD::STRICT_FNEARBYINT,
          ISD::STRICT_FROUND,  ISD::STRICT_FTRUNC,  ISD::STRICT_FROUNDEVEN,
          ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
          ISD::STRICT_FMAXIMUM})
      setOperationAction(Op, MVT::v1f64, Expand);

    for (auto Op :
         {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
          ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
          ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
          ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
      setOperationAction(Op, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector -> f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);

    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
    // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
    for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                    ISD::STRICT_UINT_TO_FP})
      for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
        setOperationAction(Op, VT, Custom);

    if (Subtarget->hasFullFP16()) {
      setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

      setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
    } else {
      // When AArch64 doesn't have fullfp16 support, promote the input
      // to i32 first.
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
      setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
      setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
    }

    setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
    for (auto VT : {MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
    }

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
                   MVT::v4i32}) {
      setOperationAction(ISD::AVGFLOORS, VT, Legal);
      setOperationAction(ISD::AVGFLOORU, VT, Legal);
      setOperationAction(ISD::AVGCEILS, VT, Legal);
      setOperationAction(ISD::AVGCEILU, VT, Legal);
      setOperationAction(ISD::ABDS, VT, Legal);
      setOperationAction(ISD::ABDU, VT, Legal);
    }

    // Vector reductions
    for (MVT VT : { MVT::v4f16, MVT::v2f32,
                    MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
        setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
        setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);

        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
      }
    }
    for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
                    MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
    }
    setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
        setOperationAction(ISD::MULHS, VT, Legal);
        setOperationAction(ISD::MULHU, VT, Legal);
      } else {
        setOperationAction(ISD::MULHS, VT, Expand);
        setOperationAction(ISD::MULHU, VT, Expand);
      }

      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    for (auto Op :
         {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
          ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
          ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
          ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
      for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
        setOperationAction(Op, Ty, Legal);
      if (Subtarget->hasFullFP16())
        for (MVT Ty : {MVT::v4f16, MVT::v8f16})
          setOperationAction(Op, Ty, Legal);
    }

    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);

    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
    setLoadExtAction(ISD::EXTLOAD,  MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);

    // ADDP custom lowering
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ADD, VT, Custom);
    // FADDP custom lowering
    for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::FADD, VT, Custom);
  }

  if (Subtarget->hasSME()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  }

  // FIXME: Move lowering for more nodes here if those are common between
  // SVE and SME.
  if (Subtarget->hasSVE() || Subtarget->hasSME()) {
    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
  }

  if (Subtarget->hasSME())
    setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  if (Subtarget->hasSVE()) {
    for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
      setOperationAction(ISD::BITREVERSE, VT, Custom);
      setOperationAction(ISD::BSWAP, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      setOperationAction(ISD::SINT_TO_FP, VT, Custom);
      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::MULHS, VT, Custom);
      setOperationAction(ISD::MULHU, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
      setOperationAction(ISD::UMIN, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Custom);
      setOperationAction(ISD::UMAX, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::ABS, VT, Custom);
      setOperationAction(ISD::ABDS, VT, Custom);
      setOperationAction(ISD::ABDU, VT, Custom);
      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);

      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
    }

    // Illegal unpacked integer vector types.
    for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
    }

    // Legalize unpacked bitcasts to REINTERPRET_CAST.
    for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
                    MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
      setOperationAction(ISD::BITCAST, VT, Custom);

    for (auto VT :
         { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
           MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);

    for (auto VT :
         {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
        setOperationAction(ISD::SINT_TO_FP, VT, Custom);
        setOperationAction(ISD::UINT_TO_FP, VT, Custom);
      }
    }

    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    // Firstly, exclude all scalable vector extending loads/truncating stores,
    // covering both integer and floating-point scalable vectors.
    for (MVT VT : MVT::scalable_vector_valuetypes()) {
      for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // Then, selectively enable those which we directly support.
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
    setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
    setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
    setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
    for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
      setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
      setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
      setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
    }

    // SVE supports truncating stores of 64 and 128-bit vectors
    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);

    for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
                    MVT::nxv4f32, MVT::nxv2f64}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::FADD, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
      setOperationAction(ISD::FDIV, VT, Custom);
      setOperationAction(ISD::FMA, VT, Custom);
      setOperationAction(ISD::FMAXIMUM, VT, Custom);
      setOperationAction(ISD::FMAXNUM, VT, Custom);
      setOperationAction(ISD::FMINIMUM, VT, Custom);
      setOperationAction(ISD::FMINNUM, VT, Custom);
      setOperationAction(ISD::FMUL, VT, Custom);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FSUB, VT, Custom);
      setOperationAction(ISD::FCEIL, VT, Custom);
      setOperationAction(ISD::FFLOOR, VT, Custom);
      setOperationAction(ISD::FNEARBYINT, VT, Custom);
      setOperationAction(ISD::FRINT, VT, Custom);
      setOperationAction(ISD::FROUND, VT, Custom);
      setOperationAction(ISD::FROUNDEVEN, VT, Custom);
      setOperationAction(ISD::FTRUNC, VT, Custom);
      setOperationAction(ISD::FSQRT, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
      setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);

      setOperationAction(ISD::SELECT_CC, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::FPOWI, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);

      setCondCodeAction(ISD::SETO, VT, Expand);
      setCondCodeAction(ISD::SETOLT, VT, Expand);
      setCondCodeAction(ISD::SETLT, VT, Expand);
      setCondCodeAction(ISD::SETOLE, VT, Expand);
      setCondCodeAction(ISD::SETLE, VT, Expand);
      setCondCodeAction(ISD::SETULT, VT, Expand);
      setCondCodeAction(ISD::SETULE, VT, Expand);
      setCondCodeAction(ISD::SETUGE, VT, Expand);
      setCondCodeAction(ISD::SETUGT, VT, Expand);
      setCondCodeAction(ISD::SETUEQ, VT, Expand);
      setCondCodeAction(ISD::SETONE, VT, Expand);
    }
    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
    }

    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
    // NEON doesn't support integer divides, but SVE does
    for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
                    MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
    }

    // NEON doesn't support 64-bit vector integer muls, but SVE does.
    setOperationAction(ISD::MUL, MVT::v1i64, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    // NEON doesn't support across-vector reductions, but SVE does.
    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
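    // An ordered reduction such as VECREDUCE_SEQ_FADD has no NEON equivalent,
    // but once the operands are widened into scalable containers it can be
    // performed with the SVE FADDA instruction.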
    // NOTE: Currently this has to happen after computeRegisterProperties rather
    // than the preferred option of combining it with the addRegisterClass call.
    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);
      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addTypeForFixedLengthSVE(VT);

      // 64bit results can mean a bigger than NEON input.
      for (auto VT : {MVT::v8i8, MVT::v4i16})
        setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);

      // 128bit results imply a bigger than NEON input.
      for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
        setOperationAction(ISD::TRUNCATE, VT, Custom);
      for (auto VT : {MVT::v8f16, MVT::v4f32})
        setOperationAction(ISD::FP_ROUND, VT, Custom);

      // These operations are not supported on NEON but SVE can do them.
      setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
      setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
      setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
      setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
      setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
      setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
      setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
      setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
      setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
      setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
      setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
      setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
      setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
      setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
      setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
      setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
      setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
      setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
      setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
      setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);

      // Int operations with no NEON support.
      for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                      MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
        setOperationAction(ISD::BITREVERSE, VT, Custom);
        setOperationAction(ISD::CTTZ, VT, Custom);
        setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
        setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
        setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
      }

      // Use SVE for vectors with more than 2 elements.
      for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
    }

    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);

    setOperationAction(ISD::VSCALE, MVT::i32, Custom);
  }

  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
    // Only required for llvm.aarch64.mops.memset.tag
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
  }

  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();

  IsStrictFPEnabled = true;
}
void AArch64TargetLowering::addTypeForNEON(MVT VT) {
  assert(VT.isVector() && "VT should be a vector type");

  if (VT.isFloatingPoint()) {
    MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
    setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
    setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // But we do support custom-lowering for FCOPYSIGN.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
    setOperationAction(ISD::FCOPYSIGN, VT, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  setOperationAction(ISD::SRA, VT, Custom);
  setOperationAction(ISD::SRL, VT, Custom);
  setOperationAction(ISD::SHL, VT, Custom);
  setOperationAction(ISD::OR, VT, Custom);
  setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

  setOperationAction(ISD::SELECT, VT, Expand);
  setOperationAction(ISD::SELECT_CC, VT, Expand);
  setOperationAction(ISD::VSELECT, VT, Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

  // CNT supports only B element sizes, then use UADDLP to widen.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT, Custom);

  setOperationAction(ISD::UDIV, VT, Expand);
  setOperationAction(ISD::SDIV, VT, Expand);
  setOperationAction(ISD::UREM, VT, Expand);
  setOperationAction(ISD::SREM, VT, Expand);
  setOperationAction(ISD::FREM, VT, Expand);

  for (unsigned Opcode :
       {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
        ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
    setOperationAction(Opcode, VT, Custom);

  if (!VT.isFloatingPoint())
    setOperationAction(ISD::ABS, VT, Legal);

  // [SU][MIN|MAX] are available for all NEON types apart from i64.
  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
      setOperationAction(Opcode, VT, Legal);

  // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
  // NEON types.
  if (VT.isFloatingPoint() &&
      VT.getVectorElementType() != MVT::bf16 &&
      (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
    for (unsigned Opcode :
         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
          ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
          ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
          ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
          ISD::STRICT_FSQRT})
      setOperationAction(Opcode, VT, Legal);

  // Strict fp extend and trunc are legal
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
    setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
    setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);

  // FIXME: We could potentially make use of the vector comparison instructions
  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
  // complications:
  //  * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
  //    so we would need to expand when the condition code doesn't match the
  //    kind of comparison.
  //  * Some kinds of comparison require more than one FCMXY instruction so
  //    would need to be expanded instead.
  //  * The lowering of the non-strict versions involves target-specific ISD
  //    nodes so we would likely need to add strict versions of all of them and
  //    handle them appropriately.
  setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
  setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT, Legal);
      setIndexedStoreAction(im, VT, Legal);
    }
  }
}
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
                                                          EVT OpVT) const {
  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
  if (!Subtarget->hasSVE())
    return true;

  // We can only support legal predicate result types. We can use the SVE
  // whilelo instruction for generating fixed-width predicates too.
  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
      ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
      ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
    return true;

  // The whilelo instruction only works with i32 or i64 scalar inputs.
  if (OpVT != MVT::i32 && OpVT != MVT::i64)
    return true;

  return false;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  // By default everything must be expanded.
  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
    setOperationAction(Op, VT, Expand);

  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

  if (VT.isFloatingPoint()) {
    setCondCodeAction(ISD::SETO, VT, Expand);
    setCondCodeAction(ISD::SETOLT, VT, Expand);
    setCondCodeAction(ISD::SETLT, VT, Expand);
    setCondCodeAction(ISD::SETOLE, VT, Expand);
    setCondCodeAction(ISD::SETLE, VT, Expand);
    setCondCodeAction(ISD::SETULT, VT, Expand);
    setCondCodeAction(ISD::SETULE, VT, Expand);
    setCondCodeAction(ISD::SETUGE, VT, Expand);
    setCondCodeAction(ISD::SETUGT, VT, Expand);
    setCondCodeAction(ISD::SETUEQ, VT, Expand);
    setCondCodeAction(ISD::SETONE, VT, Expand);
  }

  // Mark integer truncating stores/extending loads as having custom lowering
  if (VT.isInteger()) {
    MVT InnerVT = VT.changeVectorElementType(MVT::i8);
    while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
    }
  }
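  // For example, for VT == MVT::v8i64 the loop above marks truncating stores
  // and extending loads to/from v8i8, v8i16 and v8i32 as Custom, stopping once
  // the element width reaches that of VT itself.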
  // Mark floating-point truncating stores/extending loads as having custom
  // lowering.
  if (VT.isFloatingPoint()) {
    MVT InnerVT = VT.changeVectorElementType(MVT::f16);
    while (InnerVT != VT) {
      setTruncStoreAction(VT, InnerVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
      InnerVT = InnerVT.changeVectorElementType(
          MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
    }
  }
  // Lower fixed length vector operations to scalable equivalents.
  setOperationAction(ISD::ABS, VT, Custom);
  setOperationAction(ISD::ADD, VT, Custom);
  setOperationAction(ISD::AND, VT, Custom);
  setOperationAction(ISD::ANY_EXTEND, VT, Custom);
  setOperationAction(ISD::BITCAST, VT, Custom);
  setOperationAction(ISD::BITREVERSE, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::BSWAP, VT, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
  setOperationAction(ISD::CTLZ, VT, Custom);
  setOperationAction(ISD::CTPOP, VT, Custom);
  setOperationAction(ISD::CTTZ, VT, Custom);
  setOperationAction(ISD::FABS, VT, Custom);
  setOperationAction(ISD::FADD, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::FCEIL, VT, Custom);
  setOperationAction(ISD::FCOPYSIGN, VT, Custom);
  setOperationAction(ISD::FDIV, VT, Custom);
  setOperationAction(ISD::FFLOOR, VT, Custom);
  setOperationAction(ISD::FMA, VT, Custom);
  setOperationAction(ISD::FMAXIMUM, VT, Custom);
  setOperationAction(ISD::FMAXNUM, VT, Custom);
  setOperationAction(ISD::FMINIMUM, VT, Custom);
  setOperationAction(ISD::FMINNUM, VT, Custom);
  setOperationAction(ISD::FMUL, VT, Custom);
  setOperationAction(ISD::FNEARBYINT, VT, Custom);
  setOperationAction(ISD::FNEG, VT, Custom);
  setOperationAction(ISD::FP_EXTEND, VT, Custom);
  setOperationAction(ISD::FP_ROUND, VT, Custom);
  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
  setOperationAction(ISD::FP_TO_UINT, VT, Custom);
  setOperationAction(ISD::FRINT, VT, Custom);
  setOperationAction(ISD::FROUND, VT, Custom);
  setOperationAction(ISD::FROUNDEVEN, VT, Custom);
  setOperationAction(ISD::FSQRT, VT, Custom);
  setOperationAction(ISD::FSUB, VT, Custom);
  setOperationAction(ISD::FTRUNC, VT, Custom);
  setOperationAction(ISD::LOAD, VT, Custom);
  setOperationAction(ISD::MGATHER, VT, Custom);
  setOperationAction(ISD::MLOAD, VT, Custom);
  setOperationAction(ISD::MSCATTER, VT, Custom);
  setOperationAction(ISD::MSTORE, VT, Custom);
  setOperationAction(ISD::MUL, VT, Custom);
  setOperationAction(ISD::MULHS, VT, Custom);
  setOperationAction(ISD::MULHU, VT, Custom);
  setOperationAction(ISD::OR, VT, Custom);
  setOperationAction(ISD::SDIV, VT, Custom);
  setOperationAction(ISD::SELECT, VT, Custom);
  setOperationAction(ISD::SETCC, VT, Custom);
  setOperationAction(ISD::SHL, VT, Custom);
  setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
  setOperationAction(ISD::SINT_TO_FP, VT, Custom);
  setOperationAction(ISD::SMAX, VT, Custom);
  setOperationAction(ISD::SMIN, VT, Custom);
  setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
  setOperationAction(ISD::SRA, VT, Custom);
  setOperationAction(ISD::SRL, VT, Custom);
  setOperationAction(ISD::STORE, VT, Custom);
  setOperationAction(ISD::SUB, VT, Custom);
  setOperationAction(ISD::TRUNCATE, VT, Custom);
  setOperationAction(ISD::UDIV, VT, Custom);
  setOperationAction(ISD::UINT_TO_FP, VT, Custom);
  setOperationAction(ISD::UMAX, VT, Custom);
  setOperationAction(ISD::UMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
  setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
  setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::VSELECT, VT, Custom);
  setOperationAction(ISD::XOR, VT, Custom);
  setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
}
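// As an illustration, with 256-bit SVE registers an ADD on v8i32 is handled by
// wrapping the operands with convertToScalableVector, performing the operation
// on the nxv4i32 container, and unwrapping the result again with
// convertFromScalableVector (see the helpers declared further down this file).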
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
                                              LLVMContext &C, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  if (VT.isScalableVector())
    return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
  return VT.changeVectorElementTypeToInteger();
}
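// For example, a SETCC on MVT::nxv4f32 yields an MVT::nxv4i1 predicate, a
// SETCC on MVT::v4f32 yields MVT::v4i32, and scalar comparisons use i32.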
// isIntImmediate - This method tests to see if the node is a constant
// operand. If so Imm will receive the value.
static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
    Imm = C->getZExtValue();
    return true;
  }
  return false;
}

// isOpcWithIntImmediate - This method tests to see if the node is a specific
// opcode and that it has an immediate integer right operand.
// If so Imm will receive the value.
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
                                  uint64_t &Imm) {
  return N->getOpcode() == Opc &&
         isIntImmediate(N->getOperand(1).getNode(), Imm);
}
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                               const APInt &Demanded,
                               TargetLowering::TargetLoweringOpt &TLO,
                               unsigned NewOpc) {
  uint64_t OldImm = Imm, NewImm, Enc;
  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

  // Return if the immediate is already all zeros, all ones, a bimm32 or a
  // bimm64.
  if (Imm == 0 || Imm == Mask ||
      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
    return false;

  unsigned EltSize = Size;
  uint64_t DemandedBits = Demanded.getZExtValue();

  // Clear bits that are not demanded.
  Imm &= DemandedBits;

  while (true) {
    // The goal here is to set the non-demanded bits in a way that minimizes
    // the number of switches between 0 and 1. In order to achieve this goal,
    // we set the non-demanded bits to the value of the preceding demanded bits.
    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
    // The final result is 0b11000011.
    uint64_t NonDemandedBits = ~DemandedBits;
    uint64_t InvertedImm = ~Imm & DemandedBits;
    uint64_t RotatedImm =
        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
        NonDemandedBits;
    uint64_t Sum = RotatedImm + NonDemandedBits;
    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
    NewImm = (Imm | Ones) & Mask;
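    // For instance, with the 8-bit example from the comment above we have
    // DemandedBits = 0b01100101 and Imm = 0b01000001, so
    // NonDemandedBits = 0b10011010, InvertedImm = 0b00100100,
    // RotatedImm = 0b00001000, Sum = 0b10100010, Carry = 0 and
    // Ones = 0b10000010, giving NewImm = 0b11000011 as stated.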
    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
    // we halve the element size and continue the search.
    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
      break;

    // We cannot shrink the element size any further if it is 2-bits.
    if (EltSize == 2)
      return false;

    EltSize /= 2;
    Mask >>= EltSize;
    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

    // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
      return false;

    // Merge the upper and lower halves of Imm and DemandedBits.
    Imm |= Hi;
    DemandedBits |= DemandedBitsHi;
  }

  // Replicate the element across the register width.
  while (EltSize < Size) {
    NewImm |= NewImm << EltSize;
    EltSize *= 2;
  }

  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
         "demanded bits should never be altered");
  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

  // Create the new constant immediate node.
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue New;

  // If the new constant immediate is all-zeros or all-ones, let the target
  // independent DAG combine optimize this node.
  if (NewImm == 0 || NewImm == OrigMask) {
    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                          TLO.DAG.getConstant(NewImm, DL, VT));
    // Otherwise, create a machine node so that target independent DAG combine
    // doesn't undo this optimization.
  } else {
    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
    New = SDValue(
        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
  }

  return TLO.CombineTo(Op, New);
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
    TargetLoweringOpt &TLO) const {
  // Delay this optimization to as late as possible.
  if (!TLO.LegalOps)
    return false;

  if (!EnableOptimizeLogicalImm)
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector())
    return false;

  unsigned Size = VT.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "i32 or i64 is expected after legalization.");

  // Exit early if we demand all bits.
  if (DemandedBits.countPopulation() == Size)
    return false;

  unsigned NewOpc;
  switch (Op.getOpcode()) {
  default:
    return false;
  case ISD::AND:
    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
    break;
  case ISD::OR:
    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
    break;
  case ISD::XOR:
    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
    break;
  }
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  uint64_t Imm = C->getZExtValue();
  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::DUP: {
    SDValue SrcOp = Op.getOperand(0);
    Known = DAG.computeKnownBits(SrcOp, Depth + 1);
    if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
      assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
             "Expected DUP implicit truncation");
      Known = Known.trunc(Op.getScalarValueSizeInBits());
    }
    break;
  }
  case AArch64ISD::CSEL: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::commonBits(Known, Known2);
    break;
  }
  case AArch64ISD::BICi: {
    // Compute the bit cleared value.
    uint64_t Mask =
        ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
    break;
  }
  case AArch64ISD::VLSHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::lshr(Known, Known2);
    break;
  }
  case AArch64ISD::VASHR: {
    KnownBits Known2;
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
    Known = KnownBits::ashr(Known, Known2);
    break;
  }
  case AArch64ISD::LOADgot:
  case AArch64ISD::ADDlow: {
    if (!Subtarget->isTargetILP32())
      break;
    // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
    Known.Zero = APInt::getHighBitsSet(64, 32);
    break;
  }
  case AArch64ISD::ASSERT_ZEXT_BOOL: {
    Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
    Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default:
      return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = Known.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarSizeInBits();
      Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
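      // e.g. an i8 ldxr/ldaxr zero-extends into a 64-bit register, so the top
      // 56 bits of the result are known to be zero.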
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = Known.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        Known.Zero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        Known.Zero |= Mask;
      }
      break;
    }
    }
  }
  }
}
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                  EVT) const {
  return MVT::i64;
}
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Alignment <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            VT == MVT::v2i64;
  }
  return true;
}
// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
    LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
    bool *Fast) const {
  if (Subtarget->requiresStrictAlign())
    return false;

  if (Fast) {
    // Some CPUs are fine with unaligned stores except for 128-bit ones.
    *Fast = !Subtarget->isMisaligned128StoreSlow() ||
            Ty.getSizeInBytes() != 16 ||
            // See comments in performSTORECombine() for more details about
            // these conditions.

            // Code that uses clang vector extensions can mark that it
            // wants unaligned accesses to be treated as fast by
            // underspecifying alignment to be 1 or 2.
            Alignment <= 2 ||

            // Disregard v2i64. Memcpy lowering produces those and splitting
            // them regresses performance on micro-benchmarks and olden/bh.
            Ty == LLT::fixed_vector(2, 64);
  }
  return true;
}
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;
  switch ((AArch64ISD::NodeType)Opcode) {
  case AArch64ISD::FIRST_NUMBER:
    break;
    MAKE_CASE(AArch64ISD::OBSCURE_COPY)
    MAKE_CASE(AArch64ISD::SMSTART)
    MAKE_CASE(AArch64ISD::SMSTOP)
    MAKE_CASE(AArch64ISD::RESTORE_ZA)
    MAKE_CASE(AArch64ISD::CALL)
    MAKE_CASE(AArch64ISD::ADRP)
    MAKE_CASE(AArch64ISD::ADR)
    MAKE_CASE(AArch64ISD::ADDlow)
    MAKE_CASE(AArch64ISD::LOADgot)
    MAKE_CASE(AArch64ISD::RET_FLAG)
    MAKE_CASE(AArch64ISD::BRCOND)
    MAKE_CASE(AArch64ISD::CSEL)
    MAKE_CASE(AArch64ISD::CSINV)
    MAKE_CASE(AArch64ISD::CSNEG)
    MAKE_CASE(AArch64ISD::CSINC)
    MAKE_CASE(AArch64ISD::THREAD_POINTER)
    MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
    MAKE_CASE(AArch64ISD::ABDS_PRED)
    MAKE_CASE(AArch64ISD::ABDU_PRED)
    MAKE_CASE(AArch64ISD::MUL_PRED)
    MAKE_CASE(AArch64ISD::MULHS_PRED)
    MAKE_CASE(AArch64ISD::MULHU_PRED)
    MAKE_CASE(AArch64ISD::SDIV_PRED)
    MAKE_CASE(AArch64ISD::SHL_PRED)
    MAKE_CASE(AArch64ISD::SMAX_PRED)
    MAKE_CASE(AArch64ISD::SMIN_PRED)
    MAKE_CASE(AArch64ISD::SRA_PRED)
    MAKE_CASE(AArch64ISD::SRL_PRED)
    MAKE_CASE(AArch64ISD::UDIV_PRED)
    MAKE_CASE(AArch64ISD::UMAX_PRED)
    MAKE_CASE(AArch64ISD::UMIN_PRED)
    MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
    MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::ADC)
    MAKE_CASE(AArch64ISD::SBC)
    MAKE_CASE(AArch64ISD::ADDS)
    MAKE_CASE(AArch64ISD::SUBS)
    MAKE_CASE(AArch64ISD::ADCS)
    MAKE_CASE(AArch64ISD::SBCS)
    MAKE_CASE(AArch64ISD::ANDS)
    MAKE_CASE(AArch64ISD::CCMP)
    MAKE_CASE(AArch64ISD::CCMN)
    MAKE_CASE(AArch64ISD::FCCMP)
    MAKE_CASE(AArch64ISD::FCMP)
    MAKE_CASE(AArch64ISD::STRICT_FCMP)
    MAKE_CASE(AArch64ISD::STRICT_FCMPE)
    MAKE_CASE(AArch64ISD::DUP)
    MAKE_CASE(AArch64ISD::DUPLANE8)
    MAKE_CASE(AArch64ISD::DUPLANE16)
    MAKE_CASE(AArch64ISD::DUPLANE32)
    MAKE_CASE(AArch64ISD::DUPLANE64)
    MAKE_CASE(AArch64ISD::DUPLANE128)
    MAKE_CASE(AArch64ISD::MOVI)
    MAKE_CASE(AArch64ISD::MOVIshift)
    MAKE_CASE(AArch64ISD::MOVIedit)
    MAKE_CASE(AArch64ISD::MOVImsl)
    MAKE_CASE(AArch64ISD::FMOV)
    MAKE_CASE(AArch64ISD::MVNIshift)
    MAKE_CASE(AArch64ISD::MVNImsl)
    MAKE_CASE(AArch64ISD::BICi)
    MAKE_CASE(AArch64ISD::ORRi)
    MAKE_CASE(AArch64ISD::BSP)
    MAKE_CASE(AArch64ISD::EXTR)
    MAKE_CASE(AArch64ISD::ZIP1)
    MAKE_CASE(AArch64ISD::ZIP2)
    MAKE_CASE(AArch64ISD::UZP1)
    MAKE_CASE(AArch64ISD::UZP2)
    MAKE_CASE(AArch64ISD::TRN1)
    MAKE_CASE(AArch64ISD::TRN2)
    MAKE_CASE(AArch64ISD::REV16)
    MAKE_CASE(AArch64ISD::REV32)
    MAKE_CASE(AArch64ISD::REV64)
    MAKE_CASE(AArch64ISD::EXT)
    MAKE_CASE(AArch64ISD::SPLICE)
    MAKE_CASE(AArch64ISD::VSHL)
    MAKE_CASE(AArch64ISD::VLSHR)
    MAKE_CASE(AArch64ISD::VASHR)
    MAKE_CASE(AArch64ISD::VSLI)
    MAKE_CASE(AArch64ISD::VSRI)
    MAKE_CASE(AArch64ISD::CMEQ)
    MAKE_CASE(AArch64ISD::CMGE)
    MAKE_CASE(AArch64ISD::CMGT)
    MAKE_CASE(AArch64ISD::CMHI)
    MAKE_CASE(AArch64ISD::CMHS)
    MAKE_CASE(AArch64ISD::FCMEQ)
    MAKE_CASE(AArch64ISD::FCMGE)
    MAKE_CASE(AArch64ISD::FCMGT)
    MAKE_CASE(AArch64ISD::CMEQz)
    MAKE_CASE(AArch64ISD::CMGEz)
    MAKE_CASE(AArch64ISD::CMGTz)
    MAKE_CASE(AArch64ISD::CMLEz)
    MAKE_CASE(AArch64ISD::CMLTz)
    MAKE_CASE(AArch64ISD::FCMEQz)
    MAKE_CASE(AArch64ISD::FCMGEz)
    MAKE_CASE(AArch64ISD::FCMGTz)
    MAKE_CASE(AArch64ISD::FCMLEz)
    MAKE_CASE(AArch64ISD::FCMLTz)
    MAKE_CASE(AArch64ISD::SADDV)
    MAKE_CASE(AArch64ISD::UADDV)
    MAKE_CASE(AArch64ISD::SDOT)
    MAKE_CASE(AArch64ISD::UDOT)
    MAKE_CASE(AArch64ISD::SMINV)
    MAKE_CASE(AArch64ISD::UMINV)
    MAKE_CASE(AArch64ISD::SMAXV)
    MAKE_CASE(AArch64ISD::UMAXV)
    MAKE_CASE(AArch64ISD::SADDV_PRED)
    MAKE_CASE(AArch64ISD::UADDV_PRED)
    MAKE_CASE(AArch64ISD::SMAXV_PRED)
    MAKE_CASE(AArch64ISD::UMAXV_PRED)
    MAKE_CASE(AArch64ISD::SMINV_PRED)
    MAKE_CASE(AArch64ISD::UMINV_PRED)
    MAKE_CASE(AArch64ISD::ORV_PRED)
    MAKE_CASE(AArch64ISD::EORV_PRED)
    MAKE_CASE(AArch64ISD::ANDV_PRED)
    MAKE_CASE(AArch64ISD::CLASTA_N)
    MAKE_CASE(AArch64ISD::CLASTB_N)
    MAKE_CASE(AArch64ISD::LASTA)
    MAKE_CASE(AArch64ISD::LASTB)
    MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
    MAKE_CASE(AArch64ISD::LS64_BUILD)
    MAKE_CASE(AArch64ISD::LS64_EXTRACT)
    MAKE_CASE(AArch64ISD::TBL)
    MAKE_CASE(AArch64ISD::FADD_PRED)
    MAKE_CASE(AArch64ISD::FADDA_PRED)
    MAKE_CASE(AArch64ISD::FADDV_PRED)
    MAKE_CASE(AArch64ISD::FDIV_PRED)
    MAKE_CASE(AArch64ISD::FMA_PRED)
    MAKE_CASE(AArch64ISD::FMAX_PRED)
    MAKE_CASE(AArch64ISD::FMAXV_PRED)
    MAKE_CASE(AArch64ISD::FMAXNM_PRED)
    MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
    MAKE_CASE(AArch64ISD::FMIN_PRED)
    MAKE_CASE(AArch64ISD::FMINV_PRED)
    MAKE_CASE(AArch64ISD::FMINNM_PRED)
    MAKE_CASE(AArch64ISD::FMINNMV_PRED)
    MAKE_CASE(AArch64ISD::FMUL_PRED)
    MAKE_CASE(AArch64ISD::FSUB_PRED)
    MAKE_CASE(AArch64ISD::RDSVL)
    MAKE_CASE(AArch64ISD::BIC)
    MAKE_CASE(AArch64ISD::BIT)
    MAKE_CASE(AArch64ISD::CBZ)
    MAKE_CASE(AArch64ISD::CBNZ)
    MAKE_CASE(AArch64ISD::TBZ)
    MAKE_CASE(AArch64ISD::TBNZ)
    MAKE_CASE(AArch64ISD::TC_RETURN)
    MAKE_CASE(AArch64ISD::PREFETCH)
    MAKE_CASE(AArch64ISD::SITOF)
    MAKE_CASE(AArch64ISD::UITOF)
    MAKE_CASE(AArch64ISD::NVCAST)
    MAKE_CASE(AArch64ISD::MRS)
    MAKE_CASE(AArch64ISD::SQSHL_I)
    MAKE_CASE(AArch64ISD::UQSHL_I)
    MAKE_CASE(AArch64ISD::SRSHR_I)
    MAKE_CASE(AArch64ISD::URSHR_I)
    MAKE_CASE(AArch64ISD::SQSHLU_I)
    MAKE_CASE(AArch64ISD::WrapperLarge)
    MAKE_CASE(AArch64ISD::LD2post)
    MAKE_CASE(AArch64ISD::LD3post)
    MAKE_CASE(AArch64ISD::LD4post)
    MAKE_CASE(AArch64ISD::ST2post)
    MAKE_CASE(AArch64ISD::ST3post)
    MAKE_CASE(AArch64ISD::ST4post)
    MAKE_CASE(AArch64ISD::LD1x2post)
    MAKE_CASE(AArch64ISD::LD1x3post)
    MAKE_CASE(AArch64ISD::LD1x4post)
    MAKE_CASE(AArch64ISD::ST1x2post)
    MAKE_CASE(AArch64ISD::ST1x3post)
    MAKE_CASE(AArch64ISD::ST1x4post)
    MAKE_CASE(AArch64ISD::LD1DUPpost)
    MAKE_CASE(AArch64ISD::LD2DUPpost)
    MAKE_CASE(AArch64ISD::LD3DUPpost)
    MAKE_CASE(AArch64ISD::LD4DUPpost)
    MAKE_CASE(AArch64ISD::LD1LANEpost)
    MAKE_CASE(AArch64ISD::LD2LANEpost)
    MAKE_CASE(AArch64ISD::LD3LANEpost)
    MAKE_CASE(AArch64ISD::LD4LANEpost)
    MAKE_CASE(AArch64ISD::ST2LANEpost)
    MAKE_CASE(AArch64ISD::ST3LANEpost)
    MAKE_CASE(AArch64ISD::ST4LANEpost)
    MAKE_CASE(AArch64ISD::SMULL)
    MAKE_CASE(AArch64ISD::UMULL)
    MAKE_CASE(AArch64ISD::PMULL)
    MAKE_CASE(AArch64ISD::FRECPE)
    MAKE_CASE(AArch64ISD::FRECPS)
    MAKE_CASE(AArch64ISD::FRSQRTE)
    MAKE_CASE(AArch64ISD::FRSQRTS)
    MAKE_CASE(AArch64ISD::STG)
    MAKE_CASE(AArch64ISD::STZG)
    MAKE_CASE(AArch64ISD::ST2G)
    MAKE_CASE(AArch64ISD::STZ2G)
    MAKE_CASE(AArch64ISD::SUNPKHI)
    MAKE_CASE(AArch64ISD::SUNPKLO)
    MAKE_CASE(AArch64ISD::UUNPKHI)
    MAKE_CASE(AArch64ISD::UUNPKLO)
    MAKE_CASE(AArch64ISD::INSR)
    MAKE_CASE(AArch64ISD::PTEST)
    MAKE_CASE(AArch64ISD::PTRUE)
    MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
    MAKE_CASE(AArch64ISD::ST1_PRED)
    MAKE_CASE(AArch64ISD::SST1_PRED)
    MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
    MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
    MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
    MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
    MAKE_CASE(AArch64ISD::SSTNT1_PRED)
    MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
    MAKE_CASE(AArch64ISD::LDP)
    MAKE_CASE(AArch64ISD::LDNP)
    MAKE_CASE(AArch64ISD::STP)
    MAKE_CASE(AArch64ISD::STNP)
    MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
    MAKE_CASE(AArch64ISD::INDEX_VECTOR)
    MAKE_CASE(AArch64ISD::ADDP)
    MAKE_CASE(AArch64ISD::SADDLP)
    MAKE_CASE(AArch64ISD::UADDLP)
    MAKE_CASE(AArch64ISD::CALL_RVMARKER)
    MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
    MAKE_CASE(AArch64ISD::MOPS_MEMSET)
    MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
    MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
    MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
    MAKE_CASE(AArch64ISD::CALL_BTI)
  }
#undef MAKE_CASE
  return nullptr;
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:
  //
  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction::iterator It = ++MBB->getIterator();

  Register DestReg = MI.getOperand(0).getReg();
  Register IfTrueReg = MI.getOperand(1).getReg();
  Register IfFalseReg = MI.getOperand(2).getReg();
  unsigned CondCode = MI.getOperand(3).getImm();
  bool NZCVKilled = MI.getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI.eraseFromParent();
  return EndBB;
}
MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
             BB->getParent()->getFunction().getPersonalityFn())) &&
         "SEH does not use catchret!");
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
                                    MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.add(MI.getOperand(1)); // slice index register
  MIB.add(MI.getOperand(2)); // slice index offset
  MIB.add(MI.getOperand(3)); // pg
  MIB.add(MI.getOperand(4)); // base
  MIB.add(MI.getOperand(5)); // offset

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));

  MIB.addReg(AArch64::ZA, RegState::Define);
  MIB.add(MI.getOperand(0)); // Vector select register
  MIB.add(MI.getOperand(1)); // Vector select offset
  MIB.add(MI.getOperand(2)); // Base
  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
                                MachineInstr &MI, MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
  MIB.add(MI.getOperand(1)); // pn
  MIB.add(MI.getOperand(2)); // pm
  MIB.add(MI.getOperand(3)); // zn
  MIB.add(MI.getOperand(4)); // zm

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
                                              MachineInstr &MI,
                                              MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
  MIB.add(MI.getOperand(1)); // Slice index register
  MIB.add(MI.getOperand(2)); // Slice index offset
  MIB.add(MI.getOperand(3)); // pg
  MIB.add(MI.getOperand(4)); // zn

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
  MIB.add(MI.getOperand(0)); // Mask

  unsigned Mask = MI.getOperand(0).getImm();
  for (unsigned I = 0; I < 8; I++) {
    if (Mask & (1 << I))
      MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
  }

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
                                           MachineInstr &MI,
                                           MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));

  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
  MIB.add(MI.getOperand(1)); // pn
  MIB.add(MI.getOperand(2)); // pm
  MIB.add(MI.getOperand(3)); // zn

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *BB) const {
  switch (MI.getOpcode()) {
  default:
#ifndef NDEBUG
    MI.dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);
  case TargetOpcode::STATEPOINT:
    // STATEPOINT is a pseudo instruction which has no implicit defs/uses
    // while the bl call instruction (where the statepoint will be lowered at
    // the end) has an implicit def. This def is early-clobber as it will be
    // set at the moment of the call and earlier than any use is read.
    // Add this implicit dead def here as a workaround.
    MI.addOperand(*MI.getMF(),
                  MachineOperand::CreateReg(
                      AArch64::LR, /*isDef*/ true,
                      /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
                      /*isUndef*/ false, /*isEarlyClobber*/ true));
    [[fallthrough]];
  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case AArch64::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
  case AArch64::LDR_ZA_PSEUDO:
    return EmitFill(MI, BB);
  case AArch64::BFMOPA_MPPZZ_PSEUDO:
    return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
  case AArch64::BFMOPS_MPPZZ_PSEUDO:
    return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
  case AArch64::FMOPAL_MPPZZ_PSEUDO:
    return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
  case AArch64::FMOPSL_MPPZZ_PSEUDO:
    return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
  case AArch64::FMOPA_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::FMOPS_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::FMOPA_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::FMOPS_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::SMOPA_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::SMOPS_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::UMOPA_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::UMOPS_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::USMOPA_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::USMOPS_MPPZZ_S_PSEUDO:
    return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::SMOPA_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::SMOPS_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::UMOPA_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::UMOPS_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::USMOPA_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::USMOPS_MPPZZ_D_PSEUDO:
    return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
                                  BB);
  case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
                                  BB);
  case AArch64::ZERO_M_PSEUDO:
    return EmitZero(MI, BB);
  case AArch64::ADDHA_MPPZ_PSEUDO_S:
    return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::ADDVA_MPPZ_PSEUDO_S:
    return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
  case AArch64::ADDHA_MPPZ_PSEUDO_D:
    return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
  case AArch64::ADDVA_MPPZ_PSEUDO_D:
    return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
  }
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

// Forward declarations of SVE fixed length lowering helpers
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
                                                SelectionDAG &DAG);
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
                                             EVT VT);

/// isZerosVector - Check whether SDNode N is a zero-filled vector.
static bool isZerosVector(const SDNode *N) {
  // Look through a bit convert.
  while (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  if (ISD::isConstantSplatVectorAllZeros(N))
    return true;

  if (N->getOpcode() != AArch64ISD::DUP)
    return false;

  auto Opnd0 = N->getOperand(0);
  return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
}
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case ISD::SETONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case ISD::SETUEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true;
    LLVM_FALLTHROUGH;
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
                          CondCode, CondCode2);
    break;
  }
}
static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
  LLVM_DEBUG(dbgs() << "Is imm " << C
                    << " legal: " << (IsLegal ? "yes\n" : "no\n"));
  return IsLegal;
}
// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
//
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
static bool isCMN(SDValue Op, ISD::CondCode CC) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
         (CC == ISD::SETEQ || CC == ISD::SETNE);
}
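
// Illustrative example: "cmp w0, w1" with w1 == 0 - w2 produces the same
// result value as "cmn w0, w2", so Z (and therefore EQ/NE) always agrees, but
// C and V may differ (e.g. when w2 is 0), which is why only SETEQ/SETNE are
// accepted above.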
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
                                      SelectionDAG &DAG, SDValue Chain,
                                      bool IsSignaling) {
  EVT VT = LHS.getValueType();
  assert(VT != MVT::f128);

  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (VT == MVT::f16 && !FullFP16) {
    LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {Chain, LHS});
    RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {LHS.getValue(1), RHS});
    Chain = RHS.getValue(1);
    VT = MVT::f32;
  }
  unsigned Opcode =
      IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              const SDLoc &dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (VT.isFloatingPoint()) {
    assert(VT != MVT::f128);
    if (VT == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
      VT = MVT::f32;
    }
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
  }

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (isCMN(RHS, CC)) {
    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (isCMN(LHS, CC)) {
    // As we are looking for EQ/NE compares, the operands can be commuted ; can
    // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
    Opcode = AArch64ISD::ADDS;
    LHS = LHS.getOperand(1);
  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
    if (LHS.getOpcode() == ISD::AND) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
      // of the signed comparisons.
      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
                                           DAG.getVTList(VT, MVT_CC),
                                           LHS.getOperand(0),
                                           LHS.getOperand(1));
      // Replace all users of (and X, Y) with newly generated (ands X, Y)
      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
      return ANDSNode.getValue(1);
    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
      // Use result of ANDS
      return LHS.getValue(1);
    }
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
      .getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" expressed as:
///   cmp A
///   ccmp B, inv(CB), CA
///   check for CB flags
///
/// This naturally lets us implement chains of AND operations with SETCC
/// operands. And we can even implement some other situations by transforming
/// them:
///   - We can implement (NEG SETCC) i.e. negating a single comparison by
///     negating the flags used in a CCMP/FCCMP operations.
///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
///     by negating the flags we test for afterwards. i.e.
///     NEG (CMP CCMP CCCMP ...) can be implemented.
///   - Note that we can only ever negate all previously processed results.
///     What we can not implement by flipping the flags to test is a negation
///     of two sub-trees (because the negation affects all sub-trees emitted so
///     far, so the 2nd sub-tree we emit would also affect the first).
/// With those tools we can implement some OR operations:
///   - (OR (SETCC A) (SETCC B)) can be implemented via:
///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
///   - After transforming OR to NEG/AND combinations we may be able to use NEG
///     elimination rules from earlier to implement the whole thing as a
///     CCMP/FCCMP chain.
///
/// As complete example:
///   or (or (setCA (cmp A)) (setCB (cmp B)))
///      (and (setCC (cmp C)) (setCD (cmp D)))
/// can be reassociated to:
///   or (and (setCC (cmp C)) (setCD (cmp D)))
///      (or (setCA (cmp A)) (setCB (cmp B)))
/// and can be transformed to:
///   not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
///            (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
/// which can be implemented as:
///   cmp C
///   ccmp D, inv(CD), CC
///   ccmp A, CA, inv(CD)
///   ccmp B, CB, inv(CA)
///   check for CB flags
///
/// A counterexample is "or (and A B) (and C D)" which translates to
/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
/// can only implement 1 of the inner (not) operations, but not both!
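///
/// Worked example (illustrative, not taken from an existing test): for the C
/// expression "a == 0 && b > 5" with a in w0 and b in w1 the conjunction can
/// be emitted as:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   ; if a == 0, compare b with 5; else set only Z,
///                         ; which makes the final "gt" test fail
///   cset w2, gt
/// where #4 is the NZCV immediate with just the Z bit set.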
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue CCOp,
                                         AArch64CC::CondCode Predicate,
                                         AArch64CC::CondCode OutCC,
                                         const SDLoc &DL, SelectionDAG &DAG) {
  unsigned Opcode = 0;
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  if (LHS.getValueType().isFloatingPoint()) {
    assert(LHS.getValueType() != MVT::f128);
    if (LHS.getValueType() == MVT::f16 && !FullFP16) {
      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
    }
    Opcode = AArch64ISD::FCCMP;
  } else if (RHS.getOpcode() == ISD::SUB) {
    SDValue SubOp0 = RHS.getOperand(0);
    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // See emitComparison() on why we can only do this for SETEQ and SETNE.
      Opcode = AArch64ISD::CCMN;
      RHS = RHS.getOperand(1);
    }
  }
  if (Opcode == 0)
    Opcode = AArch64ISD::CCMP;

  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
/// expressed as a conjunction. See \ref AArch64CCMP.
/// \param CanNegate    Set to true if we can negate the whole sub-tree just by
///                     changing the conditions on the SETCC tests.
///                     (this means we can call emitConjunctionRec() with
///                      Negate==true on this sub-tree)
/// \param MustBeFirst  Set to true if this subtree needs to be negated and we
///                     cannot do the negation naturally. We are required to
///                     emit the subtree first in this case.
/// \param WillNegate   Is true if are called when the result of this
///                     subexpression must be negated. This happens when the
///                     outer expression is an OR. We can use this fact to know
///                     that we have a double negation (or (or ...) ...) that
///                     can be implemented for free.
static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                               bool &MustBeFirst, bool WillNegate,
                               unsigned Depth = 0) {
  if (!Val.hasOneUse())
    return false;
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    if (Val->getOperand(0).getValueType() == MVT::f128)
      return false;
    CanNegate = true;
    MustBeFirst = false;
    return true;
  }
  // Protect against exponential runtime and stack overflow.
  if (Depth > 6)
    return false;
  if (Opcode == ISD::AND || Opcode == ISD::OR) {
    bool IsOR = Opcode == ISD::OR;
    SDValue O0 = Val->getOperand(0);
    SDValue O1 = Val->getOperand(1);
    bool CanNegateL;
    bool MustBeFirstL;
    if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
      return false;
    bool CanNegateR;
    bool MustBeFirstR;
    if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
      return false;

    if (MustBeFirstL && MustBeFirstR)
      return false;

    if (IsOR) {
      // For an OR expression we need to be able to naturally negate at least
      // one side or we cannot do the transformation at all.
      if (!CanNegateL && !CanNegateR)
        return false;
      // If we the result of the OR will be negated and we can naturally negate
      // the leafs, then this sub-tree as a whole negates naturally.
      CanNegate = WillNegate && CanNegateL && CanNegateR;
      // If we cannot naturally negate the whole sub-tree, then this must be
      // MustBeFirst.
      MustBeFirst = !CanNegate;
    } else {
      assert(Opcode == ISD::AND && "Must be OR or AND");
      // We cannot naturally negate an AND operation.
      CanNegate = false;
      MustBeFirst = MustBeFirstL || MustBeFirstR;
    }
    return true;
  }
  return false;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
                                  AArch64CC::CondCode &OutCC, bool Negate,
                                  SDValue CCOp, AArch64CC::CondCode Predicate) {
  // We're at a tree leaf, produce a conditional comparison operation.
  unsigned Opcode = Val->getOpcode();
  if (Opcode == ISD::SETCC) {
    SDValue LHS = Val->getOperand(0);
    SDValue RHS = Val->getOperand(1);
    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
    bool isInteger = LHS.getValueType().isInteger();
    if (Negate)
      CC = getSetCCInverse(CC, LHS.getValueType());
    SDLoc DL(Val);
    // Determine OutCC and handle FP special case.
    if (isInteger) {
      OutCC = changeIntCCToAArch64CC(CC);
    } else {
      assert(LHS.getValueType().isFloatingPoint());
      AArch64CC::CondCode ExtraCC;
      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
      // Some floating point conditions can't be tested with a single condition
      // code. Construct an additional comparison in this case.
      if (ExtraCC != AArch64CC::AL) {
        SDValue ExtraCmp;
        if (!CCOp.getNode())
          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
        else
          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                               ExtraCC, DL, DAG);
        CCOp = ExtraCmp;
        Predicate = ExtraCC;
      }
    }

    // Produce a normal comparison if we are first in the chain
    if (!CCOp)
      return emitComparison(LHS, RHS, CC, DL, DAG);
    // Otherwise produce a ccmp.
    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                     DAG);
  }
  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");

  bool IsOR = Opcode == ISD::OR;

  SDValue LHS = Val->getOperand(0);
  bool CanNegateL;
  bool MustBeFirstL;
  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
  assert(ValidL && "Valid conjunction/disjunction tree");
  (void)ValidL;

  SDValue RHS = Val->getOperand(1);
  bool CanNegateR;
  bool MustBeFirstR;
  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
  assert(ValidR && "Valid conjunction/disjunction tree");
  (void)ValidR;

  // Swap sub-tree that must come first to the right side.
  if (MustBeFirstL) {
    assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
    std::swap(LHS, RHS);
    std::swap(CanNegateL, CanNegateR);
    std::swap(MustBeFirstL, MustBeFirstR);
  }

  bool NegateR;
  bool NegateAfterR;
  bool NegateL;
  bool NegateAfterAll;
  if (Opcode == ISD::OR) {
    // Swap the sub-tree that we can negate naturally to the left.
    if (!CanNegateL) {
      assert(CanNegateR && "at least one side must be negatable");
      assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
      std::swap(LHS, RHS);
      NegateR = false;
      NegateAfterR = true;
    } else {
      // Negate the left sub-tree if possible, otherwise negate the result.
      NegateR = CanNegateR;
      NegateAfterR = !CanNegateR;
    }
    NegateL = true;
    NegateAfterAll = !Negate;
  } else {
    assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
    assert(!Negate && "Valid conjunction/disjunction tree");
    NegateL = false;
    NegateR = false;
    NegateAfterR = false;
    NegateAfterAll = false;
  }

  // Emit sub-trees.
  AArch64CC::CondCode RHSCC;
  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
  if (NegateAfterR)
    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
  if (NegateAfterAll)
    OutCC = AArch64CC::getInvertedCondCode(OutCC);
  return CmpL;
}
/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
/// In some cases this is even possible with OR operations in the expression.
/// See \ref AArch64CCMP.
/// \see emitConjunctionRec().
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                               AArch64CC::CondCode &OutCC) {
  bool DummyCanNegate;
  bool DummyMustBeFirst;
  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
    return SDValue();

  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
}
/// Returns how profitable it is to fold a comparison's operand's shift and/or
/// extension operations.
static unsigned getCmpOperandFoldingProfit(SDValue Op) {
  auto isSupportedExtend = [&](SDValue V) {
    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
      return true;

    if (V.getOpcode() == ISD::AND)
      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
        uint64_t Mask = MaskCst->getZExtValue();
        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
      }

    return false;
  };

  if (!Op.hasOneUse())
    return 0;

  if (isSupportedExtend(Op))
    return 1;

  unsigned Opc = Op.getOpcode();
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      uint64_t Shift = ShiftCst->getZExtValue();
      if (isSupportedExtend(Op.getOperand(0)))
        return (Shift <= 4) ? 2 : 1;
      EVT VT = Op.getValueType();
      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
        return 1;
    }

  return 0;
}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG,
                             const SDLoc &dl) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if ((VT == MVT::i32 && C != 0x80000000 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0x80000000ULL &&
             isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if ((VT == MVT::i32 && C != 0 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if ((VT == MVT::i32 && C != INT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != INT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if ((VT == MVT::i32 && C != UINT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != UINT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, dl, VT);
        }
        break;
      }
    }
  }

  // Comparisons are canonicalized so that the RHS operand is simpler than the
  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
  // can fold some shift+extend operations on the RHS operand, so swap the
  // operands if that can be done.
  //
  // For example:
  //    lsl w13, w11, #1
  //    cmp w13, w12
  // can be turned into:
  //    cmp w12, w11, lsl #1
  if (!isa<ConstantSDNode>(RHS) ||
      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;

    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }
  }

  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
    // For the i8 operand, the largest immediate is 255, so this can be easily
    // encoded in the compare instruction. For the i16 operand, however, the
    // largest immediate cannot be encoded in the compare.
    // Therefore, use a sign extending load and cmn to avoid materializing the
    // -1 constant. For example,
    //   movz w1, #65535
    //   ldrh w0, [x0, #0]
    //   cmp w0, w1
    // >
    //   ldrsh w0, [x0, #0]
    //   cmn w0, #1
    // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
    // if and only if (sext LHS) == (sext RHS). The checks are in place to
    // ensure both the LHS and RHS are truly zero extended and to make sure the
    // transformation is profitable.
    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
        LHS.getNode()->hasNUsesOfValue(1, 0)) {
      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
        SDValue SExt =
            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                        DAG.getValueType(MVT::i16));
        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
                                                   RHS.getValueType()),
                             CC, dl, DAG);
        AArch64CC = changeIntCCToAArch64CC(CC);
      }
    }

    if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
        if ((CC == ISD::SETNE) ^ RHSC->isZero())
          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
      }
    }
  }

  if (!Cmp) {
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
    AArch64CC = changeIntCCToAArch64CC(CC);
  }
  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
  return Cmp;
}
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
         "Unsupported value type");
  SDValue Value, Overflow;
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned Opc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::VS;
    break;
  case ISD::UADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::HS;
    break;
  case ISD::SSUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::VS;
    break;
  case ISD::USUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::LO;
    break;
  // Multiply needs a little bit extra work.
  case ISD::SMULO:
  case ISD::UMULO: {
    CC = AArch64CC::NE;
    bool IsSigned = Op.getOpcode() == ISD::SMULO;
    if (Op.getValueType() == MVT::i32) {
      // Extend to 64-bits, then perform a 64-bit multiply.
      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);

      // Check that the result fits into a 32-bit integer.
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
      if (IsSigned) {
        // cmp xreg, wreg, sxtw
        SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
        Overflow =
            DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
      } else {
        // tst xreg, #0xffffffff00000000
        SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
        Overflow =
            DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
      }
      break;
    }
    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64 bit multiply
    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
    if (IsSigned) {
      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                      DAG.getConstant(63, DL, MVT::i64));
      // It is important that LowerBits is last, otherwise the arithmetic
      // shift will not be folded into the compare (SUBS).
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                     .getValue(1);
    } else {
      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow =
          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                      DAG.getConstant(0, DL, MVT::i64),
                      UpperBits).getValue(1);
    }
    break;
  }
  } // switch (...)

  if (Opc) {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

    // Emit the AArch64 operation with overflow check.
    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}
SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
    return LowerToScalableOp(Op, DAG);

  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);
  SDLoc dl(Sel);

  // If the operand is an overflow checking operation, invert the condition
  // code and kill the Not operation. I.e., transform:
  // (xor (overflow_op_bool, 1))
  //   -->
  // (csel 1, 0, invert(cc), overflow_op_bool)
  // ... which later gets transformed to just a cset instruction with an
  // inverted condition code, rather than a cset + eor sequence.
  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
      return SDValue();

    SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
    SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
    AArch64CC::CondCode CC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }
  // If neither operand is a SELECT_CC, give up.
  if (Sel.getOpcode() != ISD::SELECT_CC)
    std::swap(Sel, Other);
  if (Sel.getOpcode() != ISD::SELECT_CC)
    return Op;

  // The folding we want to perform is:
  // (xor x, (select_cc a, b, cc, 0, -1) )
  //   -->
  // (csel x, (xor x, -1), cc ...)
  //
  // The latter will get matched to a CSINV instruction.
  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue LHS = Sel.getOperand(0);
  SDValue RHS = Sel.getOperand(1);
  SDValue TVal = Sel.getOperand(2);
  SDValue FVal = Sel.getOperand(3);

  // FIXME: This could be generalized to non-integer comparisons.
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return Op;

  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

  // The values aren't constants, this isn't the pattern we're looking for.
  if (!CFVal || !CTVal)
    return Op;

  // We can commute the SELECT_CC by inverting the condition. This
  // might be needed to make this fit into a CSINV pattern.
  if (CTVal->isAllOnes() && CFVal->isZero()) {
    std::swap(TVal, FVal);
    std::swap(CTVal, CFVal);
    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
  }

  // If the constants line up, perform the transform!
  if (CTVal->isZero() && CFVal->isAllOnes()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    FVal = Other;
    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
                       CCVal, Cmp);
  }

  return Op;
}
// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
// sets 'C' bit to 0.
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
  SDLoc DL(Value);
  EVT VT = Value.getValueType();
  SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
  SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
  SDValue Cmp =
      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
  return Cmp.getValue(1);
}

// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
                                bool Invert) {
  assert(Flag.getResNo() == 1);
  SDLoc DL(Flag);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
  SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
}

// Value is 1 if 'V' bit of NZCV is 1, else 0
static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
  assert(Flag.getResNo() == 1);
  SDLoc DL(Flag);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);
  SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
}
// This lowering is inefficient, but it will get cleaned up by
// `foldOverflowCheck`
static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
                                bool IsSigned) {
  EVT VT0 = Op.getValue(0).getValueType();
  EVT VT1 = Op.getValue(1).getValueType();

  if (VT0 != MVT::i32 && VT0 != MVT::i64)
    return SDValue();

  bool InvertCarry = Opcode == AArch64ISD::SBCS;
  SDValue OpLHS = Op.getOperand(0);
  SDValue OpRHS = Op.getOperand(1);
  SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);

  SDLoc DL(Op);
  SDVTList VTs = DAG.getVTList(VT0, VT1);

  SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
                            OpRHS, OpCarryIn);

  SDValue OutFlag =
      IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
               : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);

  return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDLoc dl(Op);
  AArch64CC::CondCode CC;
  // The actual operation that sets the overflow or carry flag.
  SDValue Value, Overflow;
  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);

  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);

  // We use an inverted condition, because the conditional select is inverted
  // too. This will allow it to be selected to a single instruction:
  // CSINC Wd, WZR, WZR, invert(cond).
  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
                         CCVal, Overflow);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  bool IsStream = !Locality;
  // When the locality number is set
  if (Locality) {
    // The front-end should have filtered out the out-of-range values
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1
    Locality = 3 - Locality;
  }

  // built the mask value encoding the expected behavior.
  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
                   (!IsData << 3) |     // IsDataCache bit
                   (Locality << 1) |    // Cache level bits
                   (unsigned)IsStream;  // Stream bit
  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                     DAG.getTargetConstant(PrfOp, DL, MVT::i32),
                     Op.getOperand(1));
}
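
// Worked example (illustrative): a data read prefetch with maximal locality
// (isWrite = 0, locality = 3, isDataCache = 1) gives IsStream = 0 and
// Locality = 3 - 3 = 0, so PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0,
// i.e. the PLDL1KEEP hint.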
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);

  if (useSVEForFixedLengthVectorVT(VT))
    return LowerFixedLengthFPExtendToSVE(Op, DAG);

  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
  return SDValue();
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                             SelectionDAG &DAG) const {
  if (Op.getValueType().isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = SrcVal.getValueType();

  if (useSVEForFixedLengthVectorVT(SrcVT))
    return LowerFixedLengthFPRoundToSVE(Op, DAG);

  if (SrcVT != MVT::f128) {
    // Expand cases where the input is a vector bigger than NEON.
    if (useSVEForFixedLengthVectorVT(SrcVT))
      return SDValue();

    // It's legal except when f128 is involved
    return Op;
  }

  return SDValue();
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
  EVT VT = Op.getValueType();

  if (VT.isScalableVector()) {
    unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                          ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                          : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
    return LowerFixedLengthFPToIntToSVE(Op, DAG);

  unsigned NumElts = InVT.getVectorNumElements();

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (InVT.getVectorElementType() == MVT::f16 &&
      !Subtarget->hasFullFP16()) {
    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), dl, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
  }

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    SDLoc dl(Op);
    if (IsStrict) {
      InVT = InVT.changeVectorElementTypeToInteger();
      SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
                               {Op.getOperand(0), Op.getOperand(1)});
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
      return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
    }
    SDValue Cv =
        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
                    Op.getOperand(0));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
  }

  if (VTSize > InVTSize) {
    SDLoc dl(Op);
    MVT ExtVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
                         VT.getVectorNumElements());
    if (IsStrict) {
      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
                                {Op.getOperand(0), Op.getOperand(1)});
      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (NumElts == 1) {
    SDLoc dl(Op);
    SDValue Extract = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
        Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  }

  // Type changing conversions are illegal.
  return Op;
}
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                              SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (SrcVal.getValueType().isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Ext =
          DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
                      {Op.getOperand(0), SrcVal});
      return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
                         {Ext.getValue(1), Ext.getValue(0)});
    }
    return DAG.getNode(
        Op.getOpcode(), dl, Op.getValueType(),
        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
  }

  if (SrcVal.getValueType() != MVT::f128) {
    // It's legal except when f128 is involved
    return Op;
  }

  return SDValue();
}
SDValue
AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
                                                SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination element size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();
  EVT DstVT = Op.getValueType();
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();

  uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
  uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstElementWidth &&
         "Saturation width cannot exceed result width");

  // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
  // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
  // types, so this is hard to reach.
  if (DstVT.isScalableVector())
    return SDValue();

  EVT SrcElementVT = SrcVT.getVectorElementType();

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  if (SrcElementVT == MVT::f16 &&
      (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
    MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
    SrcVT = F32VT;
    SrcElementVT = MVT::f32;
    SrcElementWidth = 32;
  } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
             SrcElementVT != MVT::f16)
    return SDValue();

  SDLoc DL(Op);
  // Cases that we can emit directly.
  if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                       DAG.getValueType(DstVT.getScalarType()));

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
  // (at least until sqxtn is selected).
  if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
    return SDValue();

  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
  SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
                                  DAG.getValueType(IntVT.getScalarType()));
  SDValue Sat;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
  } else {
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
    Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
  }

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // AArch64 FP-to-int conversions saturate to the destination register size, so
  // we can lower common saturating conversions to simple instructions.
  SDValue SrcVal = Op.getOperand(0);
  EVT SrcVT = SrcVal.getValueType();

  if (SrcVT.isVector())
    return LowerVectorFP_TO_INT_SAT(Op, DAG);

  EVT DstVT = Op.getValueType();
  EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  uint64_t DstWidth = DstVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  // In the absence of FP16 support, promote f16 to f32 and saturate the result.
  if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
    SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
    SrcVT = MVT::f32;
  } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
    return SDValue();

  SDLoc DL(Op);
  // Cases that we can emit directly.
  if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
       (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
      DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
    return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
                       DAG.getValueType(DstVT));

  // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
  // result. This is only valid if the legal cvt is larger than the saturate
  // width.
  if (DstWidth < SatWidth)
    return SDValue();

  SDValue NativeCvt =
      DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
  SDValue Sat;
  if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
    SDValue MinC = DAG.getConstant(
        APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
    SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
    SDValue MaxC = DAG.getConstant(
        APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
    Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
  } else {
    SDValue MinC = DAG.getConstant(
        APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
    Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
  }

  return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
}
SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
  // Any additional optimization in this function should be recorded
  // in the cost tables.
  bool IsStrict = Op->isStrictFPOpcode();
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue In = Op.getOperand(IsStrict ? 1 : 0);
  EVT InVT = In.getValueType();
  unsigned Opc = Op.getOpcode();
  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;

  if (VT.isScalableVector()) {
    if (InVT.getVectorElementType() == MVT::i1) {
      // We can't directly extend an SVE predicate; extend it first.
      unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      EVT CastVT = getPromotedVTForPredicate(InVT);
      In = DAG.getNode(CastOpc, dl, CastVT, In);
      return DAG.getNode(Opc, dl, VT, In);
    }

    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
    return LowerToPredicatedOp(Op, DAG, Opcode);
  }

  if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
    return LowerFixedLengthIntToFPToSVE(Op, DAG);

  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVTSize = InVT.getFixedSizeInBits();
  if (VTSize < InVTSize) {
    MVT CastVT =
        MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
                         InVT.getVectorNumElements());
    if (IsStrict) {
      In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
                       {Op.getOperand(0), In});
      return DAG.getNode(
          ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
          {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
    }
    In = DAG.getNode(Opc, dl, CastVT, In);
    return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  }

  if (VTSize > InVTSize) {
    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    EVT CastVT = VT.changeVectorElementTypeToInteger();
    In = DAG.getNode(CastOpc, dl, CastVT, In);
    if (IsStrict)
      return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
    return DAG.getNode(Opc, dl, VT, In);
  }

  // Use a scalar operation for conversions between single-element vectors of
  // the same size.
  if (VT.getVectorNumElements() == 1) {
    SDValue Extract = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
        In, DAG.getConstant(0, dl, MVT::i64));
    EVT ScalarVT = VT.getScalarType();
    if (IsStrict)
      return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
                         {Op.getOperand(0), Extract});
    return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
  }

  return Op;
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorINT_TO_FP(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  // f16 conversions are promoted to f32 when full fp16 is not supported.
  if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    SDLoc dl(Op);
    if (IsStrict) {
      SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
                                {Op.getOperand(0), SrcVal});
      return DAG.getNode(
          ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
          {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
    }
    return DAG.getNode(
        ISD::FP_ROUND, dl, MVT::f16,
        DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
        DAG.getIntPtrConstant(0, dl));
  }

  // i128 conversions are libcalls.
  if (SrcVal.getValueType() == MVT::i128)
    return SDValue();

  // Other conversions are legal, unless it's to the completely software-based
  // fp128.
  if (Op.getValueType() != MVT::f128)
    return Op;
  return SDValue();
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
                                            SelectionDAG &DAG) const {
  // For iOS, we want to call an alternative entry point: __sincos_stret,
  // which returns the values in two S / D registers.
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  ArgListTy Args;
  ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
                                        : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = getLibcallName(LC);
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

  StructType *RetTy = StructType::get(ArgTy, ArgTy);
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.first;
}
static MVT getSVEContainerType(EVT ContentTy);

SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
                                            SelectionDAG &DAG) const {
  EVT OpVT = Op.getValueType();
  EVT ArgVT = Op.getOperand(0).getValueType();

  if (useSVEForFixedLengthVectorVT(OpVT))
    return LowerFixedLengthBitcastToSVE(Op, DAG);

  if (OpVT.isScalableVector()) {
    // Bitcasting between unpacked vector types of different element counts is
    // not a NOP because the live elements are laid out differently.
    //                01234567
    // e.g. nxv2i32 = XX??XX??
    //      nxv4f16 = X?X?X?X?
    if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
      return SDValue();

    if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
      assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
             "Expected int->fp bitcast!");
      SDValue ExtResult =
          DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
                      Op.getOperand(0));
      return getSVESafeBitCast(OpVT, ExtResult, DAG);
    }
    return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
  }

  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
    return SDValue();

  // Bitcasts between f16 and bf16 are legal.
  if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
    return Op;

  assert(ArgVT == MVT::i16);
  SDLoc DL(Op);

  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
  return SDValue(
      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
      0);
}
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
  if (OrigVT.getSizeInBits() >= 64)
    return OrigVT;

  assert(OrigVT.isSimple() && "Expecting a simple value type");

  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
  switch (OrigSimpleTy) {
  default: llvm_unreachable("Unexpected Vector Type");
  case MVT::v2i8:
  case MVT::v2i16:
    return MVT::v2i32;
  case MVT::v4i8:
    return MVT::v4i16;
  }
}

static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                 const EVT &OrigTy,
                                                 const EVT &ExtTy,
                                                 unsigned ExtOpcode) {
  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
  // 64-bits we need to insert a new extension so that it will be 64-bits.
  assert(ExtTy.is128BitVector() && "Unexpected extension size");
  if (OrigTy.getSizeInBits() >= 64)
    return N;

  // Must extend size to at least 64 bits to be used as an operand for VMULL.
  EVT NewVT = getExtensionTo64Bits(OrigTy);

  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
// Returns lane if Op extracts from a two-element vector and lane is constant
// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and None otherwise.
static Optional<uint64_t> getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
  SDNode *OpNode = Op.getNode();
  if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return None;

  EVT VT = OpNode->getOperand(0).getValueType();
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
  if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
    return None;

  return C->getZExtValue();
}
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                   bool isSigned) {
  EVT VT = N->getValueType(0);

  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  for (const SDValue &Elt : N->op_values()) {
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
      unsigned EltSize = VT.getScalarSizeInBits();
      unsigned HalfSize = EltSize / 2;
      if (isSigned) {
        if (!isIntN(HalfSize, C->getSExtValue()))
          return false;
      } else {
        if (!isUIntN(HalfSize, C->getZExtValue()))
          return false;
      }
    } else {
      return false;
    }
  }

  return true;
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() == ISD::SIGN_EXTEND ||
      N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
                                             N->getOperand(0)->getValueType(0),
                                             N->getValueType(0),
                                             N->getOpcode());

  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  unsigned EltSize = VT.getScalarSizeInBits() / 2;
  unsigned NumElts = VT.getVectorNumElements();
  MVT TruncVT = MVT::getIntegerVT(EltSize);
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i != NumElts; ++i) {
    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
    const APInt &CInt = C->getAPIntValue();
    // Element types smaller than 32 bits are not legal, so use i32 elements.
    // The values are implicitly truncated so sext vs. zext doesn't matter.
    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
  }
  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::SIGN_EXTEND ||
         N->getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, true);
}

static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
  return N->getOpcode() == ISD::ZERO_EXTEND ||
         N->getOpcode() == ISD::ANY_EXTEND ||
         isExtendedBUILD_VECTOR(N, DAG, false);
}

static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
  }
  return false;
}

static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
    SDNode *N0 = N->getOperand(0).getNode();
    SDNode *N1 = N->getOperand(1).getNode();
    return N0->hasOneUse() && N1->hasOneUse() &&
           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
  }
  return false;
}
SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                                SelectionDAG &DAG) const {
  // The rounding mode is in bits 23:22 of the FPSCR.
  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
  // so that the shift + and get folded into a bitfield extract.
  SDLoc dl(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue FPCR_64 = DAG.getNode(
      ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
      {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
  Chain = FPCR_64.getValue(1);
  SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
                                  DAG.getConstant(1U << 22, dl, MVT::i32));
  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
                              DAG.getConstant(22, dl, MVT::i32));
  SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
                            DAG.getConstant(3, dl, MVT::i32));
  return DAG.getMergeValues({AND, Chain}, dl);
}
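
// Worked example (illustrative): with FPCR.RMode == 0b11 (round toward zero)
// the computation is ((0b11 << 22) + (1 << 22)) >> 22 = 4, and 4 & 3 = 0,
// which is the FLT_ROUNDS value for "toward zero".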
SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op->getOperand(0);
  SDValue RMValue = Op->getOperand(1);

  // The rounding mode is in bits 23:22 of the FPCR.
  // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
  // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
  // ((arg - 1) & 3) << 22.
  //
  // The argument of llvm.set.rounding must be within the segment [0, 3], so
  // NearestTiesToAway (4) is not handled here. It is responsibility of the code
  // generated llvm.set.rounding to ensure this condition.

  // Calculate new value of FPCR[23:22].
  RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
                        DAG.getConstant(1, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
                        DAG.getConstant(0x3, DL, MVT::i32));
  RMValue =
      DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
                  DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
  RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);

  // Get current value of FPCR.
  SDValue Ops[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
  SDValue FPCR =
      DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
  Chain = FPCR.getValue(1);
  FPCR = FPCR.getValue(0);

  // Put new rounding mode into FPSCR[23:22].
  const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
  FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
                     DAG.getConstant(RMMask, DL, MVT::i64));
  FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
  SDValue Ops2[] = {
      Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
      FPCR};
  return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
}
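
// Worked example (illustrative): llvm.set.rounding(1) selects round-to-nearest,
// so ((1 - 1) & 3) << 22 == 0 and FPCR.RMode is cleared to 0b00 (RN), while
// llvm.set.rounding(0) (toward zero) yields ((0 - 1) & 3) << 22 == 3 << 22,
// i.e. RMode 0b11 (RZ).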
static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
                                 bool &isMLA) {
  bool IsN0SExt = isSignExtended(N0, DAG);
  bool IsN1SExt = isSignExtended(N1, DAG);
  if (IsN0SExt && IsN1SExt)
    return AArch64ISD::SMULL;

  bool IsN0ZExt = isZeroExtended(N0, DAG);
  bool IsN1ZExt = isZeroExtended(N1, DAG);

  if (IsN0ZExt && IsN1ZExt)
    return AArch64ISD::UMULL;

  if (!IsN1SExt && !IsN1ZExt)
    return 0;
  // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
  // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
  if (IsN1SExt && isAddSubSExt(N0, DAG)) {
    isMLA = true;
    return AArch64ISD::SMULL;
  }
  if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
    isMLA = true;
    return AArch64ISD::UMULL;
  }
  if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
    std::swap(N0, N1);
    isMLA = true;
    return AArch64ISD::UMULL;
  }
  return 0;
}
SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // If SVE is available then i64 vector multiplications can also be made
  // legal.
  bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;

  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);

  // Multiplications are only custom-lowered for 128-bit vectors so that
  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
  assert(VT.is128BitVector() && VT.isInteger() &&
         "unexpected type for custom-lowering ISD::MUL");
  SDNode *N0 = Op.getOperand(0).getNode();
  SDNode *N1 = Op.getOperand(1).getNode();
  bool isMLA = false;
  unsigned NewOpc = selectUmullSmull(N0, N1, DAG, isMLA);

  if (!NewOpc) {
    if (VT == MVT::v2i64)
      // Fall through to expand this. It is not legal.
      return SDValue();
    // Other vector multiplications are legal.
    return Op;
  }

  // Legalize to a S/UMULL instruction
  SDLoc DL(Op);
  SDValue Op0;
  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
  if (!isMLA) {
    Op0 = skipExtensionForVectorMULL(N0, DAG);
    assert(Op0.getValueType().is64BitVector() &&
           Op1.getValueType().is64BitVector() &&
           "unexpected types for extended operands to VMULL");
    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
  }
  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C)
  // during isel lowering to take advantage of no-stall back to back
  // s/umul + s/umla. This is true for CPUs with accumulate forwarding such as
  // Cortex-A53/A57.
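  // For example, (zext A + zext B) * (zext C) is rewritten below as
  // add(UMULL(A, C), UMULL(B, C)), which instruction selection can then
  // match as a umull followed by a umlal.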
  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
  EVT Op1VT = Op1.getValueType();
  return DAG.getNode(
      N0->getOpcode(), DL, VT,
      DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00),
                  Op1),
      DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01),
                  Op1));
}
static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
                               unsigned Pattern) {
  if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
    return DAG.getConstant(1, DL, MVT::nxv1i1);
  return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
                     DAG.getTargetConstant(Pattern, DL, MVT::i32));
}
// Returns a safe bitcast between two scalable vector predicates, where
// any newly created lanes from a widening bitcast are defined as zero.
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(InVT.getVectorElementType() == MVT::i1 &&
         VT.getVectorElementType() == MVT::i1 &&
         "Expected a predicate-to-predicate bitcast");
  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         InVT.isScalableVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable predicate types!");

  // Return the operand if the cast isn't changing type,
  // e.g. <n x 16 x i1> -> <n x 16 x i1>
  if (InVT == VT)
    return Op;

  SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  // We only have to zero the lanes if new lanes are being defined, e.g. when
  // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
  // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
  // we can return here.
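  // For example, when widening <vscale x 2 x i1> to <vscale x 16 x i1> only
  // every eighth lane of the result is defined by the source, so the
  // remaining lanes are cleared with the AND below; a narrowing cast needs
  // no masking.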
  if (InVT.bitsGT(VT))
    return Reinterpret;

  // Check if the other lanes are already known to be zeroed by construction.
  if (isZeroingInactiveLanes(Op))
    return Reinterpret;

  // Zero the newly introduced lanes.
  SDValue Mask = DAG.getConstant(1, DL, InVT);
  Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
  return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
}
SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
                                           SMEAttrs Attrs, SDLoc DL,
                                           EVT VT) const {
  if (Attrs.hasStreamingInterfaceOrBody())
    return DAG.getConstant(1, DL, VT);

  if (Attrs.hasNonStreamingInterfaceAndBody())
    return DAG.getConstant(0, DL, VT);

  assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");

  SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
                                         getPointerTy(DAG.getDataLayout()));
  Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
  Type *RetTy = StructType::get(Int64Ty, Int64Ty);
  TargetLowering::CallLoweringInfo CLI(DAG);
  ArgListTy Args;
  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
      CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
      RetTy, Callee, std::move(Args));
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
  return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
                     Mask);
}
static Optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
  if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
    StringRef S(ES->getSymbol());
    if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
      return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
    if (S == "__arm_tpidr2_restore")
      return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
  }
  return None;
}
SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
  unsigned IntNo = Op.getConstantOperandVal(1);
  SDLoc DL(Op);
  switch (IntNo) {
  default:
    return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::aarch64_mops_memset_tag: {
    auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
    SDValue Chain = Node->getChain();
    SDValue Dst = Op.getOperand(2);
    SDValue Val = Op.getOperand(3);
    Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
    SDValue Size = Op.getOperand(4);
    auto Alignment = Node->getMemOperand()->getAlign();
    bool IsVol = Node->isVolatile();
    auto DstPtrInfo = Node->getPointerInfo();

    const auto &SDI =
        static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
    SDValue MS =
        SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
                     Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});

    // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
    // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
    // LowerOperationWrapper will complain that the number of results has
    // changed.
    return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
  }
  case Intrinsic::aarch64_sme_za_enable:
    return DAG.getNode(
        AArch64ISD::SMSTART, DL, MVT::Other,
        Op->getOperand(0), // Chain
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  case Intrinsic::aarch64_sme_za_disable:
    return DAG.getNode(
        AArch64ISD::SMSTOP, DL, MVT::Other,
        Op->getOperand(0), // Chain
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
  }
}
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::thread_pointer: {
    EVT PtrVT = getPointerTy(DAG.getDataLayout());
    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
  }
  case Intrinsic::aarch64_neon_abs: {
    EVT Ty = Op.getValueType();
    if (Ty == MVT::i64) {
      SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
                                   Op.getOperand(1));
      Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
      return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
    } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
      return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
    } else {
      report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
    }
  }
  case Intrinsic::aarch64_neon_pmull64: {
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);

    Optional<uint64_t> LHSLane = getConstantLaneNumOfExtractHalfOperand(LHS);
    Optional<uint64_t> RHSLane = getConstantLaneNumOfExtractHalfOperand(RHS);

    assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
    assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");

    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
    // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
    // which ISel recognizes better. For example, generate a ldr into d*
    // registers as opposed to a GPR load followed by a fmov.
    auto TryVectorizeOperand =
        [](SDValue N, Optional<uint64_t> NLane, Optional<uint64_t> OtherLane,
           const SDLoc &dl, SelectionDAG &DAG) -> SDValue {
      // If the operand is a higher half itself, rewrite it to
      // extract_high_v2i64; this way aarch64_neon_pmull64 could
      // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
      if (NLane && *NLane == 1)
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                           N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));

      // Operand N is not a higher half but the other operand is.
      if (OtherLane && *OtherLane == 1) {
        // If this operand is a lower half, rewrite it to
        // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip
        // (moving the value from lane 1 to lane 0 through a GPR) that would
        // otherwise be needed to align the lanes of the two operands.
        if (NLane && *NLane == 0)
          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
                             DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
                                         N.getOperand(0),
                                         DAG.getConstant(0, dl, MVT::i64)),
                             DAG.getConstant(1, dl, MVT::i64));

        // Otherwise just dup from main to all lanes.
        return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
      }

      // Neither operand is an extract of higher half, so codegen may just use
      // the non-high version of PMULL instruction. Use v1i64 to represent i64.
      assert(N.getValueType() == MVT::i64 &&
             "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
    };

    LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
    RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);

    return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
  }
  case Intrinsic::aarch64_neon_smax:
    return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umax:
    return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_smin:
    return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_umin:
    return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_neon_scalar_sqxtn:
  case Intrinsic::aarch64_neon_scalar_sqxtun:
  case Intrinsic::aarch64_neon_scalar_uqxtn: {
    assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
    if (Op.getValueType() == MVT::i32)
      return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                         DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
                                     Op.getOperand(0),
                                     DAG.getNode(ISD::BITCAST, dl, MVT::f64,
                                                 Op.getOperand(1))));
    return SDValue();
  }
  case Intrinsic::aarch64_sve_sunpkhi:
    return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_sunpklo:
    return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpkhi:
    return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_uunpklo:
    return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_clasta_n:
    return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_clastb_n:
    return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_lasta:
    return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_lastb:
    return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_rev:
    return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_tbl:
    return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn1:
    return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_trn2:
    return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp1:
    return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_uzp2:
    return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip1:
    return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_zip2:
    return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_splice:
    return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::aarch64_sve_ptrue:
    return getPTrue(DAG, dl, Op.getValueType(),
                    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  case Intrinsic::aarch64_sve_clz:
    return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sme_cntsb:
    return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                       DAG.getConstant(1, dl, MVT::i32));
  case Intrinsic::aarch64_sme_cntsh: {
    SDValue One = DAG.getConstant(1, dl, MVT::i32);
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
  }
  case Intrinsic::aarch64_sme_cntsw: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(2, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sme_cntsd: {
    SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
                                DAG.getConstant(1, dl, MVT::i32));
    return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
                       DAG.getConstant(3, dl, MVT::i32));
  }
  case Intrinsic::aarch64_sve_cnt: {
    SDValue Data = Op.getOperand(3);
    // CTPOP only supports integer operands.
    if (Data.getValueType().isFloatingPoint())
      Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
    return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Data, Op.getOperand(1));
  }
  case Intrinsic::aarch64_sve_dupq_lane:
    return LowerDUPQLane(Op, DAG);
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_convert_to_svbool:
    return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
  case Intrinsic::aarch64_sve_fneg:
    return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintp:
    return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintm:
    return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinti:
    return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintx:
    return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frinta:
    return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintn:
    return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frintz:
    return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_ucvtf:
    return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_scvtf:
    return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzu:
    return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fcvtzs:
    return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_fsqrt:
    return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpx:
    return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecpe_x:
    return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frecps_x:
    return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_frsqrte_x:
    return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_frsqrts_x:
    return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::aarch64_sve_fabs:
    return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_abs:
    return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_neg:
    return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_insr: {
    SDValue Scalar = Op.getOperand(2);
    EVT ScalarTy = Scalar.getValueType();
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

    return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
                       Op.getOperand(1), Scalar);
  }
  case Intrinsic::aarch64_sve_rbit:
    return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
                       Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(1));
  case Intrinsic::aarch64_sve_revb:
    return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revh:
    return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revw:
    return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_revd:
    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtb:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxth:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_sxtw:
    return DAG.getNode(
        AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtb:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxth:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
        Op.getOperand(1));
  case Intrinsic::aarch64_sve_uxtw:
    return DAG.getNode(
        AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
        Op.getOperand(2), Op.getOperand(3),
        DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
        Op.getOperand(1));
  case Intrinsic::localaddress: {
    const auto &MF = DAG.getMachineFunction();
    const auto *RegInfo = Subtarget->getRegisterInfo();
    unsigned Reg = RegInfo->getLocalAddressRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
                              Op.getSimpleValueType());
  }

  case Intrinsic::eh_recoverfp: {
    // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer D53541
    // for more details.
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.eh.recoverfp must take a function as the first argument");
    return IncomingFPOp;
  }

  case Intrinsic::aarch64_neon_vsri:
  case Intrinsic::aarch64_neon_vsli: {
    EVT Ty = Op.getValueType();

    if (!Ty.isVector())
      report_fatal_error("Unexpected type for aarch64_neon_vsli");

    assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());

    bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
    unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
    return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3));
  }

  case Intrinsic::aarch64_neon_srhadd:
  case Intrinsic::aarch64_neon_urhadd:
  case Intrinsic::aarch64_neon_shadd:
  case Intrinsic::aarch64_neon_uhadd: {
    bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                        IntNo == Intrinsic::aarch64_neon_shadd);
    bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
                          IntNo == Intrinsic::aarch64_neon_urhadd);
    unsigned Opcode = IsSignedAdd
                          ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
                          : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_sabd:
  case Intrinsic::aarch64_neon_uabd: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
                                                            : ISD::ABDS;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2));
  }
  case Intrinsic::aarch64_neon_saddlp:
  case Intrinsic::aarch64_neon_uaddlp: {
    unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
                          ? AArch64ISD::UADDLP
                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
  }
  case Intrinsic::aarch64_neon_sdot:
  case Intrinsic::aarch64_neon_udot:
  case Intrinsic::aarch64_sve_sdot:
  case Intrinsic::aarch64_sve_udot: {
    unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
                       IntNo == Intrinsic::aarch64_sve_udot)
                          ? AArch64ISD::UDOT
                          : AArch64ISD::SDOT;
    return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::get_active_lane_mask: {
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                       Op.getOperand(1), Op.getOperand(2));
  }
  }
}
bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
  if (VT.getVectorElementType() == MVT::i8 ||
      VT.getVectorElementType() == MVT::i16) {
    EltTy = MVT::i32;
    return true;
  }
  return false;
}

bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
                                                          EVT DataVT) const {
  // SVE only supports implicit extension of 32-bit indices.
  if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
    return false;

  // Indices cannot be smaller than the main data type.
  if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
    return false;

  // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
  // element container type, which would violate the previous clause.
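  // For example, nxv2i32 data really occupies 64-bit element containers, so
  // even though its scalar size matches a 32-bit index the extend has to be
  // kept; hence the "more than vscale x 2 elements" check below.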
  return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
}

bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  return ExtVal.getValueType().isScalableVector() ||
         useSVEForFixedLengthVectorVT(
             ExtVal.getValueType(),
             /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
  };
  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
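  // For example, an unscaled gather whose 32-bit offsets are sign-extended
  // maps to getGatherVecOpcode(false, true, true), i.e.
  // AArch64ISD::GLD1_SXTW_MERGE_ZERO.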
  return AddrModes.find(Key)->second;
}
unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
  switch (Opcode) {
  default:
    llvm_unreachable("unimplemented opcode");
    return Opcode;
  case AArch64ISD::GLD1_MERGE_ZERO:
    return AArch64ISD::GLD1S_MERGE_ZERO;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
  }
}
SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
                                            SelectionDAG &DAG) const {
  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
  SDLoc DL(Op);

  SDValue Chain = MGT->getChain();
  SDValue PassThru = MGT->getPassThru();
  SDValue Mask = MGT->getMask();
  SDValue BasePtr = MGT->getBasePtr();
  SDValue Index = MGT->getIndex();
  SDValue Scale = MGT->getScale();
  EVT VT = Op.getValueType();
  EVT MemVT = MGT->getMemoryVT();
  ISD::LoadExtType ExtType = MGT->getExtensionType();
  ISD::MemIndexType IndexType = MGT->getIndexType();

  // SVE supports zero (and so undef) passthrough values only, everything else
  // must be handled manually by an explicit select on the load's output.
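  // For example, a gather whose passthrough is a splat of a non-zero constant
  // is emitted below as a gather with an undef passthrough followed by
  // vselect(Mask, GatherResult, PassThru).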
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
    SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                            MGT->getMemOperand(), IndexType, ExtType);
    SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
    return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
  }

  bool IsScaled = MGT->isIndexScaled();
  bool IsSigned = MGT->isIndexSigned();

  // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
  // must be calculated before hand.
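  // For example, a gather of i32 elements whose Scale operand is 8 is
  // rewritten as Index << 3 with Scale reset to 1, since only a scale equal
  // to the element store size (4 here) can be used directly.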
  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
    EVT IndexVT = Index.getValueType();
    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());

    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
                               MGT->getMemOperand(), IndexType, ExtType);
  }

  // Lower fixed length gather to a scalable equivalent.
  if (VT.isFixedLengthVector()) {
    assert(Subtarget->useSVEForFixedLengthVectors() &&
           "Cannot lower when not using SVE for fixed vectors!");

    // NOTE: Handle floating-point as if integer then bitcast the result.
    EVT DataVT = VT.changeVectorElementTypeToInteger();
    MemVT = MemVT.changeVectorElementTypeToInteger();

    // Find the smallest integer fixed length vector we can use for the gather.
    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
    if (DataVT.getVectorElementType() == MVT::i64 ||
        Index.getValueType().getVectorElementType() == MVT::i64 ||
        Mask.getValueType().getVectorElementType() == MVT::i64)
      PromotedVT = VT.changeVectorElementType(MVT::i64);

    // Promote vector operands except for passthrough, which we know is either
    // undef or zero, and thus best constructed directly.
    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);

    // A promoted result type forces the need for an extending load.
    if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
      ExtType = ISD::EXTLOAD;

    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);

    // Convert fixed length vector operands to scalable.
    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
    Index = convertToScalableVector(DAG, ContainerVT, Index);
    Mask = convertFixedMaskToScalableVector(Mask, DAG);
    PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
                                   : DAG.getConstant(0, DL, ContainerVT);

    // Emit equivalent scalable vector gather.
    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
    SDValue Load =
        DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
                            Ops, MGT->getMemOperand(), IndexType, ExtType);

    // Extract fixed length data then convert to the required result type.
    SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
    Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
    if (VT.isFloatingPoint())
      Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);

    return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
  }

  // Everything else is legal.
  return Op;
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
                                             SelectionDAG &DAG) const {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
  SDLoc DL(Op);

  SDValue Chain = MSC->getChain();
  SDValue StoreVal = MSC->getValue();
  SDValue Mask = MSC->getMask();
  SDValue BasePtr = MSC->getBasePtr();
  SDValue Index = MSC->getIndex();
  SDValue Scale = MSC->getScale();
  EVT VT = StoreVal.getValueType();
  EVT MemVT = MSC->getMemoryVT();
  ISD::MemIndexType IndexType = MSC->getIndexType();
  bool Truncating = MSC->isTruncatingStore();

  bool IsScaled = MSC->isIndexScaled();
  bool IsSigned = MSC->isIndexSigned();

  // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
  // must be calculated before hand.
  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
  if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
    assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
    EVT IndexVT = Index.getValueType();
    Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
                        DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
    Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());

    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
                                MSC->getMemOperand(), IndexType, Truncating);
  }

  // Lower fixed length scatter to a scalable equivalent.
  if (VT.isFixedLengthVector()) {
    assert(Subtarget->useSVEForFixedLengthVectors() &&
           "Cannot lower when not using SVE for fixed vectors!");

    // Once bitcast we treat floating-point scatters as if integer.
    if (VT.isFloatingPoint()) {
      VT = VT.changeVectorElementTypeToInteger();
      MemVT = MemVT.changeVectorElementTypeToInteger();
      StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
    }

    // Find the smallest integer fixed length vector we can use for the
    // scatter.
    EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
    if (VT.getVectorElementType() == MVT::i64 ||
        Index.getValueType().getVectorElementType() == MVT::i64 ||
        Mask.getValueType().getVectorElementType() == MVT::i64)
      PromotedVT = VT.changeVectorElementType(MVT::i64);

    // Promote vector operands.
    unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
    Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
    StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);

    // A promoted value type forces the need for a truncating store.
    if (PromotedVT != VT)
      Truncating = true;

    EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);

    // Convert fixed length vector operands to scalable.
    MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
    Index = convertToScalableVector(DAG, ContainerVT, Index);
    Mask = convertFixedMaskToScalableVector(Mask, DAG);
    StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);

    // Emit equivalent scalable vector scatter.
    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
                                MSC->getMemOperand(), IndexType, Truncating);
  }

  // Everything else is legal.
  return Op;
}
SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a masked load node");
  EVT VT = Op->getValueType(0);

  if (useSVEForFixedLengthVectorVT(
          VT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
    return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

  SDValue PassThru = LoadNode->getPassThru();
  SDValue Mask = LoadNode->getMask();

  if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
    return Op;

  SDValue Load = DAG.getMaskedLoad(
      VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
      LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
      LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
      LoadNode->getExtensionType());

  SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);

  return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
                                        EVT VT, EVT MemVT,
                                        SelectionDAG &DAG) {
  assert(VT.isVector() && "VT should be a vector type");
  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);

  SDValue Value = ST->getValue();

  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
  // extracts the word lane which represents the v4i8 subvector, so the store
  // can be done with a single 32-bit store.
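  // A typical final selection for this DAG (an expectation, not something
  // enforced here) is an xtn narrowing the 8 x i16 value followed by a single
  // 32-bit str of lane 0.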
  SDValue Undef = DAG.getUNDEF(MVT::i16);
  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
                                        {Undef, Undef, Undef, Undef});

  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
                                 Value, UndefVec);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);

  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
                                     Trunc, DAG.getConstant(0, DL, MVT::i64));

  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
                      ST->getBasePtr(), ST->getMemOperand());
}
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operation. Currently we only custom lower the truncating store
// from vector v4i16 to v4i8 and volatile stores of i128.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc Dl(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  assert(StoreNode && "Can only custom lower store nodes");

  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();

  if (VT.isVector()) {
    if (useSVEForFixedLengthVectorVT(
            VT,
            /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
      return LowerFixedLengthVectorStoreToSVE(Op, DAG);

    unsigned AS = StoreNode->getAddressSpace();
    Align Alignment = StoreNode->getAlign();
    if (Alignment < MemVT.getStoreSize() &&
        !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
                                        StoreNode->getMemOperand()->getFlags(),
                                        nullptr)) {
      return scalarizeVectorStore(StoreNode, DAG);
    }

    if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
        MemVT == MVT::v4i8) {
      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
    }
    // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
    // the custom lowering, as there are no un-paired non-temporal stores and
    // legalization will break up 256 bit inputs.
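    // For example, a non-temporal store of a v8i32 value is split below into
    // two v4i32 halves and emitted as one STNP of a register pair.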
    ElementCount EC = MemVT.getVectorElementCount();
    if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
        EC.isKnownEven() &&
        ((MemVT.getScalarSizeInBits() == 8u ||
          MemVT.getScalarSizeInBits() == 16u ||
          MemVT.getScalarSizeInBits() == 32u ||
          MemVT.getScalarSizeInBits() == 64u))) {
      SDValue Lo =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                      StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
      SDValue Hi =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
                      MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                      StoreNode->getValue(),
                      DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
          {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
          StoreNode->getMemoryVT(), StoreNode->getMemOperand());
      return Result;
    }
  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
    return LowerStore128(Op, DAG);
  } else if (MemVT == MVT::i64x8) {
    SDValue Value = StoreNode->getValue();
    assert(Value->getValueType(0) == MVT::i64x8);
    SDValue Chain = StoreNode->getChain();
    SDValue Base = StoreNode->getBasePtr();
    EVT PtrVT = Base.getValueType();
    for (unsigned i = 0; i < 8; i++) {
      SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
                                 Value, DAG.getConstant(i, Dl, MVT::i32));
      SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
                                DAG.getConstant(i * 8, Dl, PtrVT));
      Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
                           StoreNode->getOriginalAlign());
    }
    return Chain;
  }

  return SDValue();
}
/// Lower atomic or volatile 128-bit stores to a single STP instruction.
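/// With FEAT_LSE2, 16-byte aligned STP accesses are single-copy atomic, which
/// is what allows unordered and monotonic atomic stores to take this path
/// (the caller checks hasLSE2(); see also the asserts below).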
SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
                                             SelectionDAG &DAG) const {
  MemSDNode *StoreNode = cast<MemSDNode>(Op);
  assert(StoreNode->getMemoryVT() == MVT::i128);
  assert(StoreNode->isVolatile() || StoreNode->isAtomic());
  assert(!StoreNode->isAtomic() ||
         StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
         StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);

  SDValue Value = StoreNode->getOpcode() == ISD::STORE
                      ? StoreNode->getOperand(1)
                      : StoreNode->getOperand(2);
  SDLoc DL(Op);
  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
                           DAG.getConstant(0, DL, MVT::i64));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
                           DAG.getConstant(1, DL, MVT::i64));
  SDValue Result = DAG.getMemIntrinsicNode(
      AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
      {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
      StoreNode->getMemoryVT(), StoreNode->getMemOperand());
  return Result;
}
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                         SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  assert(LoadNode && "Expected custom lowering of a load node");

  if (LoadNode->getMemoryVT() == MVT::i64x8) {
    SmallVector<SDValue, 8> Ops;
    SDValue Base = LoadNode->getBasePtr();
    SDValue Chain = LoadNode->getChain();
    EVT PtrVT = Base.getValueType();
    for (unsigned i = 0; i < 8; i++) {
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
                                DAG.getConstant(i * 8, DL, PtrVT));
      SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
                                 LoadNode->getPointerInfo(),
                                 LoadNode->getOriginalAlign());
      Ops.push_back(Part);
      Chain = SDValue(Part.getNode(), 1);
    }
    SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
    return DAG.getMergeValues({Loaded, Chain}, DL);
  }

  // Custom lowering for extending v4i8 vector loads.
  EVT VT = Op->getValueType(0);
  assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");

  if (LoadNode->getMemoryVT() != MVT::v4i8)
    return SDValue();

  unsigned ExtType;
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
    ExtType = ISD::SIGN_EXTEND;
  else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
           LoadNode->getExtensionType() == ISD::EXTLOAD)
    ExtType = ISD::ZERO_EXTEND;
  else
    return SDValue();

  SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
                             LoadNode->getBasePtr(), MachinePointerInfo());
  SDValue Chain = Load.getValue(1);
  SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
  SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
  SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
  Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
                    DAG.getConstant(0, DL, MVT::i64));
  if (VT == MVT::v4i32)
    Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
  return DAG.getMergeValues({Ext, Chain}, DL);
}
// Generate SUBS and CSEL for integer abs.
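// For example, a scalar i32 abs typically selects to something like
//   subs wzr, w0, #0
//   cneg w0, w0, mi
// though this function only builds the generic SUBS + CSEL pattern and the
// final instruction choice is left to selection.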
SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);

  SDLoc DL(Op);
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                            Op.getOperand(0));
  // Generate SUBS & CSEL.
  SDValue Cmp =
      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                  Op.getOperand(0), DAG.getConstant(0, DL, VT));
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
                     Cmp.getValue(1));
}
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);

  AArch64CC::CondCode CC;
  if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
    SDLoc dl(Op);
    SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  return SDValue();
}
SDValue AArch64TargetLowering::LowerZERO_EXTEND(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op->getOpcode() == ISD::ZERO_EXTEND && "Expected ZERO_EXTEND");

  if (Op.getValueType().isFixedLengthVector())
    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);

  // Try to lower to VSELECT to allow zext to transform into
  // a predicated instruction like add, sub or mul.
  SDValue Value = Op->getOperand(0);
  if (!Value->getValueType(0).isScalableVector() ||
      Value->getValueType(0).getScalarType() != MVT::i1)
    return SDValue();

  SDLoc DL = SDLoc(Op);
  EVT VT = Op->getValueType(0);
  SDValue Ones = DAG.getConstant(1, DL, VT);
  SDValue Zeros = DAG.getConstant(0, DL, VT);
  return DAG.getNode(ISD::VSELECT, DL, VT, Value, Ones, Zeros);
}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
  LLVM_DEBUG(dbgs() << "Custom lowering: ");
  LLVM_DEBUG(Op.dump());

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unimplemented operand");
    return SDValue();
  case ISD::BITCAST:
    return LowerBITCAST(Op, DAG);
  case ISD::GlobalAddress:
    return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:
    return LowerGlobalTLSAddress(Op, DAG);
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:
    return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:
    return LowerSETCCCARRY(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::BR_CC:
    return LowerBR_CC(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::SELECT_CC:
    return LowerSELECT_CC(Op, DAG);
  case ISD::JumpTable:
    return LowerJumpTable(Op, DAG);
  case ISD::BR_JT:
    return LowerBR_JT(Op, DAG);
  case ISD::ConstantPool:
    return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:
    return LowerBlockAddress(Op, DAG);
  case ISD::VASTART:
    return LowerVASTART(Op, DAG);
  case ISD::VACOPY:
    return LowerVACOPY(Op, DAG);
  case ISD::VAARG:
    return LowerVAARG(Op, DAG);
  case ISD::ADDCARRY:
    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
  case ISD::SUBCARRY:
    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
  case ISD::SADDO_CARRY:
    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
  case ISD::SSUBO_CARRY:
    return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
    return LowerXALUO(Op, DAG);
  case ISD::FADD:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
  case ISD::FSUB:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
  case ISD::FMUL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
  case ISD::FMA:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
  case ISD::FDIV:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
  case ISD::FNEG:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
  case ISD::FCEIL:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
  case ISD::FFLOOR:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
  case ISD::FNEARBYINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
  case ISD::FRINT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
  case ISD::FROUND:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
  case ISD::FROUNDEVEN:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
  case ISD::FTRUNC:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
  case ISD::FSQRT:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
  case ISD::FABS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::FP_EXTEND:
    return LowerFP_EXTEND(Op, DAG);
  case ISD::FRAMEADDR:
    return LowerFRAMEADDR(Op, DAG);
  case ISD::SPONENTRY:
    return LowerSPONENTRY(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:
    return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::CONCAT_VECTORS:
    return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::BUILD_VECTOR:
    return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SPLAT_VECTOR:
    return LowerSPLAT_VECTOR(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return LowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::SDIV:
  case ISD::UDIV:
    return LowerDIV(Op, DAG);
  case ISD::SMIN:
  case ISD::UMIN:
  case ISD::SMAX:
  case ISD::UMAX:
    return LowerMinMax(Op, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:
    return LowerVectorSRA_SRL_SHL(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRL_PARTS:
  case ISD::SRA_PARTS:
    return LowerShiftParts(Op, DAG);
  case ISD::CTPOP:
  case ISD::PARITY:
    return LowerCTPOP_PARITY(Op, DAG);
  case ISD::FCOPYSIGN:
    return LowerFCOPYSIGN(Op, DAG);
  case ISD::OR:
    return LowerVectorOR(Op, DAG);
  case ISD::XOR:
    return LowerXOR(Op, DAG);
  case ISD::PREFETCH:
    return LowerPREFETCH(Op, DAG);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
    return LowerINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::FSINCOS:
    return LowerFSINCOS(Op, DAG);
  case ISD::FLT_ROUNDS_:
    return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::SET_ROUNDING:
    return LowerSET_ROUNDING(Op, DAG);
  case ISD::MUL:
    return LowerMUL(Op, DAG);
  case ISD::MULHS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
  case ISD::MULHU:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::ATOMIC_STORE:
    if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
      assert(Subtarget->hasLSE2());
      return LowerStore128(Op, DAG);
    }
    return SDValue();
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::MSTORE:
    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
  case ISD::MGATHER:
    return LowerMGATHER(Op, DAG);
  case ISD::MSCATTER:
    return LowerMSCATTER(Op, DAG);
  case ISD::VECREDUCE_SEQ_FADD:
    return LowerVECREDUCE_SEQ_FADD(Op, DAG);
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_AND:
  case ISD::VECREDUCE_OR:
  case ISD::VECREDUCE_XOR:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
  case ISD::VECREDUCE_FADD:
  case ISD::VECREDUCE_FMAX:
  case ISD::VECREDUCE_FMIN:
    return LowerVECREDUCE(Op, DAG);
  case ISD::ATOMIC_LOAD_SUB:
    return LowerATOMIC_LOAD_SUB(Op, DAG);
  case ISD::ATOMIC_LOAD_AND:
    return LowerATOMIC_LOAD_AND(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::VSCALE:
    return LowerVSCALE(Op, DAG);
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
    return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
  case ISD::ZERO_EXTEND:
    return LowerZERO_EXTEND(Op, DAG);
  case ISD::SIGN_EXTEND_INREG: {
    // Only custom lower when ExtraVT has a legal byte based element type.
    EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
    EVT ExtraEltVT = ExtraVT.getVectorElementType();
    if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
        (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
      return SDValue();

    return LowerToPredicatedOp(Op, DAG,
                               AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
  }
  case ISD::TRUNCATE:
    return LowerTRUNCATE(Op, DAG);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::LOAD:
    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                     Subtarget->forceStreamingCompatibleSVE()))
      return LowerFixedLengthVectorLoadToSVE(Op, DAG);
    return LowerLOAD(Op, DAG);
  case ISD::ADD:
  case ISD::AND:
  case ISD::SUB:
    return LowerToScalableOp(Op, DAG);
  case ISD::FMAXIMUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
  case ISD::FMAXNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
  case ISD::FMINIMUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
  case ISD::FMINNUM:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
  case ISD::VSELECT:
    return LowerFixedLengthVectorSelectToSVE(Op, DAG);
  case ISD::ABS:
    return LowerABS(Op, DAG);
  case ISD::ABDS:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
  case ISD::ABDU:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
  case ISD::BITREVERSE:
    return LowerBitreverse(Op, DAG);
  case ISD::BSWAP:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
  case ISD::CTLZ:
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
  case ISD::CTTZ:
    return LowerCTTZ(Op, DAG);
  case ISD::VECTOR_SPLICE:
    return LowerVECTOR_SPLICE(Op, DAG);
  case ISD::STRICT_LROUND:
  case ISD::STRICT_LLROUND:
  case ISD::STRICT_LRINT:
  case ISD::STRICT_LLRINT: {
    assert(Op.getOperand(1).getValueType() == MVT::f16 &&
           "Expected custom lowering of rounding operations only for f16");
    SDLoc DL(Op);
    SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
                              {Op.getOperand(0), Op.getOperand(1)});
    return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
                       {Ext.getValue(1), Ext.getValue(0)});
  }
  }
}
bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
  return !Subtarget->useSVEForFixedLengthVectors();
}

bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
    EVT VT, bool OverrideNEON) const {
  if (!VT.isFixedLengthVector() || !VT.isSimple())
    return false;

  // Don't use SVE for vectors we cannot scalarize if required.
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  // Fixed length predicates should be promoted to i8.
  // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
  case MVT::i1:
  default:
    return false;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
  case MVT::f16:
  case MVT::f32:
  case MVT::f64:
    break;
  }

  // All SVE implementations support NEON sized vectors.
  if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
    return Subtarget->hasSVE();

  // Ensure NEON MVTs only belong to a single register class.
  if (VT.getFixedSizeInBits() <= 128)
    return false;

  // Ensure wider than NEON code generation is enabled.
  if (!Subtarget->useSVEForFixedLengthVectors())
    return false;

  // Don't use SVE for types that don't fit.
  if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
    return false;

  // TODO: Perhaps an artificial restriction, but worth having whilst getting
  // the base fixed length SVE support in place.
  if (!VT.isPow2VectorType())
    return false;

  return true;
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//

static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    return Intrinsic::not_intrinsic;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IID < Intrinsic::num_intrinsics)
      return IID;
    return Intrinsic::not_intrinsic;
  }
  }
}
bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
                                                SDValue N1) const {
  if (!N0.hasOneUse())
    return false;

  unsigned IID = getIntrinsicID(N1.getNode());
  // Avoid reassociating expressions that can be lowered to smlal/umlal.
  if (IID == Intrinsic::aarch64_neon_umull ||
      N1.getOpcode() == AArch64ISD::UMULL ||
      IID == Intrinsic::aarch64_neon_smull ||
      N1.getOpcode() == AArch64ISD::SMULL)
    return N0.getOpcode() != ISD::ADD;

  return true;
}
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) const {
  switch (CC) {
  default:
    report_fatal_error("Unsupported calling convention.");
  case CallingConv::WebKit_JS:
    return CC_AArch64_WebKit_JS;
  case CallingConv::GHC:
    return CC_AArch64_GHC;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::PreserveMost:
  case CallingConv::CXX_FAST_TLS:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
  case CallingConv::Tail:
    if (Subtarget->isTargetWindows() && IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
        return CC_AArch64_Arm64EC_VarArg;
      return CC_AArch64_Win64_VarArg;
    }
    if (!Subtarget->isTargetDarwin())
      return CC_AArch64_AAPCS;
    if (!IsVarArg)
      return CC_AArch64_DarwinPCS;
    return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
                                      : CC_AArch64_DarwinPCS_VarArg;
  case CallingConv::Win64:
    if (IsVarArg) {
      if (Subtarget->isWindowsArm64EC())
        return CC_AArch64_Arm64EC_VarArg;
      return CC_AArch64_Win64_VarArg;
    }
    return CC_AArch64_AAPCS;
  case CallingConv::CFGuard_Check:
    return CC_AArch64_Win64_CFGuard_Check;
  case CallingConv::AArch64_VectorCall:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
  case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
    return CC_AArch64_AAPCS;
  }
}

CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
  return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
                                      : RetCC_AArch64_AAPCS;
}
/// Returns true if the Function has ZA state and contains at least one call to
/// a function that requires setting up a lazy-save buffer.
static bool requiresBufferForLazySave(const Function &F) {
  SMEAttrs CallerAttrs(F);
  if (!CallerAttrs.hasZAState())
    return false;

  for (const BasicBlock &BB : F)
    for (const Instruction &I : BB)
      if (const CallInst *Call = dyn_cast<CallInst>(&I))
        if (CallerAttrs.requiresLazySave(SMEAttrs(*Call)))
          return true;
  return false;
}
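// Editor's sketch of when the walk above fires (the attribute spelling is an
// assumed SME attribute shown only as an example, not quoted from this file):
//
//   define void @caller() "aarch64_pstate_za_new" {   ; caller has ZA state
//     call void @private_za_callee()       ; callee does not share caller's ZA
//     ret void
//   }
//
// Here CallerAttrs.hasZAState() is true and the call requires a lazy save, so
// the function returns true and a lazy-save buffer will be set up on entry.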
unsigned AArch64TargetLowering::allocateLazySaveBuffer(
    SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG, Register &Reg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                          DAG.getConstant(1, DL, MVT::i32));
  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
  unsigned FI = MFI.CreateVariableSizedObject(Align(1), nullptr);
  Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
  Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, Reg, Buffer.getValue(0));

  // Allocate an additional TPIDR2 object on the stack (16 bytes)
  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);

  // Store the buffer pointer to the TPIDR2 stack object.
  MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, FI);
  SDValue Ptr = DAG.getFrameIndex(
      FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);

  return TPIDR2Obj;
}
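// Sizing note (editor's example, derived from the RDSVL/MUL nodes above):
// RDSVL #1 returns the streaming vector length in bytes (SVL.B). For a
// streaming vector length of 512 bits, SVL.B = 64 and the worst-case
// lazy-save buffer is 64 * 64 = 4096 bytes; it is allocated with a dynamic
// stack allocation because SVL is not known at compile time, while the fixed
// 16-byte stack object holds the TPIDR2 block describing the save.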
6004 SDValue
AArch64TargetLowering::LowerFormalArguments(
6005 SDValue Chain
, CallingConv::ID CallConv
, bool isVarArg
,
6006 const SmallVectorImpl
<ISD::InputArg
> &Ins
, const SDLoc
&DL
,
6007 SelectionDAG
&DAG
, SmallVectorImpl
<SDValue
> &InVals
) const {
6008 MachineFunction
&MF
= DAG
.getMachineFunction();
6009 const Function
&F
= MF
.getFunction();
6010 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
6011 bool IsWin64
= Subtarget
->isCallingConvWin64(F
.getCallingConv());
6012 AArch64FunctionInfo
*FuncInfo
= MF
.getInfo
<AArch64FunctionInfo
>();
6014 SmallVector
<ISD::OutputArg
, 4> Outs
;
6015 GetReturnInfo(CallConv
, F
.getReturnType(), F
.getAttributes(), Outs
,
6016 DAG
.getTargetLoweringInfo(), MF
.getDataLayout());
6017 if (any_of(Outs
, [](ISD::OutputArg
&Out
){ return Out
.VT
.isScalableVector(); }))
6018 FuncInfo
->setIsSVECC(true);
6020 // Assign locations to all of the incoming arguments.
6021 SmallVector
<CCValAssign
, 16> ArgLocs
;
6022 DenseMap
<unsigned, SDValue
> CopiedRegs
;
6023 CCState
CCInfo(CallConv
, isVarArg
, MF
, ArgLocs
, *DAG
.getContext());
6025 // At this point, Ins[].VT may already be promoted to i32. To correctly
6026 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6027 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6028 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6029 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6031 unsigned NumArgs
= Ins
.size();
6032 Function::const_arg_iterator CurOrigArg
= F
.arg_begin();
6033 unsigned CurArgIdx
= 0;
6034 for (unsigned i
= 0; i
!= NumArgs
; ++i
) {
6035 MVT ValVT
= Ins
[i
].VT
;
6036 if (Ins
[i
].isOrigArg()) {
6037 std::advance(CurOrigArg
, Ins
[i
].getOrigArgIndex() - CurArgIdx
);
6038 CurArgIdx
= Ins
[i
].getOrigArgIndex();
6040 // Get type of the original argument.
6041 EVT ActualVT
= getValueType(DAG
.getDataLayout(), CurOrigArg
->getType(),
6042 /*AllowUnknown*/ true);
6043 MVT ActualMVT
= ActualVT
.isSimple() ? ActualVT
.getSimpleVT() : MVT::Other
;
6044 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6045 if (ActualMVT
== MVT::i1
|| ActualMVT
== MVT::i8
)
6047 else if (ActualMVT
== MVT::i16
)
6050 bool UseVarArgCC
= false;
6052 UseVarArgCC
= isVarArg
;
6053 CCAssignFn
*AssignFn
= CCAssignFnForCall(CallConv
, UseVarArgCC
);
6055 AssignFn(i
, ValVT
, ValVT
, CCValAssign::Full
, Ins
[i
].Flags
, CCInfo
);
6056 assert(!Res
&& "Call operand has unhandled type");
6060 SMEAttrs
Attrs(MF
.getFunction());
6061 bool IsLocallyStreaming
=
6062 !Attrs
.hasStreamingInterface() && Attrs
.hasStreamingBody();
6063 assert(Chain
.getOpcode() == ISD::EntryToken
&& "Unexpected Chain value");
6064 SDValue Glue
= Chain
.getValue(1);
6066 SmallVector
<SDValue
, 16> ArgValues
;
6067 unsigned ExtraArgLocs
= 0;
6068 for (unsigned i
= 0, e
= Ins
.size(); i
!= e
; ++i
) {
6069 CCValAssign
&VA
= ArgLocs
[i
- ExtraArgLocs
];
6071 if (Ins
[i
].Flags
.isByVal()) {
6072 // Byval is used for HFAs in the PCS, but the system should work in a
6073 // non-compliant manner for larger structs.
6074 EVT PtrVT
= getPointerTy(DAG
.getDataLayout());
6075 int Size
= Ins
[i
].Flags
.getByValSize();
6076 unsigned NumRegs
= (Size
+ 7) / 8;
6078 // FIXME: This works on big-endian for composite byvals, which are the common
6079 // case. It should also work for fundamental types too.
6081 MFI
.CreateFixedObject(8 * NumRegs
, VA
.getLocMemOffset(), false);
6082 SDValue FrameIdxN
= DAG
.getFrameIndex(FrameIdx
, PtrVT
);
6083 InVals
.push_back(FrameIdxN
);
6088 if (Ins
[i
].Flags
.isSwiftAsync())
6089 MF
.getInfo
<AArch64FunctionInfo
>()->setHasSwiftAsyncContext(true);
6092 if (VA
.isRegLoc()) {
6093 // Arguments stored in registers.
6094 EVT RegVT
= VA
.getLocVT();
6095 const TargetRegisterClass
*RC
;
6097 if (RegVT
== MVT::i32
)
6098 RC
= &AArch64::GPR32RegClass
;
6099 else if (RegVT
== MVT::i64
)
6100 RC
= &AArch64::GPR64RegClass
;
6101 else if (RegVT
== MVT::f16
|| RegVT
== MVT::bf16
)
6102 RC
= &AArch64::FPR16RegClass
;
6103 else if (RegVT
== MVT::f32
)
6104 RC
= &AArch64::FPR32RegClass
;
6105 else if (RegVT
== MVT::f64
|| RegVT
.is64BitVector())
6106 RC
= &AArch64::FPR64RegClass
;
6107 else if (RegVT
== MVT::f128
|| RegVT
.is128BitVector())
6108 RC
= &AArch64::FPR128RegClass
;
6109 else if (RegVT
.isScalableVector() &&
6110 RegVT
.getVectorElementType() == MVT::i1
) {
6111 FuncInfo
->setIsSVECC(true);
6112 RC
= &AArch64::PPRRegClass
;
6113 } else if (RegVT
.isScalableVector()) {
6114 FuncInfo
->setIsSVECC(true);
6115 RC
= &AArch64::ZPRRegClass
;
6117 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6119 // Transform the arguments in physical registers into virtual ones.
6120 Register Reg
= MF
.addLiveIn(VA
.getLocReg(), RC
);
6122 if (IsLocallyStreaming
) {
6123 // LocallyStreamingFunctions must insert the SMSTART in the correct
6124 // position, so we use Glue to ensure no instructions can be scheduled
6125 // between the chain of:
6126 // t0: ch,glue = EntryNode
6127 // t1: res,ch,glue = CopyFromReg
6129 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6130 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6132 // This will be the new Chain/Root node.
6133 ArgValue
= DAG
.getCopyFromReg(Chain
, DL
, Reg
, RegVT
, Glue
);
6134 Glue
= ArgValue
.getValue(2);
6136 ArgValue
= DAG
.getCopyFromReg(Chain
, DL
, Reg
, RegVT
);
6138 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6139 // to 64 bits. Insert an assert[sz]ext to capture this, then
6140 // truncate to the right size.
6141 switch (VA
.getLocInfo()) {
6143 llvm_unreachable("Unknown loc info!");
6144 case CCValAssign::Full
:
6146 case CCValAssign::Indirect
:
6147 assert((VA
.getValVT().isScalableVector() ||
6148 Subtarget
->isWindowsArm64EC()) &&
6149 "Indirect arguments should be scalable on most subtargets");
6151 case CCValAssign::BCvt
:
6152 ArgValue
= DAG
.getNode(ISD::BITCAST
, DL
, VA
.getValVT(), ArgValue
);
6154 case CCValAssign::AExt
:
6155 case CCValAssign::SExt
:
6156 case CCValAssign::ZExt
:
6158 case CCValAssign::AExtUpper
:
6159 ArgValue
= DAG
.getNode(ISD::SRL
, DL
, RegVT
, ArgValue
,
6160 DAG
.getConstant(32, DL
, RegVT
));
6161 ArgValue
= DAG
.getZExtOrTrunc(ArgValue
, DL
, VA
.getValVT());
6164 } else { // VA.isRegLoc()
6165 assert(VA
.isMemLoc() && "CCValAssign is neither reg nor mem");
6166 unsigned ArgOffset
= VA
.getLocMemOffset();
6167 unsigned ArgSize
= (VA
.getLocInfo() == CCValAssign::Indirect
6168 ? VA
.getLocVT().getSizeInBits()
6169 : VA
.getValVT().getSizeInBits()) / 8;
6171 uint32_t BEAlign
= 0;
6172 if (!Subtarget
->isLittleEndian() && ArgSize
< 8 &&
6173 !Ins
[i
].Flags
.isInConsecutiveRegs())
6174 BEAlign
= 8 - ArgSize
;
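      // Editor's example of the big-endian adjustment above: a 2-byte argument
      // occupies an 8-byte stack slot, so on big-endian targets
      // BEAlign = 8 - 2 = 6 and the load address is bumped by 6 bytes to read
      // the value from the most-significant end of the slot.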
6177 MachinePointerInfo PtrInfo
;
6178 if (isVarArg
&& Subtarget
->isWindowsArm64EC()) {
6179 // In the ARM64EC varargs convention, fixed arguments on the stack are
6180 // accessed relative to x4, not sp.
6181 unsigned ObjOffset
= ArgOffset
+ BEAlign
;
6182 Register VReg
= MF
.addLiveIn(AArch64::X4
, &AArch64::GPR64RegClass
);
6183 SDValue Val
= DAG
.getCopyFromReg(Chain
, DL
, VReg
, MVT::i64
);
6184 FIN
= DAG
.getNode(ISD::ADD
, DL
, MVT::i64
, Val
,
6185 DAG
.getConstant(ObjOffset
, DL
, MVT::i64
));
6186 PtrInfo
= MachinePointerInfo::getUnknownStack(MF
);
6188 int FI
= MFI
.CreateFixedObject(ArgSize
, ArgOffset
+ BEAlign
, true);
6190 // Create load nodes to retrieve arguments from the stack.
6191 FIN
= DAG
.getFrameIndex(FI
, getPointerTy(DAG
.getDataLayout()));
6192 PtrInfo
= MachinePointerInfo::getFixedStack(MF
, FI
);
6195 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
6196 ISD::LoadExtType ExtType
= ISD::NON_EXTLOAD
;
6197 MVT MemVT
= VA
.getValVT();
6199 switch (VA
.getLocInfo()) {
6202 case CCValAssign::Trunc
:
6203 case CCValAssign::BCvt
:
6204 MemVT
= VA
.getLocVT();
6206 case CCValAssign::Indirect
:
6207 assert((VA
.getValVT().isScalableVector() ||
6208 Subtarget
->isWindowsArm64EC()) &&
6209 "Indirect arguments should be scalable on most subtargets");
6210 MemVT
= VA
.getLocVT();
6212 case CCValAssign::SExt
:
6213 ExtType
= ISD::SEXTLOAD
;
6215 case CCValAssign::ZExt
:
6216 ExtType
= ISD::ZEXTLOAD
;
6218 case CCValAssign::AExt
:
6219 ExtType
= ISD::EXTLOAD
;
6223 ArgValue
= DAG
.getExtLoad(ExtType
, DL
, VA
.getLocVT(), Chain
, FIN
, PtrInfo
,
6227 if (VA
.getLocInfo() == CCValAssign::Indirect
) {
6229 (VA
.getValVT().isScalableVector() || Subtarget
->isWindowsArm64EC()) &&
6230 "Indirect arguments should be scalable on most subtargets");
6232 uint64_t PartSize
= VA
.getValVT().getStoreSize().getKnownMinSize();
6233 unsigned NumParts
= 1;
6234 if (Ins
[i
].Flags
.isInConsecutiveRegs()) {
6235 assert(!Ins
[i
].Flags
.isInConsecutiveRegsLast());
6236 while (!Ins
[i
+ NumParts
- 1].Flags
.isInConsecutiveRegsLast())
6240 MVT PartLoad
= VA
.getValVT();
6241 SDValue Ptr
= ArgValue
;
6243 // Ensure we generate all loads for each tuple part, whilst updating the
6244 // pointer after each load correctly using vscale.
6245 while (NumParts
> 0) {
6246 ArgValue
= DAG
.getLoad(PartLoad
, DL
, Chain
, Ptr
, MachinePointerInfo());
6247 InVals
.push_back(ArgValue
);
6250 SDValue BytesIncrement
;
6251 if (PartLoad
.isScalableVector()) {
6252 BytesIncrement
= DAG
.getVScale(
6253 DL
, Ptr
.getValueType(),
6254 APInt(Ptr
.getValueSizeInBits().getFixedSize(), PartSize
));
6256 BytesIncrement
= DAG
.getConstant(
6257 APInt(Ptr
.getValueSizeInBits().getFixedSize(), PartSize
), DL
,
6258 Ptr
.getValueType());
6261 Flags
.setNoUnsignedWrap(true);
6262 Ptr
= DAG
.getNode(ISD::ADD
, DL
, Ptr
.getValueType(), Ptr
,
6263 BytesIncrement
, Flags
);
6269 if (Subtarget
->isTargetILP32() && Ins
[i
].Flags
.isPointer())
6270 ArgValue
= DAG
.getNode(ISD::AssertZext
, DL
, ArgValue
.getValueType(),
6271 ArgValue
, DAG
.getValueType(MVT::i32
));
6273 // i1 arguments are zero-extended to i8 by the caller. Emit a
6274 // hint to reflect this.
6275 if (Ins
[i
].isOrigArg()) {
6276 Argument
*OrigArg
= F
.getArg(Ins
[i
].getOrigArgIndex());
6277 if (OrigArg
->getType()->isIntegerTy(1)) {
6278 if (!Ins
[i
].Flags
.isZExt()) {
6279 ArgValue
= DAG
.getNode(AArch64ISD::ASSERT_ZEXT_BOOL
, DL
,
6280 ArgValue
.getValueType(), ArgValue
);
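        // Editor's note on the hint above: an i1 argument arrives as a byte
        // that the AAPCS requires to hold 0 or 1. ASSERT_ZEXT_BOOL records
        // that bits [7:1] are already zero, so later combines can drop a
        // redundant 'and' when the value is reused at a wider type.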
6285 InVals
.push_back(ArgValue
);
6288 assert((ArgLocs
.size() + ExtraArgLocs
) == Ins
.size());
6290 // Insert the SMSTART if this is a locally streaming function and
6291 // make sure it is Glued to the last CopyFromReg value.
6292 if (IsLocallyStreaming
) {
6293 const AArch64RegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
6294 Chain
= DAG
.getNode(
6295 AArch64ISD::SMSTART
, DL
, DAG
.getVTList(MVT::Other
, MVT::Glue
),
6297 DAG
.getTargetConstant((int32_t)AArch64SVCR::SVCRSM
, DL
, MVT::i32
),
6298 DAG
.getConstant(0, DL
, MVT::i64
), DAG
.getConstant(1, DL
, MVT::i64
),
6299 DAG
.getRegisterMask(TRI
->getSMStartStopCallPreservedMask()), Glue
});
6300 // Ensure that the SMSTART happens after the CopyWithChain such that its
6301 // chain result is used.
6302 for (unsigned I
=0; I
<InVals
.size(); ++I
) {
6303 Register Reg
= MF
.getRegInfo().createVirtualRegister(
6304 getRegClassFor(InVals
[I
].getValueType().getSimpleVT()));
6305 SDValue X
= DAG
.getCopyToReg(Chain
, DL
, Reg
, InVals
[I
]);
6306 InVals
[I
] = DAG
.getCopyFromReg(X
, DL
, Reg
,
6307 InVals
[I
].getValueType());
6313 if (!Subtarget
->isTargetDarwin() || IsWin64
) {
6314 // The AAPCS variadic function ABI is identical to the non-variadic
6315 // one. As a result there may be more arguments in registers and we should
6316 // save them for future reference.
6317 // Win64 variadic functions also pass arguments in registers, but all float
6318 // arguments are passed in integer registers.
6319 saveVarArgRegisters(CCInfo
, DAG
, DL
, Chain
);
6322 // This will point to the next argument passed via stack.
6323 unsigned StackOffset
= CCInfo
.getNextStackOffset();
6324 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6325 StackOffset
= alignTo(StackOffset
, Subtarget
->isTargetILP32() ? 4 : 8);
6326 FuncInfo
->setVarArgsStackOffset(StackOffset
);
6327 FuncInfo
->setVarArgsStackIndex(MFI
.CreateFixedObject(4, StackOffset
, true));
6329 if (MFI
.hasMustTailInVarArgFunc()) {
6330 SmallVector
<MVT
, 2> RegParmTypes
;
6331 RegParmTypes
.push_back(MVT::i64
);
6332 RegParmTypes
.push_back(MVT::f128
);
6333 // Compute the set of forwarded registers. The rest are scratch.
6334 SmallVectorImpl
<ForwardedRegister
> &Forwards
=
6335 FuncInfo
->getForwardedMustTailRegParms();
6336 CCInfo
.analyzeMustTailForwardedRegisters(Forwards
, RegParmTypes
,
6339 // Conservatively forward X8, since it might be used for aggregate return.
6340 if (!CCInfo
.isAllocated(AArch64::X8
)) {
6341 Register X8VReg
= MF
.addLiveIn(AArch64::X8
, &AArch64::GPR64RegClass
);
6342 Forwards
.push_back(ForwardedRegister(X8VReg
, AArch64::X8
, MVT::i64
));
6347 // On Windows, InReg pointers must be returned, so record the pointer in a
6348 // virtual register at the start of the function so it can be returned in the
6351 for (unsigned I
= 0, E
= Ins
.size(); I
!= E
; ++I
) {
6352 if (Ins
[I
].Flags
.isInReg() && Ins
[I
].Flags
.isSRet()) {
6353 assert(!FuncInfo
->getSRetReturnReg());
6355 MVT PtrTy
= getPointerTy(DAG
.getDataLayout());
6357 MF
.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy
));
6358 FuncInfo
->setSRetReturnReg(Reg
);
6360 SDValue Copy
= DAG
.getCopyToReg(DAG
.getEntryNode(), DL
, Reg
, InVals
[I
]);
6361 Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Copy
, Chain
);
6367 unsigned StackArgSize
= CCInfo
.getNextStackOffset();
6368 bool TailCallOpt
= MF
.getTarget().Options
.GuaranteedTailCallOpt
;
6369 if (DoesCalleeRestoreStack(CallConv
, TailCallOpt
)) {
6370 // This is a non-standard ABI so by fiat I say we're allowed to make full
6371 // use of the stack area to be popped, which must be aligned to 16 bytes in
6373 StackArgSize
= alignTo(StackArgSize
, 16);
6375 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
6376 // a multiple of 16.
6377 FuncInfo
->setArgumentStackToRestore(StackArgSize
);
6379 // This realignment carries over to the available bytes below. Our own
6380 // callers will guarantee the space is free by giving an aligned value to
6383 // Even if we're not expected to free up the space, it's useful to know how
6384 // much is there while considering tail calls (because we can reuse it).
6385 FuncInfo
->setBytesInStackArgArea(StackArgSize
);
6387 if (Subtarget
->hasCustomCallingConv())
6388 Subtarget
->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF
);
6390 if (requiresBufferForLazySave(MF
.getFunction())) {
6391 // Set up a buffer once and store the buffer in the MachineFunctionInfo.
6393 unsigned TPIDR2Obj
= allocateLazySaveBuffer(Chain
, DL
, DAG
, Reg
);
6394 FuncInfo
->setLazySaveBufferReg(Reg
);
6395 FuncInfo
->setLazySaveTPIDR2Obj(TPIDR2Obj
);
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool IsWin64 =
      Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());

  SmallVector<SDValue, 8> MemOps;

  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
                                          AArch64::X3, AArch64::X4, AArch64::X5,
                                          AArch64::X6, AArch64::X7 };
  unsigned NumGPRArgRegs = std::size(GPRArgRegs);
  if (Subtarget->isWindowsArm64EC()) {
    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
    // functions.
    NumGPRArgRegs = 4;
  }
  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    if (IsWin64) {
      GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
      if (GPRSaveSize & 15)
        // The extra size here, if triggered, will always be 8.
        MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
                              -(int)alignTo(GPRSaveSize, 16), false);
    } else
      GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);

    SDValue FIN;
    if (Subtarget->isWindowsArm64EC()) {
      // With the Arm64EC ABI, we reserve the save area as usual, but we
      // compute its address relative to x4. For a normal AArch64->AArch64
      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
      // different address.
      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
    } else
      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);

    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                       IsWin64 ? MachinePointerInfo::getFixedStack(
                                     MF, GPRIdx, (i - FirstVariadicGPR) * 8)
                               : MachinePointerInfo::getStack(MF, i * 8));
      MemOps.push_back(Store);
      FIN =
          DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  if (Subtarget->hasFPARMv8() && !IsWin64) {
    static const MCPhysReg FPRArgRegs[] = {
        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
    static const unsigned NumFPRArgRegs = std::size(FPRArgRegs);
    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
                                     MachinePointerInfo::getStack(MF, i * 16));
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                          DAG.getConstant(16, DL, PtrVT));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  DenseMap<unsigned, SDValue> CopiedRegs;
  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    // Avoid copying a physreg twice since RegAllocFast is incompetent and only
    // allows one use of a physreg per block.
    SDValue Val = CopiedRegs.lookup(VA.getLocReg());
    if (!Val) {
      Val =
          DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
      Chain = Val.getValue(1);
      InFlag = Val.getValue(2);
      CopiedRegs[VA.getLocReg()] = Val;
    }

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    case CCValAssign::AExtUpper:
      Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      [[fallthrough]];
    case CCValAssign::AExt:
      [[fallthrough]];
    case CCValAssign::ZExt:
      Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
  return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
         CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::AArch64_SVE_VectorCall:
  case CallingConv::PreserveMost:
  case CallingConv::Swift:
  case CallingConv::SwiftTail:
  case CallingConv::Tail:
  case CallingConv::Fast:
    return true;
  default:
    return false;
  }
}
static void analyzeCallOperands(const AArch64TargetLowering &TLI,
                                const AArch64Subtarget *Subtarget,
                                const TargetLowering::CallLoweringInfo &CLI,
                                CCState &CCInfo) {
  const SelectionDAG &DAG = CLI.DAG;
  CallingConv::ID CalleeCC = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);

  unsigned NumArgs = Outs.size();
  for (unsigned i = 0; i != NumArgs; ++i) {
    MVT ArgVT = Outs[i].VT;
    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;

    bool UseVarArgCC = false;
    if (IsVarArg) {
      // On Windows, the fixed arguments in a vararg call are passed in GPRs
      // too, so use the vararg CC to force them to integer registers.
      if (IsCalleeWin64) {
        UseVarArgCC = true;
      } else {
        UseVarArgCC = !Outs[i].IsFixed;
      }
    }
    // Get type of the original argument.
    EVT ActualVT =
        TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
                         /*AllowUnknown*/ true);
    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
      ArgVT = MVT::i8;
    else if (ActualMVT == MVT::i16)
      ArgVT = MVT::i16;

    CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
    bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
  }
}
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    const CallLoweringInfo &CLI) const {
  CallingConv::ID CalleeCC = CLI.CallConv;
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  SDValue Callee = CLI.Callee;
  bool IsVarArg = CLI.IsVarArg;
  const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  const SelectionDAG &DAG = CLI.DAG;
  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();

  // SME Streaming functions are not eligible for TCO as they may require
  // the streaming mode or ZA to be restored after returning from the call.
  SMEAttrs CallerAttrs(MF.getFunction());
  auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
  if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresLazySave(CalleeAttrs))
    return false;

  // Functions using the C or Fast calling convention that have an SVE
  // signature preserve more registers and should assume the SVE_VectorCall CC.
  // The check for matching callee-saved regs will determine whether it is
  // eligible for TCO.
  if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
      MF.getInfo<AArch64FunctionInfo>()->isSVECC())
    CallerCC = CallingConv::AArch64_SVE_VectorCall;

  bool CCMatch = CallerCC == CalleeCC;

  // When using the Windows calling convention on a non-windows OS, we want
  // to back up and restore X18 in such functions; we can't do a tail call
  // from those functions.
  if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
      CalleeCC != CallingConv::Win64)
    return false;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible (see
  // X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                    e = CallerF.arg_end();
       i != e; ++i) {
    if (i->hasByValAttr())
      return false;

    // On Windows, "inreg" attributes signify non-aggregate indirect returns.
    // In this case, it is necessary to save/restore X0 in the callee. Tail
    // call opt interferes with this. So we disable tail call opt when the
    // caller has an argument with "inreg" attribute.
    //
    // FIXME: Check whether the callee also has an "inreg" argument.
    if (i->hasInRegAttr())
      return false;
  }

  if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
    return CCMatch;

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple &TT = getTargetMachine().getTargetTriple();
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!IsVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  LLVMContext &C = *DAG.getContext();
  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  CCAssignFnForCall(CalleeCC, IsVarArg),
                                  CCAssignFnForCall(CallerCC, IsVarArg)))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (Subtarget->hasCustomCallingConv()) {
      TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
      TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
    }
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);

  analyzeCallOperands(*this, Subtarget, CLI, CCInfo);

  if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
    // When the call is musttail, additional checks have already been done, so
    // we can safely skip this check.
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    for (const CCValAssign &ArgLoc : ArgLocs)
      if (!ArgLoc.isRegLoc())
        return false;
  }

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If any of the arguments is passed indirectly, it must be SVE, so the
  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
  // allocate space on the stack. That is why we check explicitly here whether
  // any argument is passed indirectly; if so, the call cannot be a tail call.
  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
        assert((A.getLocInfo() != CCValAssign::Indirect ||
                A.getValVT().isScalableVector() ||
                Subtarget->isWindowsArm64EC()) &&
               "Expected value to be scalable");
        return A.getLocInfo() == CCValAssign::Indirect;
      }))
    return false;

  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
    return false;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
    return false;

  return true;
}
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                   SelectionDAG &DAG,
                                                   MachineFrameInfo &MFI,
                                                   int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument corresponding to this call.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses())
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
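// Editor's worked example for the overlap test above: if the clobbered fixed
// object covers bytes [16, 23] of the incoming argument area and an existing
// argument load reads bytes [20, 27], the two ranges intersect, so that
// load's chain value is added to the TokenFactor and the load is forced to
// complete before the tail-call store overwrites the slot.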
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  return (CallCC == CallingConv::Fast && TailCallOpt) ||
         CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
}
// Check if the value is zero-extended from i1 to i8
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
  unsigned SizeInBits = Arg.getValueType().getSizeInBits();
  if (SizeInBits < 8)
    return false;

  APInt RequiredZero(SizeInBits, 0xFE);
  KnownBits Bits = DAG.computeKnownBits(Arg, 4);
  bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
  return ZExtBool;
}
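// Editor's example: an i32 argument produced by a compare (SETCC) can only be
// 0 or 1, so its known-zero bits cover the 0xFE mask above and the caller can
// skip re-materialising the zero-extension to i8 that the AAPCS requires for
// i1 arguments.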
SDValue AArch64TargetLowering::changeStreamingMode(
    SelectionDAG &DAG, SDLoc DL, bool Enable,
    SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
  SDValue MSROp =
      DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);

  SDValue ExpectedSMVal =
      DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
  SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};

  if (InFlag)
    Ops.push_back(InFlag);

  unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
}
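// Editor's usage note, mirroring the call sites in LowerCall below: when a
// non-streaming caller calls a streaming callee, LowerCall invokes this with
// Enable=true/Entry=true before the call and Enable=false/Entry=false after
// it, producing an SMSTART/SMSTOP pair around the call; PStateSM and the
// expected value let the emitted mode switch be skipped when the current
// streaming state already matches.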
6845 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
6846 /// and add input and output parameter nodes.
6848 AArch64TargetLowering::LowerCall(CallLoweringInfo
&CLI
,
6849 SmallVectorImpl
<SDValue
> &InVals
) const {
6850 SelectionDAG
&DAG
= CLI
.DAG
;
6852 SmallVector
<ISD::OutputArg
, 32> &Outs
= CLI
.Outs
;
6853 SmallVector
<SDValue
, 32> &OutVals
= CLI
.OutVals
;
6854 SmallVector
<ISD::InputArg
, 32> &Ins
= CLI
.Ins
;
6855 SDValue Chain
= CLI
.Chain
;
6856 SDValue Callee
= CLI
.Callee
;
6857 bool &IsTailCall
= CLI
.IsTailCall
;
6858 CallingConv::ID
&CallConv
= CLI
.CallConv
;
6859 bool IsVarArg
= CLI
.IsVarArg
;
6861 MachineFunction
&MF
= DAG
.getMachineFunction();
6862 MachineFunction::CallSiteInfo CSInfo
;
6863 bool IsThisReturn
= false;
6865 AArch64FunctionInfo
*FuncInfo
= MF
.getInfo
<AArch64FunctionInfo
>();
6866 bool TailCallOpt
= MF
.getTarget().Options
.GuaranteedTailCallOpt
;
6867 bool IsCFICall
= CLI
.CB
&& CLI
.CB
->isIndirectCall() && CLI
.CFIType
;
6868 bool IsSibCall
= false;
6869 bool GuardWithBTI
= false;
6871 if (CLI
.CB
&& CLI
.CB
->getAttributes().hasFnAttr(Attribute::ReturnsTwice
) &&
6872 !Subtarget
->noBTIAtReturnTwice()) {
6873 GuardWithBTI
= FuncInfo
->branchTargetEnforcement();
6876 // Analyze operands of the call, assigning locations to each operand.
6877 SmallVector
<CCValAssign
, 16> ArgLocs
;
6878 CCState
CCInfo(CallConv
, IsVarArg
, MF
, ArgLocs
, *DAG
.getContext());
6881 unsigned NumArgs
= Outs
.size();
6883 for (unsigned i
= 0; i
!= NumArgs
; ++i
) {
6884 if (!Outs
[i
].IsFixed
&& Outs
[i
].VT
.isScalableVector())
6885 report_fatal_error("Passing SVE types to variadic functions is "
6886 "currently not supported");
6890 analyzeCallOperands(*this, Subtarget
, CLI
, CCInfo
);
6892 CCAssignFn
*RetCC
= CCAssignFnForReturn(CallConv
);
6893 // Assign locations to each value returned by this call.
6894 SmallVector
<CCValAssign
, 16> RVLocs
;
6895 CCState
RetCCInfo(CallConv
, IsVarArg
, DAG
.getMachineFunction(), RVLocs
,
6897 RetCCInfo
.AnalyzeCallResult(Ins
, RetCC
);
6899 // Check callee args/returns for SVE registers and set calling convention
6901 if (CallConv
== CallingConv::C
|| CallConv
== CallingConv::Fast
) {
6902 auto HasSVERegLoc
= [](CCValAssign
&Loc
) {
6903 if (!Loc
.isRegLoc())
6905 return AArch64::ZPRRegClass
.contains(Loc
.getLocReg()) ||
6906 AArch64::PPRRegClass
.contains(Loc
.getLocReg());
6908 if (any_of(RVLocs
, HasSVERegLoc
) || any_of(ArgLocs
, HasSVERegLoc
))
6909 CallConv
= CallingConv::AArch64_SVE_VectorCall
;
6913 // Check if it's really possible to do a tail call.
6914 IsTailCall
= isEligibleForTailCallOptimization(CLI
);
6916 // A sibling call is one where we're under the usual C ABI and not planning
6917 // to change that but can still do a tail call:
6918 if (!TailCallOpt
&& IsTailCall
&& CallConv
!= CallingConv::Tail
&&
6919 CallConv
!= CallingConv::SwiftTail
)
6926 if (!IsTailCall
&& CLI
.CB
&& CLI
.CB
->isMustTailCall())
6927 report_fatal_error("failed to perform tail call elimination on a call "
6928 "site marked musttail");
6930 // Get a count of how many bytes are to be pushed on the stack.
6931 unsigned NumBytes
= CCInfo
.getNextStackOffset();
6934 // Since we're not changing the ABI to make this a tail call, the memory
6935 // operands are already available in the caller's incoming argument space.
6939 // FPDiff is the byte offset of the call's argument area from the callee's.
6940 // Stores to callee stack arguments will be placed in FixedStackSlots offset
6941 // by this amount for a tail call. In a sibling call it must be 0 because the
6942 // caller will deallocate the entire stack and the callee still expects its
6943 // arguments to begin at SP+0. Completely unused for non-tail calls.
6946 if (IsTailCall
&& !IsSibCall
) {
6947 unsigned NumReusableBytes
= FuncInfo
->getBytesInStackArgArea();
6949 // Since callee will pop argument stack as a tail call, we must keep the
6950 // popped size 16-byte aligned.
6951 NumBytes
= alignTo(NumBytes
, 16);
6953 // FPDiff will be negative if this tail call requires more space than we
6954 // would automatically have in our incoming argument space. Positive if we
6955 // can actually shrink the stack.
6956 FPDiff
= NumReusableBytes
- NumBytes
;
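    // Editor's worked example: if the caller has 32 bytes of incoming stack
    // arguments (NumReusableBytes = 32) and this tail call needs 48 bytes once
    // aligned to 16, then FPDiff = 32 - 48 = -16, so 16 extra bytes must be
    // reserved and the callee-bound stack slots are created at offsets
    // adjusted by FPDiff.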
6958 // Update the required reserved area if this is the tail call requiring the
6959 // most argument stack space.
6960 if (FPDiff
< 0 && FuncInfo
->getTailCallReservedStack() < (unsigned)-FPDiff
)
6961 FuncInfo
->setTailCallReservedStack(-FPDiff
);
6963 // The stack pointer must be 16-byte aligned at all times it's used for a
6964 // memory operation, which in practice means at *all* times and in
6965 // particular across call boundaries. Therefore our own arguments started at
6966 // a 16-byte aligned SP and the delta applied for the tail call should
6967 // satisfy the same constraint.
6968 assert(FPDiff
% 16 == 0 && "unaligned stack on tail call");
6971 // Determine whether we need any streaming mode changes.
6972 SMEAttrs CalleeAttrs
, CallerAttrs(MF
.getFunction());
6974 CalleeAttrs
= SMEAttrs(*CLI
.CB
);
6975 else if (Optional
<SMEAttrs
> Attrs
=
6976 getCalleeAttrsFromExternalFunction(CLI
.Callee
))
6977 CalleeAttrs
= *Attrs
;
6979 bool RequiresLazySave
= CallerAttrs
.requiresLazySave(CalleeAttrs
);
6981 MachineFrameInfo
&MFI
= MF
.getFrameInfo();
6982 if (RequiresLazySave
) {
6983 // Set up a lazy save mechanism by storing the runtime live slices
6984 // (worst-case N*N) to the TPIDR2 stack object.
6985 SDValue N
= DAG
.getNode(AArch64ISD::RDSVL
, DL
, MVT::i64
,
6986 DAG
.getConstant(1, DL
, MVT::i32
));
6987 SDValue NN
= DAG
.getNode(ISD::MUL
, DL
, MVT::i64
, N
, N
);
6988 unsigned TPIDR2Obj
= FuncInfo
->getLazySaveTPIDR2Obj();
6992 TPIDR2Obj
= allocateLazySaveBuffer(Chain
, DL
, DAG
, Reg
);
6995 MachinePointerInfo MPI
= MachinePointerInfo::getStack(MF
, TPIDR2Obj
);
6996 SDValue TPIDR2ObjAddr
= DAG
.getFrameIndex(TPIDR2Obj
,
6997 DAG
.getTargetLoweringInfo().getFrameIndexTy(DAG
.getDataLayout()));
6998 SDValue BufferPtrAddr
=
6999 DAG
.getNode(ISD::ADD
, DL
, TPIDR2ObjAddr
.getValueType(), TPIDR2ObjAddr
,
7000 DAG
.getConstant(8, DL
, TPIDR2ObjAddr
.getValueType()));
7001 Chain
= DAG
.getTruncStore(Chain
, DL
, NN
, BufferPtrAddr
, MPI
, MVT::i16
);
7002 Chain
= DAG
.getNode(
7003 ISD::INTRINSIC_VOID
, DL
, MVT::Other
, Chain
,
7004 DAG
.getConstant(Intrinsic::aarch64_sme_set_tpidr2
, DL
, MVT::i32
),
7009 Optional
<bool> RequiresSMChange
= CallerAttrs
.requiresSMChange(CalleeAttrs
);
7010 if (RequiresSMChange
)
7011 PStateSM
= getPStateSM(DAG
, Chain
, CallerAttrs
, DL
, MVT::i64
);
7013 // Adjust the stack pointer for the new arguments...
7014 // These operations are automatically eliminated by the prolog/epilog pass
7016 Chain
= DAG
.getCALLSEQ_START(Chain
, IsTailCall
? 0 : NumBytes
, 0, DL
);
7018 SDValue StackPtr
= DAG
.getCopyFromReg(Chain
, DL
, AArch64::SP
,
7019 getPointerTy(DAG
.getDataLayout()));
7021 SmallVector
<std::pair
<unsigned, SDValue
>, 8> RegsToPass
;
7022 SmallSet
<unsigned, 8> RegsUsed
;
7023 SmallVector
<SDValue
, 8> MemOpChains
;
7024 auto PtrVT
= getPointerTy(DAG
.getDataLayout());
7026 if (IsVarArg
&& CLI
.CB
&& CLI
.CB
->isMustTailCall()) {
7027 const auto &Forwards
= FuncInfo
->getForwardedMustTailRegParms();
7028 for (const auto &F
: Forwards
) {
7029 SDValue Val
= DAG
.getCopyFromReg(Chain
, DL
, F
.VReg
, F
.VT
);
7030 RegsToPass
.emplace_back(F
.PReg
, Val
);
7034 // Walk the register/memloc assignments, inserting copies/loads.
7035 unsigned ExtraArgLocs
= 0;
7036 for (unsigned i
= 0, e
= Outs
.size(); i
!= e
; ++i
) {
7037 CCValAssign
&VA
= ArgLocs
[i
- ExtraArgLocs
];
7038 SDValue Arg
= OutVals
[i
];
7039 ISD::ArgFlagsTy Flags
= Outs
[i
].Flags
;
7041 // Promote the value if needed.
7042 switch (VA
.getLocInfo()) {
7044 llvm_unreachable("Unknown loc info!");
7045 case CCValAssign::Full
:
7047 case CCValAssign::SExt
:
7048 Arg
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, VA
.getLocVT(), Arg
);
7050 case CCValAssign::ZExt
:
7051 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, VA
.getLocVT(), Arg
);
7053 case CCValAssign::AExt
:
7054 if (Outs
[i
].ArgVT
== MVT::i1
) {
7055 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
7057 // Check if we actually have to do this, because the value may
7058 // already be zero-extended.
7060 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
7061 // and rely on DAGCombiner to fold this, because the following
7062 // (anyext i32) is combined with (zext i8) in DAG.getNode:
7064 // (ext (zext x)) -> (zext x)
7066 // This will give us (zext i32), which we cannot remove, so
7067 // try to check this beforehand.
7068 if (!checkZExtBool(Arg
, DAG
)) {
7069 Arg
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::i1
, Arg
);
7070 Arg
= DAG
.getNode(ISD::ZERO_EXTEND
, DL
, MVT::i8
, Arg
);
7073 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
7075 case CCValAssign::AExtUpper
:
7076 assert(VA
.getValVT() == MVT::i32
&& "only expect 32 -> 64 upper bits");
7077 Arg
= DAG
.getNode(ISD::ANY_EXTEND
, DL
, VA
.getLocVT(), Arg
);
7078 Arg
= DAG
.getNode(ISD::SHL
, DL
, VA
.getLocVT(), Arg
,
7079 DAG
.getConstant(32, DL
, VA
.getLocVT()));
7081 case CCValAssign::BCvt
:
7082 Arg
= DAG
.getBitcast(VA
.getLocVT(), Arg
);
7084 case CCValAssign::Trunc
:
7085 Arg
= DAG
.getZExtOrTrunc(Arg
, DL
, VA
.getLocVT());
7087 case CCValAssign::FPExt
:
7088 Arg
= DAG
.getNode(ISD::FP_EXTEND
, DL
, VA
.getLocVT(), Arg
);
7090 case CCValAssign::Indirect
:
7091 bool isScalable
= VA
.getValVT().isScalableVector();
7092 assert((isScalable
|| Subtarget
->isWindowsArm64EC()) &&
7093 "Indirect arguments should be scalable on most subtargets");
7095 uint64_t StoreSize
= VA
.getValVT().getStoreSize().getKnownMinSize();
7096 uint64_t PartSize
= StoreSize
;
7097 unsigned NumParts
= 1;
7098 if (Outs
[i
].Flags
.isInConsecutiveRegs()) {
7099 assert(!Outs
[i
].Flags
.isInConsecutiveRegsLast());
7100 while (!Outs
[i
+ NumParts
- 1].Flags
.isInConsecutiveRegsLast())
7102 StoreSize
*= NumParts
;
7105 Type
*Ty
= EVT(VA
.getValVT()).getTypeForEVT(*DAG
.getContext());
7106 Align Alignment
= DAG
.getDataLayout().getPrefTypeAlign(Ty
);
7107 int FI
= MFI
.CreateStackObject(StoreSize
, Alignment
, false);
7109 MFI
.setStackID(FI
, TargetStackID::ScalableVector
);
7111 MachinePointerInfo MPI
= MachinePointerInfo::getFixedStack(MF
, FI
);
7112 SDValue Ptr
= DAG
.getFrameIndex(
7113 FI
, DAG
.getTargetLoweringInfo().getFrameIndexTy(DAG
.getDataLayout()));
7114 SDValue SpillSlot
= Ptr
;
7116 // Ensure we generate all stores for each tuple part, whilst updating the
7117 // pointer after each store correctly using vscale.
7119 Chain
= DAG
.getStore(Chain
, DL
, OutVals
[i
], Ptr
, MPI
);
7122 SDValue BytesIncrement
;
7124 BytesIncrement
= DAG
.getVScale(
7125 DL
, Ptr
.getValueType(),
7126 APInt(Ptr
.getValueSizeInBits().getFixedSize(), PartSize
));
7128 BytesIncrement
= DAG
.getConstant(
7129 APInt(Ptr
.getValueSizeInBits().getFixedSize(), PartSize
), DL
,
7130 Ptr
.getValueType());
7133 Flags
.setNoUnsignedWrap(true);
7135 MPI
= MachinePointerInfo(MPI
.getAddrSpace());
7136 Ptr
= DAG
.getNode(ISD::ADD
, DL
, Ptr
.getValueType(), Ptr
,
7137 BytesIncrement
, Flags
);
7147 if (VA
.isRegLoc()) {
7148 if (i
== 0 && Flags
.isReturned() && !Flags
.isSwiftSelf() &&
7149 Outs
[0].VT
== MVT::i64
) {
7150 assert(VA
.getLocVT() == MVT::i64
&&
7151 "unexpected calling convention register assignment");
7152 assert(!Ins
.empty() && Ins
[0].VT
== MVT::i64
&&
7153 "unexpected use of 'returned'");
7154 IsThisReturn
= true;
7156 if (RegsUsed
.count(VA
.getLocReg())) {
7157 // If this register has already been used then we're trying to pack
7158 // parts of an [N x i32] into an X-register. The extension type will
7159 // take care of putting the two halves in the right place but we have to
7162 llvm::find_if(RegsToPass
,
7163 [=](const std::pair
<unsigned, SDValue
> &Elt
) {
7164 return Elt
.first
== VA
.getLocReg();
7167 Bits
= DAG
.getNode(ISD::OR
, DL
, Bits
.getValueType(), Bits
, Arg
);
7168 // Call site info is used for function's parameter entry value
7169 // tracking. For now we track only simple cases when parameter
7170 // is transferred through whole register.
7171 llvm::erase_if(CSInfo
, [&VA
](MachineFunction::ArgRegPair ArgReg
) {
7172 return ArgReg
.Reg
== VA
.getLocReg();
7175 // Add an extra level of indirection for streaming mode changes by
7176 // using a pseudo copy node that cannot be rematerialised between a
7177 // smstart/smstop and the call by the simple register coalescer.
7178 if (RequiresSMChange
&& isa
<FrameIndexSDNode
>(Arg
))
7179 Arg
= DAG
.getNode(AArch64ISD::OBSCURE_COPY
, DL
, MVT::i64
, Arg
);
7180 RegsToPass
.emplace_back(VA
.getLocReg(), Arg
);
7181 RegsUsed
.insert(VA
.getLocReg());
7182 const TargetOptions
&Options
= DAG
.getTarget().Options
;
7183 if (Options
.EmitCallSiteInfo
)
7184 CSInfo
.emplace_back(VA
.getLocReg(), i
);
7187 assert(VA
.isMemLoc());
7190 MachinePointerInfo DstInfo
;
7192 // FIXME: This works on big-endian for composite byvals, which are the
7193 // common case. It should also work for fundamental types too.
7194 uint32_t BEAlign
= 0;
7196 if (VA
.getLocInfo() == CCValAssign::Indirect
||
7197 VA
.getLocInfo() == CCValAssign::Trunc
)
7198 OpSize
= VA
.getLocVT().getFixedSizeInBits();
7200 OpSize
= Flags
.isByVal() ? Flags
.getByValSize() * 8
7201 : VA
.getValVT().getSizeInBits();
7202 OpSize
= (OpSize
+ 7) / 8;
7203 if (!Subtarget
->isLittleEndian() && !Flags
.isByVal() &&
7204 !Flags
.isInConsecutiveRegs()) {
7206 BEAlign
= 8 - OpSize
;
7208 unsigned LocMemOffset
= VA
.getLocMemOffset();
7209 int32_t Offset
= LocMemOffset
+ BEAlign
;
7210 SDValue PtrOff
= DAG
.getIntPtrConstant(Offset
, DL
);
7211 PtrOff
= DAG
.getNode(ISD::ADD
, DL
, PtrVT
, StackPtr
, PtrOff
);
7214 Offset
= Offset
+ FPDiff
;
7215 int FI
= MF
.getFrameInfo().CreateFixedObject(OpSize
, Offset
, true);
7217 DstAddr
= DAG
.getFrameIndex(FI
, PtrVT
);
7218 DstInfo
= MachinePointerInfo::getFixedStack(MF
, FI
);
7220 // Make sure any stack arguments overlapping with where we're storing
7221 // are loaded before this eventual operation. Otherwise they'll be
7223 Chain
= addTokenForArgument(Chain
, DAG
, MF
.getFrameInfo(), FI
);
7225 SDValue PtrOff
= DAG
.getIntPtrConstant(Offset
, DL
);
7227 DstAddr
= DAG
.getNode(ISD::ADD
, DL
, PtrVT
, StackPtr
, PtrOff
);
7228 DstInfo
= MachinePointerInfo::getStack(MF
, LocMemOffset
);
7231 if (Outs
[i
].Flags
.isByVal()) {
7233 DAG
.getConstant(Outs
[i
].Flags
.getByValSize(), DL
, MVT::i64
);
7234 SDValue Cpy
= DAG
.getMemcpy(
7235 Chain
, DL
, DstAddr
, Arg
, SizeNode
,
7236 Outs
[i
].Flags
.getNonZeroByValAlign(),
7237 /*isVol = */ false, /*AlwaysInline = */ false,
7238 /*isTailCall = */ false, DstInfo
, MachinePointerInfo());
7240 MemOpChains
.push_back(Cpy
);
7242 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
7243 // promoted to a legal register type i32, we should truncate Arg back to
7245 if (VA
.getValVT() == MVT::i1
|| VA
.getValVT() == MVT::i8
||
7246 VA
.getValVT() == MVT::i16
)
7247 Arg
= DAG
.getNode(ISD::TRUNCATE
, DL
, VA
.getValVT(), Arg
);
7249 SDValue Store
= DAG
.getStore(Chain
, DL
, Arg
, DstAddr
, DstInfo
);
7250 MemOpChains
.push_back(Store
);
7255 if (IsVarArg
&& Subtarget
->isWindowsArm64EC()) {
7256 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
7257 // describing the argument list. x4 contains the address of the
7258 // first stack parameter. x5 contains the size in bytes of all parameters
7259 // passed on the stack.
7260 RegsToPass
.emplace_back(AArch64::X4
, StackPtr
);
7261 RegsToPass
.emplace_back(AArch64::X5
,
7262 DAG
.getConstant(NumBytes
, DL
, MVT::i64
));
7265 if (!MemOpChains
.empty())
7266 Chain
= DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, MemOpChains
);
7269 if (RequiresSMChange
) {
7270 SDValue NewChain
= changeStreamingMode(DAG
, DL
, *RequiresSMChange
, Chain
,
7271 InFlag
, PStateSM
, true);
7272 Chain
= NewChain
.getValue(0);
7273 InFlag
= NewChain
.getValue(1);
7276 // Build a sequence of copy-to-reg nodes chained together with token chain
7277 // and flag operands which copy the outgoing args into the appropriate regs.
7278 for (auto &RegToPass
: RegsToPass
) {
7279 Chain
= DAG
.getCopyToReg(Chain
, DL
, RegToPass
.first
,
7280 RegToPass
.second
, InFlag
);
7281 InFlag
= Chain
.getValue(1);
7284 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
7285 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
7286 // node so that legalize doesn't hack it.
7287 if (auto *G
= dyn_cast
<GlobalAddressSDNode
>(Callee
)) {
7288 auto GV
= G
->getGlobal();
7290 Subtarget
->classifyGlobalFunctionReference(GV
, getTargetMachine());
7291 if (OpFlags
& AArch64II::MO_GOT
) {
7292 Callee
= DAG
.getTargetGlobalAddress(GV
, DL
, PtrVT
, 0, OpFlags
);
7293 Callee
= DAG
.getNode(AArch64ISD::LOADgot
, DL
, PtrVT
, Callee
);
7295 const GlobalValue
*GV
= G
->getGlobal();
7296 Callee
= DAG
.getTargetGlobalAddress(GV
, DL
, PtrVT
, 0, 0);
7298 } else if (auto *S
= dyn_cast
<ExternalSymbolSDNode
>(Callee
)) {
7299 if (getTargetMachine().getCodeModel() == CodeModel::Large
&&
7300 Subtarget
->isTargetMachO()) {
7301 const char *Sym
= S
->getSymbol();
7302 Callee
= DAG
.getTargetExternalSymbol(Sym
, PtrVT
, AArch64II::MO_GOT
);
7303 Callee
= DAG
.getNode(AArch64ISD::LOADgot
, DL
, PtrVT
, Callee
);
7305 const char *Sym
= S
->getSymbol();
7306 Callee
= DAG
.getTargetExternalSymbol(Sym
, PtrVT
, 0);
7310 // We don't usually want to end the call-sequence here because we would tidy
7311 // the frame up *after* the call, however in the ABI-changing tail-call case
7312 // we've carefully laid out the parameters so that when sp is reset they'll be
7313 // in the correct location.
7314 if (IsTailCall
&& !IsSibCall
) {
7315 Chain
= DAG
.getCALLSEQ_END(Chain
, 0, 0, InFlag
, DL
);
7316 InFlag
= Chain
.getValue(1);
7319 std::vector
<SDValue
> Ops
;
7320 Ops
.push_back(Chain
);
7321 Ops
.push_back(Callee
);
7324 // Each tail call may have to adjust the stack by a different amount, so
7325 // this information must travel along with the operation for eventual
7326 // consumption by emitEpilogue.
7327 Ops
.push_back(DAG
.getTargetConstant(FPDiff
, DL
, MVT::i32
));
7330 // Add argument registers to the end of the list so that they are known live
7332 for (auto &RegToPass
: RegsToPass
)
7333 Ops
.push_back(DAG
.getRegister(RegToPass
.first
,
7334 RegToPass
.second
.getValueType()));
7336 // Add a register mask operand representing the call-preserved registers.
7337 const uint32_t *Mask
;
7338 const AArch64RegisterInfo
*TRI
= Subtarget
->getRegisterInfo();
7340 // For 'this' returns, use the X0-preserving mask if applicable
7341 Mask
= TRI
->getThisReturnPreservedMask(MF
, CallConv
);
7343 IsThisReturn
= false;
7344 Mask
= TRI
->getCallPreservedMask(MF
, CallConv
);
7347 Mask
= TRI
->getCallPreservedMask(MF
, CallConv
);
7349 if (Subtarget
->hasCustomCallingConv())
7350 TRI
->UpdateCustomCallPreservedMask(MF
, &Mask
);
7352 if (TRI
->isAnyArgRegReserved(MF
))
7353 TRI
->emitReservedArgRegCallError(MF
);
7355 assert(Mask
&& "Missing call preserved mask for calling convention");
7356 Ops
.push_back(DAG
.getRegisterMask(Mask
));
7358 if (InFlag
.getNode())
7359 Ops
.push_back(InFlag
);
7361 SDVTList NodeTys
= DAG
.getVTList(MVT::Other
, MVT::Glue
);
7363 // If we're doing a tall call, use a TC_RETURN here rather than an
7364 // actual call instruction.
7366 MF
.getFrameInfo().setHasTailCall();
7367 SDValue Ret
= DAG
.getNode(AArch64ISD::TC_RETURN
, DL
, NodeTys
, Ops
);
7370 Ret
.getNode()->setCFIType(CLI
.CFIType
->getZExtValue());
7372 DAG
.addCallSiteInfo(Ret
.getNode(), std::move(CSInfo
));

  unsigned CallOpc = AArch64ISD::CALL;
  // Calls with operand bundle "clang.arc.attachedcall" are special. They should
  // be expanded to the call, directly followed by a special marker sequence and
  // a call to an ObjC library function. Use CALL_RVMARKER to do that.
  if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
    assert(!IsTailCall &&
           "tail calls cannot be marked with clang.arc.attachedcall");
    CallOpc = AArch64ISD::CALL_RVMARKER;

    // Add a target global address for the retainRV/claimRV runtime function
    // just before the call target.
    Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
    auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
    Ops.insert(Ops.begin() + 1, GA);
  } else if (GuardWithBTI)
    CallOpc = AArch64ISD::CALL_BTI;

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);

  if (CLI.CFIType)
    Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());

  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
  InFlag = Chain.getValue(1);
  DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));

  uint64_t CalleePopBytes =
      DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
                                   DL, DAG, InVals, IsThisReturn,
                                   IsThisReturn ? OutVals[0] : SDValue());

  if (!Ins.empty())
    InFlag = Result.getValue(Result->getNumValues() - 1);

  if (RequiresSMChange) {
    assert(PStateSM && "Expected a PStateSM to be set");
    Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
                                 PStateSM, false);
  }

  if (RequiresLazySave) {
    // Unconditionally resume ZA.
    Result = DAG.getNode(
        AArch64ISD::SMSTART, DL, MVT::Other, Result,
        DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));

    // Conditionally restore the lazy save using a pseudo node.
    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
    SDValue RegMask = DAG.getRegisterMask(
        TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
    SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
        "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
    SDValue TPIDR2_EL0 = DAG.getNode(
        ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
        DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));

    // Copy the address of the TPIDR2 block into X0 before 'calling' the
    // RESTORE_ZA pseudo.
    SDValue Glue;
    SDValue TPIDR2Block = DAG.getFrameIndex(
        FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
    Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
    Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
                         {Result, TPIDR2_EL0,
                          DAG.getRegister(AArch64::X0, MVT::i64),
                          RestoreRoutine, RegMask, Result.getValue(1)});

    // Finally reset the TPIDR2_EL0 register to 0.
    Result = DAG.getNode(
        ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
        DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i64));
  }

  if (RequiresSMChange || RequiresLazySave) {
    for (unsigned I = 0; I < InVals.size(); ++I) {
      // The smstart/smstop is chained as part of the call, but when the
      // resulting chain is discarded (which happens when the call is not part
      // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
      // smstart/smstop is chained to the result value. We can do that by doing
      // a vreg -> vreg copy.
      Register Reg = MF.getRegInfo().createVirtualRegister(
          getRegClassFor(InVals[I].getValueType().getSimpleVT()));
      SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
      InVals[I] = DAG.getCopyFromReg(X, DL, Reg, InVals[I].getValueType());
    }
  }

  return Result;
}

bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}

SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &DL, SelectionDAG &DAG) const {
  auto &MF = DAG.getMachineFunction();
  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Flag;
  SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
  SmallSet<unsigned, 4> RegsUsed;
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
    case CCValAssign::ZExt:
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      break;
    case CCValAssign::AExtUpper:
      assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
      Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
      Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
                        DAG.getConstant(32, DL, VA.getLocVT()));
      break;
    }

    if (RegsUsed.count(VA.getLocReg())) {
      SDValue &Bits =
          llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
            return Elt.first == VA.getLocReg();
          })->second;
      Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
    } else {
      RetVals.emplace_back(VA.getLocReg(), Arg);
      RegsUsed.insert(VA.getLocReg());
    }
  }

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();

  // Emit SMSTOP before returning from a locally streaming function
  SMEAttrs FuncAttrs(MF.getFunction());
  if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
    Chain = DAG.getNode(
        AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
        DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
    Flag = Chain.getValue(1);
  }

  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (auto &RetVal : RetVals) {
    Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(
        DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
  }

  // Windows AArch64 ABIs require that for returning structs by value we copy
  // the sret argument into X0 for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into X0.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg = AArch64::X0;
    Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
  }

  const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
  if (I) {
    for (; *I; ++I) {
      if (AArch64::GPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else if (AArch64::FPR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}

//===----------------------------------------------------------------------===//
//                             Other Lowering Code
//===----------------------------------------------------------------------===//

SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
                                    N->getOffset(), Flag);
}

SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}

SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
                                   N->getOffset(), Flag);
}

SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
                                             SelectionDAG &DAG,
                                             unsigned Flag) const {
  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}
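
// As a rough illustration of the helpers below: getAddr materializes a small
// code model address as "adrp x0, sym; add x0, x0, :lo12:sym", getAddrTiny
// uses a single "adr x0, sym", getAddrLarge builds the full 64-bit address
// with a movz/movk-style wrapper (MO_G3..MO_G0), and getGOT instead loads the
// address from the GOT (typically adrp + ldr).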

template <class NodeTy>
SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
                                      unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
  // FIXME: Once remat is capable of dealing with instructions with register
  // operands, expand this into two nodes instead of using a wrapper node.
  return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
}

// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                            unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  const unsigned char MO_NC = AArch64II::MO_NC;
  return DAG.getNode(
      AArch64ISD::WrapperLarge, DL, Ty,
      getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
      getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
}

// (addlow (adrp %hi(sym)) %lo(sym))
template <class NodeTy>
SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
  SDValue Lo = getTargetNode(N, Ty, DAG,
                             AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
}

template <class NodeTy>
SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
                                           unsigned Flags) const {
  LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
  SDLoc DL(N);
  EVT Ty = getPointerTy(DAG.getDataLayout());
  SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
  return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
}

SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  if (OpFlags != AArch64II::MO_NO_FLAG)
    assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
           "unexpected offset in global node");

  // This also catches the large code model case for Darwin, and tiny code
  // model with got relocations.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    return getGOT(GN, DAG, OpFlags);
  }

  SDValue Result;
  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    Result = getAddrLarge(GN, DAG, OpFlags);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    Result = getAddrTiny(GN, DAG, OpFlags);
  } else {
    Result = getAddr(GN, DAG, OpFlags);
  }
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(GN);
  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
                 AArch64II::MO_COFFSTUB))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  return Result;
}

/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///     + "extern __thread" declaration.
///     + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///                                      ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "This function expects a Darwin target");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy(DAG.getDataLayout());
  MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet = DAG.getLoad(
      PtrMemVT, DL, Chain, DescAddr,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()),
      Align(PtrMemVT.getSizeInBits() / 8),
      MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
  Chain = FuncTLVGet.getValue(1);

  // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
  FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);

  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getTLSCallPreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}

/// Convert a thread-local variable reference into a sequence of instructions to
/// compute the variable's address for the local exec TLS model of ELF targets.
/// The sequence depends on the maximum TLS area size.
SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
                                                    SDValue ThreadBase,
                                                    const SDLoc &DL,
                                                    SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue TPOff, Addr;

  switch (DAG.getTarget().Options.TLSSize) {
  default:
    llvm_unreachable("Unexpected TLS size");

  case 12: {
    // mrs   x0, TPIDR_EL0
    // add   x0, x0, :tprel_lo12:a
    SDValue Var = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      Var,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 24: {
    // mrs   x0, TPIDR_EL0
    // add   x0, x0, :tprel_hi12:a
    // add   x0, x0, :tprel_lo12_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                      HiVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, LoVar,
                                      DAG.getTargetConstant(0, DL, MVT::i32)),
                   0);
  }

  case 32: {
    // mrs   x1, TPIDR_EL0
    // movz  x0, #:tprel_g1:a
    // movk  x0, #:tprel_g0_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }

  case 48: {
    // mrs   x1, TPIDR_EL0
    // movz  x0, #:tprel_g2:a
    // movk  x0, #:tprel_g1_nc:a
    // movk  x0, #:tprel_g0_nc:a
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
    SDValue MiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
                                       DAG.getTargetConstant(32, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
                                       DAG.getTargetConstant(16, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
  }
  }
}

/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
///    adrp  x0, :tlsdesc:var
///    ldr   x1, [x0, #:tlsdesc_lo12:var]
///    add   x0, x0, #:tlsdesc_lo12:var
///    .tlsdesccall var
///    blr   x1
///    (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue
AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  Chain =
      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
  SDValue Glue = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}

SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
    if (Model == TLSModel::LocalDynamic)
      Model = TLSModel::GeneralDynamic;
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      Model != TLSModel::LocalExec)
    report_fatal_error("ELF TLS only supported in small memory model or "
                       "in local exec TLS model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add tiny and large code model support for TLS access models other
  // than local exec. We currently generate the same code as small for tiny,
  // which may be larger than needed.

  SDValue TPOff;
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
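
// On Windows the address of a TLS variable is, roughly, computed as:
//   TEB (x18) -> ThreadLocalStoragePointer at TEB+0x58
//             -> this module's TLS block at TlsArray[_tls_index * 8]
//             -> block pointer + the variable's offset from the .tls base
// which is the exact sequence the lowering below builds.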

SDValue
AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");

  SDValue Chain = DAG.getEntryNode();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);

  // Load the ThreadLocalStoragePointer from the TEB
  // A pointer to the TLS array is located at offset 0x58 from the TEB.
  SDValue TLSArray =
      DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
  TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
  Chain = TLSArray.getValue(1);

  // Load the TLS index from the C runtime;
  // This does the same as getAddr(), but without having a GlobalAddressSDNode.
  // This also does the same as LOADgot, but using a generic i32 load,
  // while LOADgot only loads i64.
  SDValue TLSIndexHi =
      DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
  SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
      "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
  SDValue TLSIndex =
      DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
  TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
  Chain = TLSIndex.getValue(1);

  // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
  // offset into the TLSArray.
  TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
  SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                             DAG.getConstant(3, DL, PtrVT));
  SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                            MachinePointerInfo());
  Chain = TLS.getValue(1);

  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();
  SDValue TGAHi = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
  SDValue TGALo = DAG.getTargetGlobalAddress(
      GV, DL, PtrVT, 0,
      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

  // Add the offset from the start of the .tls section (section base).
  SDValue Addr =
      SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
                                 DAG.getTargetConstant(0, DL, MVT::i32)),
              0);
  Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
  return Addr;
}

SDValue
AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);
  if (Subtarget->isTargetWindows())
    return LowerWindowsGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}

// Looks through \param Val to determine the bit that can be used to
// check the sign of the value. It returns the unextended value and
// the sign bit position.
std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
  if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
    return {Val.getOperand(0),
            cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
                1};

  if (Val.getOpcode() == ISD::SIGN_EXTEND)
    return {Val.getOperand(0),
            Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};

  return {Val, Val.getValueSizeInBits() - 1};
}
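
// For example, a branch on "(sext_inreg i8 x) < 0" only needs the sign bit of
// the unextended value, so lookThroughSignExtension returns {x, 7} and the
// lowering below can emit a single "tbnz x, #7, dest".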

SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  MachineFunction &MF = DAG.getMachineFunction();
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  bool ProduceNonFlagSettingCondBr =
      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                             Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t SignBitPos;
        std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
      }
    }
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
      uint64_t SignBitPos;
      std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
         LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}
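
// copysign(x, y) keeps the magnitude of x and takes the sign of y. The
// lowering below does this with a single bitwise select (BSP/BSL): the mask
// has every bit set except the sign bit, so the magnitude bits come from x
// and the sign bit comes from y.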

SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  if (!Subtarget->hasNEON())
    return SDValue();

  EVT VT = Op.getValueType();
  EVT IntVT = VT.changeTypeToInteger();
  SDLoc DL(Op);

  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();

  if (!SrcVT.bitsEq(VT))
    In2 = DAG.getFPExtendOrRound(In2, DL, VT);

  if (VT.isScalableVector())
    IntVT =
        getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());

  if (VT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(VT)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    In1 = convertToScalableVector(DAG, ContainerVT, In1);
    In2 = convertToScalableVector(DAG, ContainerVT, In2);

    SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
    return convertFromScalableVector(DAG, VT, Res);
  }

  auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
    if (VT.isScalableVector())
      return getSVESafeBitCast(VT, Op, DAG);

    return DAG.getBitcast(VT, Op);
  };

  SDValue VecVal1, VecVal2;
  EVT VecVT;
  auto SetVecVal = [&](int Idx = -1) {
    if (!VT.isVector()) {
      VecVal1 =
          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
      VecVal2 =
          DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = BitCast(VecVT, In1, DAG);
      VecVal2 = BitCast(VecVT, In2, DAG);
    }
  };
  if (VT.isVector()) {
    VecVT = IntVT;
    SetVecVal();
  } else if (VT == MVT::f64) {
    VecVT = MVT::v2i64;
    SetVecVal(AArch64::dsub);
  } else if (VT == MVT::f32) {
    VecVT = MVT::v4i32;
    SetVecVal(AArch64::ssub);
  } else if (VT == MVT::f16) {
    VecVT = MVT::v8i16;
    SetVecVal(AArch64::hsub);
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  unsigned BitWidth = In1.getScalarValueSizeInBits();
  SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);

  // We want to materialize a mask with every bit but the high bit set, but the
  // AdvSIMD immediate moves cannot materialize that in a single instruction for
  // 64-bit elements. Instead, materialize all bits set and then negate that.
  if (VT == MVT::f64 || VT == MVT::v2f64) {
    SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
    SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
    SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
  }

  SDValue BSP =
      DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
  if (VT == MVT::f16)
    return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
  if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);

  return BitCast(VT, BSP, DAG);
}
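
// For vector popcounts the byte-wise CNT result is widened back up to the
// element type with pairwise adds, e.g. for a v4i32 input (roughly):
//   cnt    v0.16b, v0.16b     // per-byte counts
//   uaddlp v0.8h,  v0.16b     // pairwise add into 16-bit lanes
//   uaddlp v0.4s,  v0.8h      // pairwise add into 32-bit lanes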

SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
                                                 SelectionDAG &DAG) const {
  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          Attribute::NoImplicitFloat))
    return SDValue();

  if (!Subtarget->hasNEON())
    return SDValue();

  bool IsParity = Op.getOpcode() == ISD::PARITY;

  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
  SDValue Val = Op.getOperand(0);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i32 || VT == MVT::i64) {
    if (VT == MVT::i32)
      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);

    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
    SDValue UaddLV = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

    if (IsParity)
      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
                           DAG.getConstant(1, DL, MVT::i32));

    if (VT == MVT::i64)
      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
    return UaddLV;
  } else if (VT == MVT::i128) {
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);

    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
    SDValue UaddLV = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);

    if (IsParity)
      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
                           DAG.getConstant(1, DL, MVT::i32));

    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
  }

  assert(!IsParity && "ISD::PARITY of vector types not supported");

  if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);

  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
         "Unexpected type for custom ctpop lowering");

  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
  Val = DAG.getBitcast(VT8Bit, Val);
  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);

  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
  unsigned EltSize = 8;
  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
  while (EltSize != VT.getScalarSizeInBits()) {
    EltSize *= 2;
    NumElts /= 2;
    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
    Val = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
  }

  return Val;
}
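
// Count-trailing-zeros is lowered as ctlz(bitreverse(x)): reversing the bits
// turns trailing zeros into leading zeros, e.g. cttz(0b...1000) == 3 ==
// ctlz(bitreverse(0b...1000)).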

SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isScalableVector() ||
         useSVEForFixedLengthVectorVT(
             VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));

  SDLoc DL(Op);
  SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
  return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
}
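
// When SVE is not used, [s|u]{min,max} is simply expanded to a compare plus a
// select, e.g. smax(a, b) -> select(setcc(a, b, setgt), a, b).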

SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  unsigned Opcode = Op.getOpcode();
  ISD::CondCode CC;
  switch (Opcode) {
  default:
    llvm_unreachable("Wrong instruction");
  case ISD::SMAX:
    CC = ISD::SETGT;
    break;
  case ISD::SMIN:
    CC = ISD::SETLT;
    break;
  case ISD::UMAX:
    CC = ISD::SETUGT;
    break;
  case ISD::UMIN:
    CC = ISD::SETULT;
    break;
  }

  if (VT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    switch (Opcode) {
    default:
      llvm_unreachable("Wrong instruction");
    case ISD::SMAX:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
    case ISD::SMIN:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
    case ISD::UMAX:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
    case ISD::UMIN:
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
    }
  }

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
  return DAG.getSelect(DL, VT, Cond, Op0, Op1);
}

SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);

  SDLoc DL(Op);
  SDValue REVB;
  MVT VST;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("Invalid type for bitreverse!");
  case MVT::v2i32: {
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
    break;
  }
  case MVT::v4i32: {
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
    break;
  }
  case MVT::v1i64: {
    VST = MVT::v8i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
    break;
  }
  case MVT::v2i64: {
    VST = MVT::v16i8;
    REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
    break;
  }
  }
  return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
                     DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
}
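
// Scalar setcc is lowered to a compare plus a CSEL on the *inverted*
// condition with the true/false values swapped, which the selector can match
// as a single CSINC, e.g. "setcc x, y, eq" becomes roughly
// "cmp x, y; cset w0, eq" (cset being CSINC against wzr).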

SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Chain;
  if (IsStrict)
    Chain = Op.getOperand(0);
  SDValue LHS = Op.getOperand(OpNo + 0);
  SDValue RHS = Op.getOperand(OpNo + 1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
  SDLoc dl(Op);

  // We chose ZeroOrOneBooleanContents, so use zero and one.
  EVT VT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, dl, VT);
  SDValue FVal = DAG.getConstant(0, dl, VT);

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets picked up by the next if statement.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
                        IsSignaling);

    // If softenSetCCOperands returned a scalar, use it.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(
        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
    return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp;
  if (IsStrict)
    Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
  else
    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue Res;
  if (CC2 == AArch64CC::AL) {
    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
                          CC2);
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. As is in
    // this case, we emit the first CSEL and then emit a second using the output
    // of the first as the RHS. We're effectively OR'ing the two CC's together.

    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
    SDValue CS1 =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }
  return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}
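
// SETCCCARRY compares LHS and RHS together with a borrow from a lower part
// (typically produced when a wider, e.g. i128, comparison is expanded). It is
// lowered below as SBCS on the inverted incoming carry/borrow, followed by
// the same inverted-condition CSEL/CSINC trick used for plain setcc.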

SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = LHS.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue Carry = Op.getOperand(2);
  // SBCS uses a carry not a borrow so the carry flag should be inverted first.
  SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
  SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
                            LHS, RHS, InvCarry);

  EVT OpVT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, DL, OpVT);
  SDValue FVal = DAG.getConstant(0, DL, OpVT);

  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
  ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
  SDValue CCVal =
      DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
  // Inputs are swapped because the condition is inverted. This will allow
  // matching with a single CSINC instruction.
  return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
                     Cmp.getValue(1));
}
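
// Besides the plain CSEL, the integer path below also recognises constant
// true/false pairs that can be formed from one value, e.g. (t == ~f) -> CSINV,
// (t == -f) -> CSNEG and (t == f + 1) -> CSINC, possibly after swapping the
// operands and inverting the condition.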

SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                              SDValue RHS, SDValue TVal,
                                              SDValue FVal, const SDLoc &dl,
                                              SelectionDAG &DAG) const {
  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Also handle f16, for which we need to do a f32 comparison.
  if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
  }

  // Next, handle integers.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
    ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
    // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
    // supported types.
    if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
        CTVal->isOne() && CFVal->isAllOnes() &&
        LHS.getValueType() == TVal.getValueType()) {
      EVT VT = LHS.getValueType();
      SDValue Shift =
          DAG.getNode(ISD::SRA, dl, VT, LHS,
                      DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
      return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
    }

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
    if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      if (isAllOnesConstant(TVal.getOperand(1))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      if (isNullConstant(TVal.getOperand(0))) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
                 TrueVal == -FalseVal) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
      } else {
        // 64-bit check whether we can use CSINC.
        const uint64_t TrueVal64 = TrueVal;
        const uint64_t FalseVal64 = FalseVal;

        if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal > FalseVal) {
            Swap = true;
          }
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    // Avoid materializing a constant when possible by reusing a known value in
    // a register. However, don't perform this optimization if the known value
    // is one, zero or negative one in the case of a CSEL. We can always
    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
    // FVal, respectively.
    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
        !RHSVal->isZero() && !RHSVal->isAllOnes()) {
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
      // "a != C ? x : a" to avoid materializing C.
      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
        TVal = LHS;
      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
        FVal = LHS;
    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
      assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
      // avoid materializing C.
      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
        Opcode = AArch64ISD::CSINV;
        TVal = LHS;
        FVal = DAG.getConstant(0, dl, FVal.getValueType());
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    EVT VT = TVal.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
         LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = TVal.getValueType();
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);

  if (DAG.getTarget().Options.UnsafeFPMath) {
    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
    if (RHSVal && RHSVal->isZero()) {
      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
        TVal = LHS;
      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
               CFVal && CFVal->isZero() &&
               FVal.getValueType() == LHS.getValueType())
        FVal = LHS;
    }
  }

  // Emit first, and possibly only, CSEL.
  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}
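
// For example, a scalable vector_splice with index -2 roughly becomes:
//   ptrue  p0.<T>, vl2        // first two lanes active
//   rev    p0.<T>, p0.<T>     // now the last two lanes are active
//   splice z0.<T>, p0, z0.<T>, z1.<T>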

SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT Ty = Op.getValueType();
  auto Idx = Op.getConstantOperandAPInt(2);
  int64_t IdxVal = Idx.getSExtValue();
  assert(Ty.isScalableVector() &&
         "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");

  // We can use the splice instruction for certain index values where we are
  // able to efficiently generate the correct predicate. The index will be
  // inverted and used directly as the input to the ptrue instruction, i.e.
  // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
  // splice predicate. However, we can only do this if we can guarantee that
  // there are enough elements in the vector, hence we check the index <= min
  // number of elements.
  Optional<unsigned> PredPattern;
  if (Ty.isScalableVector() && IdxVal < 0 &&
      (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
          None) {
    SDLoc DL(Op);

    // Create a predicate where all but the last -IdxVal elements are false.
    EVT PredVT = Ty.changeVectorElementType(MVT::i1);
    SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
    Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);

    // Now splice the two inputs together using the predicate.
    return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
                       Op.getOperand(1));
  }

  // This will select to an EXT instruction, which has a maximum immediate
  // value of 255, hence 2048-bits is the maximum value we can lower.
  if (IdxVal >= 0 &&
      IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
    return Op;

  return SDValue();
}

SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  SDLoc DL(Op);
  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CCVal = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  EVT Ty = Op.getValueType();
  if (Ty.isScalableVector()) {
    SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
    MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  if (useSVEForFixedLengthVectorVT(Ty)) {
    // FIXME: Ideally this would be the same as above using i1 types, however
    // for the moment we can't deal with fixed i1 vector types properly, so
    // instead extend the predicate to a result type sized integer vector.
    MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
    MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
    SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
    SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
    return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (ISD::isOverflowIntrOpRes(CCVal)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  // Lower it the same way as we would lower a SELECT_CC node.
  ISD::CondCode CC;
  SDValue LHS, RHS;
  if (CCVal.getOpcode() == ISD::SETCC) {
    LHS = CCVal.getOperand(0);
    RHS = CCVal.getOperand(1);
    CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
  } else {
    LHS = CCVal;
    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
    CC = ISD::SETNE;
  }

  // If we are lowering a f16 and we do not have fullf16, convert to a f32 in
  // order to use FCSELSrrr
  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    TVal = SDValue(
        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                           DAG.getUNDEF(MVT::f32), TVal,
                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
        0);
    FVal = SDValue(
        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                           DAG.getUNDEF(MVT::f32), FVal,
                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
        0);
  }

  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);

  if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
    return SDValue(
        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res,
                           DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
        0);
  }

  return Res;
}
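// A rough sketch of the f16/bf16 handling above (a reading of the code, not
// verified against the selected machine code): the half value is placed in
// the hsub subregister of an undef f32, the select is done as an f32 FCSEL,
// and the result is extracted back out of hsub. The bit pattern of the half
// value is untouched, so no conversion instructions are needed.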
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    return getAddrLarge(JT, DAG);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    return getAddrTiny(JT, DAG);
  }
  return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
                                          SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  SDLoc DL(Op);
  SDValue JT = Op.getOperand(1);
  SDValue Entry = Op.getOperand(2);
  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();

  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  AFI->setJumpTableEntryInfo(JTI, 4, nullptr);

  SDNode *Dest =
      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
                     SDValue(Dest, 0));
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
    if (Subtarget->isTargetMachO()) {
      return getGOT(CP, DAG);
    }
    return getAddrLarge(CP, DAG);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    return getAddrTiny(CP, DAG);
  }
  return getAddr(CP, DAG);
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    return getAddrLarge(BA, DAG);
  } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
    return getAddrTiny(BA, DAG);
  }
  return getAddr(BA, DAG);
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
  SDLoc DL(Op);

  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
                                 getPointerTy(DAG.getDataLayout()));
  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  SDLoc DL(Op);

  SDValue FR;
  if (Subtarget->isWindowsArm64EC()) {
    // With the Arm64EC ABI, we compute the address of the varargs save area
    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
    // but calls from an entry thunk can pass in a different address.
    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
    uint64_t StackOffset;
    if (FuncInfo->getVarArgsGPRSize() > 0)
      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
    else
      StackOffset = FuncInfo->getVarArgsStackOffset();
    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
                     DAG.getConstant(StackOffset, DL, MVT::i64));
  } else {
    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
                               ? FuncInfo->getVarArgsGPRIndex()
                               : FuncInfo->getVarArgsStackIndex(),
                           getPointerTy(DAG.getDataLayout()));
  }
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section B.3.
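  // For reference, the AAPCS64 va_list written out below has this shape
  // (field names follow the ABI document; offsets are LP64, ILP32 in
  // parentheses, matching the stores emitted here):
  //
  //   struct va_list {
  //     void *__stack;   // offset 0
  //     void *__gr_top;  // offset 8  (4)
  //     void *__vr_top;  // offset 16 (8)
  //     int   __gr_offs; // offset 24 (12)
  //     int   __vr_offs; // offset 28 (16)
  //   };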
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDLoc DL(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue VAList = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SmallVector<SDValue, 4> MemOps;

  // void *__stack at offset 0
  unsigned Offset = 0;
  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
  Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                MachinePointerInfo(SV), Align(PtrSize)));

  // void *__gr_top at offset 8 (4 on ILP32)
  Offset += PtrSize;
  int GPRSize = FuncInfo->getVarArgsGPRSize();
  if (GPRSize > 0) {
    SDValue GRTop, GRTopAddr;
    GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
    GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
                        DAG.getConstant(GPRSize, DL, PtrVT));
    GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // void *__vr_top at offset 16 (8 on ILP32)
  Offset += PtrSize;
  int FPRSize = FuncInfo->getVarArgsFPRSize();
  if (FPRSize > 0) {
    SDValue VRTop, VRTopAddr;
    VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                            DAG.getConstant(Offset, DL, PtrVT));

    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
    VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
                        DAG.getConstant(FPRSize, DL, PtrVT));
    VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);

    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
                                  MachinePointerInfo(SV, Offset),
                                  Align(PtrSize)));
  }

  // int __gr_offs at offset 24 (12 on ILP32)
  Offset += PtrSize;
  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
                   GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  // int __vr_offs at offset 28 (16 on ILP32)
  Offset += 4;
  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                   DAG.getConstant(Offset, DL, PtrVT));
  MemOps.push_back(
      DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
                   VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
    return LowerWin64_VASTART(Op, DAG);
  else if (Subtarget->isTargetDarwin())
    return LowerDarwin_VASTART(Op, DAG);
  else
    return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
                                           SelectionDAG &DAG) const {
  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
  // pointer.
  SDLoc DL(Op);
  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
  unsigned VaListSize =
      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
          ? PtrSize
          : Subtarget->isTargetILP32() ? 20 : 32;
  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(VaListSize, DL, MVT::i32),
                       Align(PtrSize), false, false, false,
                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "automatic va_arg instruction only works on Darwin");

  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  MaybeAlign Align(Op.getConstantOperandVal(3));
  unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
  SDValue VAList =
      DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
  Chain = VAList.getValue(1);
  VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);

  if (VT.isScalableVector())
    report_fatal_error("Passing SVE types to variadic functions is "
                       "currently not supported");

  if (Align && *Align > MinSlotSize) {
    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                         DAG.getConstant(Align->value() - 1, DL, PtrVT));
    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
                         DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
  }

  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
  unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

  // Scalar integer and FP values smaller than 64 bits are implicitly extended
  // up to 64 bits. At the very least, we have to increase the striding of the
  // vaargs list to match this, and for FP values we need to introduce
  // FP_ROUND nodes as well.
  if (VT.isInteger() && !VT.isVector())
    ArgSize = std::max(ArgSize, MinSlotSize);
  bool NeedFPTrunc = false;
  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
    ArgSize = 8;
    NeedFPTrunc = true;
  }

  // Increment the pointer, VAList, to the next vaarg
  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                               DAG.getConstant(ArgSize, DL, PtrVT));
  VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);

  // Store the incremented VAList to the legalized pointer
  SDValue APStore =
      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

  // Load the actual argument out of the pointer VAList
  if (NeedFPTrunc) {
    // Load the value as an f64.
    SDValue WideFP =
        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
    // Round the value down to an f32.
    SDValue NarrowFP =
        DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
                    DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
    // Merge the rounded value with the chain output of the load.
    return DAG.getMergeValues(Ops, DL);
  }

  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
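// Note on the NeedFPTrunc path above: a float variadic argument still occupies
// a full 8-byte slot, so the value is loaded as an f64 and rounded back down
// to f32 with FP_ROUND, while the chain result of the wide load is preserved
// through getMergeValues so memory ordering is not lost.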
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());

  if (Subtarget->isTargetILP32())
    FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
                            DAG.getValueType(VT));

  return FrameAddr;
}
SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();

  EVT VT = getPointerTy(DAG.getDataLayout());
  int FI = MFI.CreateFixedObject(4, 0, false);
  return DAG.getFrameIndex(FI, VT);
}
#define GET_REGISTER_MATCHER
#include "AArch64GenAsmMatcher.inc"

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register AArch64TargetLowering::
getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
  Register Reg = MatchRegisterName(RegName);
  if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
    const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
    unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
    if (!Subtarget->isXRegisterReserved(DwarfRegNum))
      Reg = 0;
  }
  if (Reg == 0)
    report_fatal_error(Twine("Invalid register name \""
                             + StringRef(RegName) + "\"."));
  return Reg;
}
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));

  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDValue ReturnAddress;
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
    ReturnAddress = DAG.getLoad(
        VT, DL, DAG.getEntryNode(),
        DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
  } else {
    // Return LR, which contains the return address. Mark it an implicit
    // live-in.
    Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
    ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
  }

  // The XPACLRI instruction assembles to a hint-space instruction before
  // Armv8.3-A therefore this instruction can be safely used for any pre
  // Armv8.3-A architectures. On Armv8.3-A and onwards XPACI is available so
  // use it instead.
  SDNode *St;
  if (Subtarget->hasPAuth()) {
    St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
  } else {
    // XPACLRI operates on LR therefore we must move the operand accordingly.
    SDValue Chain =
        DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
    St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
  }
  return SDValue(St, 0);
}
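// Note on the code above: when return addresses may be signed (pointer
// authentication), the value handed back for __builtin_return_address must
// have its PAC stripped. XPACI does that directly on Armv8.3-A and later; on
// older cores XPACLRI, which lives in the hint space and is therefore a NOP
// where unsupported, is used after moving the value into LR.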
/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
/// i32 values and take a 2 x i32 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue Lo, Hi;
  expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}

bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // Offsets are folded in the DAG combine rather than here so that we can
  // intelligently choose an offset based on the uses.
  return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                         bool OptForSize) const {
  bool IsLegal = false;
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
  // 16-bit case when target has full fp16 support.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  const APInt ImmInt = Imm.bitcastToAPInt();
  if (VT == MVT::f64)
    IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f32)
    IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
  else if (VT == MVT::f16 && Subtarget->hasFullFP16())
    IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
  // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
  // generate that fmov.

  // If we can not materialize in immediate field for fmov, check if the
  // value can be encoded as the immediate operand of a logical instruction.
  // The immediate value will be created with either MOVZ, MOVN, or ORR.
  if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
    // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
    // however the mov+fmov sequence is always better because of the reduced
    // cache pressure. The timings are still the same if you consider
    // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
    // movw+movk is fused). So we limit up to 2 instructions at most.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
                              Insn);
    unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
    IsLegal = Insn.size() <= Limit;
  }

  LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
                    << " imm value: "; Imm.dump(););
  return IsLegal;
}
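// Rough examples (illustrative, not exhaustive): values such as 1.0, 0.5 or
// -2.0 fit the 8-bit fmov immediate encoding and are accepted directly, while
// something like 0.1f is not encodable and is only considered legal here if
// expandMOVImm can build its bit pattern within the Limit computed above.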
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//

static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                           SDValue Operand, SelectionDAG &DAG,
                           int &ExtraSteps) {
  EVT VT = Operand.getValueType();
  if ((ST->hasNEON() &&
       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
        VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
        VT == MVT::v4f32)) ||
      (ST->hasSVE() &&
       (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
      // For the reciprocal estimates, convergence is quadratic, so the number
      // of digits is doubled after each iteration. In ARMv8, the accuracy of
      // the initial estimate is 2^-8. Thus the number of extra steps to refine
      // the result for float (23 mantissa bits) is 2 and for double (52
      // mantissa bits) is 3.
      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
  }

  return SDValue();
}
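// Worked example of the step counts above: the initial estimate is accurate
// to roughly 2^-8 and each Newton step about doubles the number of correct
// bits, so 8 -> 16 -> 32 covers float's 24-bit significand after 2 extra
// steps, and 8 -> 16 -> 32 -> 64 covers double's 53-bit significand after 3.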
SDValue
AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
                                        const DenormalMode &Mode) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
  return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
}

SDValue
AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                                   SelectionDAG &DAG) const {
  return Op;
}
SDValue
AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                       int Enabled, int &ExtraSteps,
                                       bool &UseOneConst,
                                       bool Reciprocal) const {
  if (Enabled == ReciprocalEstimate::Enabled ||
      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      SDNodeFlags Flags;
      Flags.setAllowReassociation(true);

      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
                                   Flags);
        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }
      if (!Reciprocal)
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);

      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
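// Math behind the loop above: for f(x) = 1/x^2 - d, Newton's method gives
//   x_{n+1} = x_n * (3 - d * x_n^2) / 2,
// which is exactly Estimate * FRSQRTS(d, Estimate * Estimate) per iteration.
// When a plain sqrt (rather than rsqrt) was requested, the final multiply by
// the operand converts the refined 1/sqrt(d) estimate into sqrt(d).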
SDValue
AArch64TargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                        int Enabled,
                                        int &ExtraSteps) const {
  if (Enabled == ReciprocalEstimate::Enabled)
    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
                                       DAG, ExtraSteps)) {
      SDLoc DL(Operand);
      EVT VT = Operand.getValueType();

      SDNodeFlags Flags;
      Flags.setAllowReassociation(true);

      // Newton reciprocal iteration: E * (2 - X * E)
      // AArch64 reciprocal iteration instruction: (2 - M * N)
      for (int i = ExtraSteps; i > 0; --i) {
        SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
                                   Estimate, Flags);
        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
      }

      ExtraSteps = 0;
      return Estimate;
    }

  return SDValue();
}
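// Math behind the loop above: for f(x) = 1/x - d, Newton's method gives
//   x_{n+1} = x_n * (2 - d * x_n),
// and FRECPS(d, x_n) computes the (2 - d * x_n) factor that the following
// FMUL applies to the current estimate.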
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
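//
// Illustrative (hypothetical) uses of the constraints above:
//   asm("add %0, %1, %2"     : "=r"(res) : "r"(a), "I"(4095));  // ADD imm
//   asm("fadd %s0, %s1, %s2" : "=w"(f)   : "w"(x), "w"(y));     // FP/SIMD reg
//   asm("ptrue %0.b"         : "=Upa"(pg));                     // SVE predicate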
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
  // At this point, we have to lower this constraint to something else, so we
  // lower it to an "r" or "w". However, by doing this we will force the result
  // to be in register, while the X constraint is much more permissive.
  //
  // Although we are correct (we are free to emit anything, without
  // constraints), we might break use cases that would expect us to be more
  // efficient and emit something else.
  if (!Subtarget->hasFPARMv8())
    return "r";

  if (ConstraintVT.isFloatingPoint())
    return "w";

  if (ConstraintVT.isVector() &&
      (ConstraintVT.getSizeInBits() == 64 ||
       ConstraintVT.getSizeInBits() == 128))
    return "w";

  return "r";
}

enum PredicateConstraint {
  Upl,
  Upa,
  Invalid
};

static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
  PredicateConstraint P = PredicateConstraint::Invalid;
  if (Constraint == "Upa")
    P = PredicateConstraint::Upa;
  if (Constraint == "Upl")
    P = PredicateConstraint::Upl;
  return P;
}
9596 /// getConstraintType - Given a constraint letter, return the type of
9597 /// constraint it is for this target.
9598 AArch64TargetLowering::ConstraintType
9599 AArch64TargetLowering::getConstraintType(StringRef Constraint
) const {
9600 if (Constraint
.size() == 1) {
9601 switch (Constraint
[0]) {
9607 return C_RegisterClass
;
9608 // An address with a single base register. Due to the way we
9609 // currently handle addresses it is the same as 'r'.
9622 case 'S': // A symbolic address
9625 } else if (parsePredicateConstraint(Constraint
) !=
9626 PredicateConstraint::Invalid
)
9627 return C_RegisterClass
;
9628 return TargetLowering::getConstraintType(Constraint
);
9631 /// Examine constraint type and operand type and determine a weight value.
9632 /// This object must already have been set up with the operand type
9633 /// and the current alternative constraint selected.
9634 TargetLowering::ConstraintWeight
9635 AArch64TargetLowering::getSingleConstraintMatchWeight(
9636 AsmOperandInfo
&info
, const char *constraint
) const {
9637 ConstraintWeight weight
= CW_Invalid
;
9638 Value
*CallOperandVal
= info
.CallOperandVal
;
9639 // If we don't have a value, we can't do a match,
9640 // but allow it at the lowest weight.
9641 if (!CallOperandVal
)
9643 Type
*type
= CallOperandVal
->getType();
9644 // Look at the constraint type.
9645 switch (*constraint
) {
9647 weight
= TargetLowering::getSingleConstraintMatchWeight(info
, constraint
);
9652 if (type
->isFloatingPointTy() || type
->isVectorTy())
9653 weight
= CW_Register
;
9656 weight
= CW_Constant
;
9659 if (parsePredicateConstraint(constraint
) != PredicateConstraint::Invalid
)
9660 weight
= CW_Register
;
9666 std::pair
<unsigned, const TargetRegisterClass
*>
9667 AArch64TargetLowering::getRegForInlineAsmConstraint(
9668 const TargetRegisterInfo
*TRI
, StringRef Constraint
, MVT VT
) const {
9669 if (Constraint
.size() == 1) {
9670 switch (Constraint
[0]) {
9672 if (VT
.isScalableVector())
9673 return std::make_pair(0U, nullptr);
9674 if (Subtarget
->hasLS64() && VT
.getSizeInBits() == 512)
9675 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass
);
9676 if (VT
.getFixedSizeInBits() == 64)
9677 return std::make_pair(0U, &AArch64::GPR64commonRegClass
);
9678 return std::make_pair(0U, &AArch64::GPR32commonRegClass
);
9680 if (!Subtarget
->hasFPARMv8())
9682 if (VT
.isScalableVector()) {
9683 if (VT
.getVectorElementType() != MVT::i1
)
9684 return std::make_pair(0U, &AArch64::ZPRRegClass
);
9685 return std::make_pair(0U, nullptr);
9687 uint64_t VTSize
= VT
.getFixedSizeInBits();
9689 return std::make_pair(0U, &AArch64::FPR16RegClass
);
9691 return std::make_pair(0U, &AArch64::FPR32RegClass
);
9693 return std::make_pair(0U, &AArch64::FPR64RegClass
);
9695 return std::make_pair(0U, &AArch64::FPR128RegClass
);
9698 // The instructions that this constraint is designed for can
9699 // only take 128-bit registers so just use that regclass.
9701 if (!Subtarget
->hasFPARMv8())
9703 if (VT
.isScalableVector())
9704 return std::make_pair(0U, &AArch64::ZPR_4bRegClass
);
9705 if (VT
.getSizeInBits() == 128)
9706 return std::make_pair(0U, &AArch64::FPR128_loRegClass
);
9709 if (!Subtarget
->hasFPARMv8())
9711 if (VT
.isScalableVector())
9712 return std::make_pair(0U, &AArch64::ZPR_3bRegClass
);
9716 PredicateConstraint PC
= parsePredicateConstraint(Constraint
);
9717 if (PC
!= PredicateConstraint::Invalid
) {
9718 if (!VT
.isScalableVector() || VT
.getVectorElementType() != MVT::i1
)
9719 return std::make_pair(0U, nullptr);
9720 bool restricted
= (PC
== PredicateConstraint::Upl
);
9721 return restricted
? std::make_pair(0U, &AArch64::PPR_3bRegClass
)
9722 : std::make_pair(0U, &AArch64::PPRRegClass
);
9725 if (StringRef("{cc}").equals_insensitive(Constraint
))
9726 return std::make_pair(unsigned(AArch64::NZCV
), &AArch64::CCRRegClass
);
9728 // Use the default implementation in TargetLowering to convert the register
9729 // constraint into a member of a register class.
9730 std::pair
<unsigned, const TargetRegisterClass
*> Res
;
9731 Res
= TargetLowering::getRegForInlineAsmConstraint(TRI
, Constraint
, VT
);
9733 // Not found as a standard register?
9735 unsigned Size
= Constraint
.size();
9736 if ((Size
== 4 || Size
== 5) && Constraint
[0] == '{' &&
9737 tolower(Constraint
[1]) == 'v' && Constraint
[Size
- 1] == '}') {
9739 bool Failed
= Constraint
.slice(2, Size
- 1).getAsInteger(10, RegNo
);
9740 if (!Failed
&& RegNo
>= 0 && RegNo
<= 31) {
9741 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
9742 // By default we'll emit v0-v31 for this unless there's a modifier where
9743 // we'll emit the correct register as well.
9744 if (VT
!= MVT::Other
&& VT
.getSizeInBits() == 64) {
9745 Res
.first
= AArch64::FPR64RegClass
.getRegister(RegNo
);
9746 Res
.second
= &AArch64::FPR64RegClass
;
9748 Res
.first
= AArch64::FPR128RegClass
.getRegister(RegNo
);
9749 Res
.second
= &AArch64::FPR128RegClass
;
9755 if (Res
.second
&& !Subtarget
->hasFPARMv8() &&
9756 !AArch64::GPR32allRegClass
.hasSubClassEq(Res
.second
) &&
9757 !AArch64::GPR64allRegClass
.hasSubClassEq(Res
.second
))
9758 return std::make_pair(0U, nullptr);
9763 EVT
AArch64TargetLowering::getAsmOperandValueType(const DataLayout
&DL
,
9765 bool AllowUnknown
) const {
9766 if (Subtarget
->hasLS64() && Ty
->isIntegerTy(512))
9767 return EVT(MVT::i64x8
);
9769 return TargetLowering::getAsmOperandValueType(DL
, Ty
, AllowUnknown
);
9772 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9773 /// vector. If it is invalid, don't add anything to Ops.
9774 void AArch64TargetLowering::LowerAsmOperandForConstraint(
9775 SDValue Op
, std::string
&Constraint
, std::vector
<SDValue
> &Ops
,
9776 SelectionDAG
&DAG
) const {
9779 // Currently only support length 1 constraints.
9780 if (Constraint
.length() != 1)
9783 char ConstraintLetter
= Constraint
[0];
9784 switch (ConstraintLetter
) {
9788 // This set of constraints deal with valid constants for various instructions.
9789 // Validate and return a target constant for them if we can.
9791 // 'z' maps to xzr or wzr so it needs an input of 0.
9792 if (!isNullConstant(Op
))
9795 if (Op
.getValueType() == MVT::i64
)
9796 Result
= DAG
.getRegister(AArch64::XZR
, MVT::i64
);
9798 Result
= DAG
.getRegister(AArch64::WZR
, MVT::i32
);
9802 // An absolute symbolic address or label reference.
9803 if (const GlobalAddressSDNode
*GA
= dyn_cast
<GlobalAddressSDNode
>(Op
)) {
9804 Result
= DAG
.getTargetGlobalAddress(GA
->getGlobal(), SDLoc(Op
),
9805 GA
->getValueType(0));
9806 } else if (const BlockAddressSDNode
*BA
=
9807 dyn_cast
<BlockAddressSDNode
>(Op
)) {
9809 DAG
.getTargetBlockAddress(BA
->getBlockAddress(), BA
->getValueType(0));
9821 ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Op
);
9825 // Grab the value and do some validation.
9826 uint64_t CVal
= C
->getZExtValue();
9827 switch (ConstraintLetter
) {
9828 // The I constraint applies only to simple ADD or SUB immediate operands:
9829 // i.e. 0 to 4095 with optional shift by 12
9830 // The J constraint applies only to ADD or SUB immediates that would be
9831 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
9832 // instruction [or vice versa], in other words -1 to -4095 with optional
9833 // left shift by 12.
9835 if (isUInt
<12>(CVal
) || isShiftedUInt
<12, 12>(CVal
))
9839 uint64_t NVal
= -C
->getSExtValue();
9840 if (isUInt
<12>(NVal
) || isShiftedUInt
<12, 12>(NVal
)) {
9841 CVal
= C
->getSExtValue();
9846 // The K and L constraints apply *only* to logical immediates, including
9847 // what used to be the MOVI alias for ORR (though the MOVI alias has now
9848 // been removed and MOV should be used). So these constraints have to
9849 // distinguish between bit patterns that are valid 32-bit or 64-bit
9850 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
9851 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
9854 if (AArch64_AM::isLogicalImmediate(CVal
, 32))
9858 if (AArch64_AM::isLogicalImmediate(CVal
, 64))
9861 // The M and N constraints are a superset of K and L respectively, for use
9862 // with the MOV (immediate) alias. As well as the logical immediates they
9863 // also match 32 or 64-bit immediates that can be loaded either using a
9864 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
9865 // (M) or 64-bit 0x1234000000000000 (N) etc.
9866 // As a note some of this code is liberally stolen from the asm parser.
9868 if (!isUInt
<32>(CVal
))
9870 if (AArch64_AM::isLogicalImmediate(CVal
, 32))
9872 if ((CVal
& 0xFFFF) == CVal
)
9874 if ((CVal
& 0xFFFF0000ULL
) == CVal
)
9876 uint64_t NCVal
= ~(uint32_t)CVal
;
9877 if ((NCVal
& 0xFFFFULL
) == NCVal
)
9879 if ((NCVal
& 0xFFFF0000ULL
) == NCVal
)
9884 if (AArch64_AM::isLogicalImmediate(CVal
, 64))
9886 if ((CVal
& 0xFFFFULL
) == CVal
)
9888 if ((CVal
& 0xFFFF0000ULL
) == CVal
)
9890 if ((CVal
& 0xFFFF00000000ULL
) == CVal
)
9892 if ((CVal
& 0xFFFF000000000000ULL
) == CVal
)
9894 uint64_t NCVal
= ~CVal
;
9895 if ((NCVal
& 0xFFFFULL
) == NCVal
)
9897 if ((NCVal
& 0xFFFF0000ULL
) == NCVal
)
9899 if ((NCVal
& 0xFFFF00000000ULL
) == NCVal
)
9901 if ((NCVal
& 0xFFFF000000000000ULL
) == NCVal
)
9909 // All assembler immediates are 64-bit integers.
9910 Result
= DAG
.getTargetConstant(CVal
, SDLoc(Op
), MVT::i64
);
9914 if (Result
.getNode()) {
9915 Ops
.push_back(Result
);
9919 return TargetLowering::LowerAsmOperandForConstraint(Op
, Constraint
, Ops
, DAG
);
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//

/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, DL, MVT::i64));
}

/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
  EVT EltType = V.getValueType().getVectorElementType();
  return EltType.getSizeInBits() / 8;
}

/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  EVT VT = V128Reg.getValueType();
  unsigned WideSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
  SDLoc DL(V128Reg);

  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
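// For illustration: WidenVector turns e.g. a v2f32 value into a v4f32 whose
// low half holds the original elements (an INSERT_SUBVECTOR at index 0),
// while NarrowVector goes the other way by taking the dsub subregister, i.e.
// the low 64 bits of the 128-bit register. In practice both usually select
// to a plain subregister view rather than real data movement.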
9958 // Gather data to see if the operation can be modelled as a
9959 // shuffle in combination with VEXTs.
9960 SDValue
AArch64TargetLowering::ReconstructShuffle(SDValue Op
,
9961 SelectionDAG
&DAG
) const {
9962 assert(Op
.getOpcode() == ISD::BUILD_VECTOR
&& "Unknown opcode!");
9963 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
9965 EVT VT
= Op
.getValueType();
9966 assert(!VT
.isScalableVector() &&
9967 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
9968 unsigned NumElts
= VT
.getVectorNumElements();
9970 struct ShuffleSourceInfo
{
9975 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
9976 // be compatible with the shuffle we intend to construct. As a result
9977 // ShuffleVec will be some sliding window into the original Vec.
9980 // Code should guarantee that element i in Vec starts at element "WindowBase
9981 // + i * WindowScale in ShuffleVec".
9985 ShuffleSourceInfo(SDValue Vec
)
9986 : Vec(Vec
), MinElt(std::numeric_limits
<unsigned>::max()), MaxElt(0),
9987 ShuffleVec(Vec
), WindowBase(0), WindowScale(1) {}
9989 bool operator ==(SDValue OtherVec
) { return Vec
== OtherVec
; }
9992 // First gather all vectors used as an immediate source for this BUILD_VECTOR
9994 SmallVector
<ShuffleSourceInfo
, 2> Sources
;
9995 for (unsigned i
= 0; i
< NumElts
; ++i
) {
9996 SDValue V
= Op
.getOperand(i
);
9999 else if (V
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
10000 !isa
<ConstantSDNode
>(V
.getOperand(1)) ||
10001 V
.getOperand(0).getValueType().isScalableVector()) {
10003 dbgs() << "Reshuffle failed: "
10004 "a shuffle can only come from building a vector from "
10005 "various elements of other fixed-width vectors, provided "
10006 "their indices are constant\n");
10010 // Add this element source to the list if it's not already there.
10011 SDValue SourceVec
= V
.getOperand(0);
10012 auto Source
= find(Sources
, SourceVec
);
10013 if (Source
== Sources
.end())
10014 Source
= Sources
.insert(Sources
.end(), ShuffleSourceInfo(SourceVec
));
10016 // Update the minimum and maximum lane number seen.
10017 unsigned EltNo
= cast
<ConstantSDNode
>(V
.getOperand(1))->getZExtValue();
10018 Source
->MinElt
= std::min(Source
->MinElt
, EltNo
);
10019 Source
->MaxElt
= std::max(Source
->MaxElt
, EltNo
);
10022 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
10023 // better than moving to/from gpr registers for larger vectors.
10024 if ((Sources
.size() == 3 || Sources
.size() == 4) && NumElts
> 4) {
10025 // Construct a mask for the tbl. We may need to adjust the index for types
10027 SmallVector
<unsigned, 16> Mask
;
10028 unsigned OutputFactor
= VT
.getScalarSizeInBits() / 8;
10029 for (unsigned I
= 0; I
< NumElts
; ++I
) {
10030 SDValue V
= Op
.getOperand(I
);
10032 for (unsigned OF
= 0; OF
< OutputFactor
; OF
++)
10033 Mask
.push_back(-1);
10036 // Set the Mask lanes adjusted for the size of the input and output
10037 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
10038 // output element, adjusted in their positions per input and output types.
10039 unsigned Lane
= V
.getConstantOperandVal(1);
10040 for (unsigned S
= 0; S
< Sources
.size(); S
++) {
10041 if (V
.getOperand(0) == Sources
[S
].Vec
) {
10042 unsigned InputSize
= Sources
[S
].Vec
.getScalarValueSizeInBits();
10043 unsigned InputBase
= 16 * S
+ Lane
* InputSize
/ 8;
10044 for (unsigned OF
= 0; OF
< OutputFactor
; OF
++)
10045 Mask
.push_back(InputBase
+ OF
);
10051 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
10052 // v16i8, and the TBLMask
10053 SmallVector
<SDValue
, 16> TBLOperands
;
10054 TBLOperands
.push_back(DAG
.getConstant(Sources
.size() == 3
10055 ? Intrinsic::aarch64_neon_tbl3
10056 : Intrinsic::aarch64_neon_tbl4
,
10058 for (unsigned i
= 0; i
< Sources
.size(); i
++) {
10059 SDValue Src
= Sources
[i
].Vec
;
10060 EVT SrcVT
= Src
.getValueType();
10061 Src
= DAG
.getBitcast(SrcVT
.is64BitVector() ? MVT::v8i8
: MVT::v16i8
, Src
);
10062 assert((SrcVT
.is64BitVector() || SrcVT
.is128BitVector()) &&
10063 "Expected a legally typed vector");
10064 if (SrcVT
.is64BitVector())
10065 Src
= DAG
.getNode(ISD::CONCAT_VECTORS
, dl
, MVT::v16i8
, Src
,
10066 DAG
.getUNDEF(MVT::v8i8
));
10067 TBLOperands
.push_back(Src
);
10070 SmallVector
<SDValue
, 16> TBLMask
;
10071 for (unsigned i
= 0; i
< Mask
.size(); i
++)
10072 TBLMask
.push_back(DAG
.getConstant(Mask
[i
], dl
, MVT::i32
));
10073 assert((Mask
.size() == 8 || Mask
.size() == 16) &&
10074 "Expected a v8i8 or v16i8 Mask");
10075 TBLOperands
.push_back(
10076 DAG
.getBuildVector(Mask
.size() == 8 ? MVT::v8i8
: MVT::v16i8
, dl
, TBLMask
));
10079 DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, dl
,
10080 Mask
.size() == 8 ? MVT::v8i8
: MVT::v16i8
, TBLOperands
);
10081 return DAG
.getBitcast(VT
, Shuffle
);
10084 if (Sources
.size() > 2) {
10085 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
10086 << "sensible when at most two source vectors are "
10091 // Find out the smallest element size among result and two sources, and use
10092 // it as element size to build the shuffle_vector.
10093 EVT SmallestEltTy
= VT
.getVectorElementType();
10094 for (auto &Source
: Sources
) {
10095 EVT SrcEltTy
= Source
.Vec
.getValueType().getVectorElementType();
10096 if (SrcEltTy
.bitsLT(SmallestEltTy
)) {
10097 SmallestEltTy
= SrcEltTy
;
10100 unsigned ResMultiplier
=
10101 VT
.getScalarSizeInBits() / SmallestEltTy
.getFixedSizeInBits();
10102 uint64_t VTSize
= VT
.getFixedSizeInBits();
10103 NumElts
= VTSize
/ SmallestEltTy
.getFixedSizeInBits();
10104 EVT ShuffleVT
= EVT::getVectorVT(*DAG
.getContext(), SmallestEltTy
, NumElts
);
10106 // If the source vector is too wide or too narrow, we may nevertheless be able
10107 // to construct a compatible shuffle either by concatenating it with UNDEF or
10108 // extracting a suitable range of elements.
10109 for (auto &Src
: Sources
) {
10110 EVT SrcVT
= Src
.ShuffleVec
.getValueType();
10112 TypeSize SrcVTSize
= SrcVT
.getSizeInBits();
10113 if (SrcVTSize
== TypeSize::Fixed(VTSize
))
10116 // This stage of the search produces a source with the same element type as
10117 // the original, but with a total width matching the BUILD_VECTOR output.
10118 EVT EltVT
= SrcVT
.getVectorElementType();
10119 unsigned NumSrcElts
= VTSize
/ EltVT
.getFixedSizeInBits();
10120 EVT DestVT
= EVT::getVectorVT(*DAG
.getContext(), EltVT
, NumSrcElts
);
10122 if (SrcVTSize
.getFixedValue() < VTSize
) {
10123 assert(2 * SrcVTSize
== VTSize
);
10124 // We can pad out the smaller vector for free, so if it's part of a
10127 DAG
.getNode(ISD::CONCAT_VECTORS
, dl
, DestVT
, Src
.ShuffleVec
,
10128 DAG
.getUNDEF(Src
.ShuffleVec
.getValueType()));
10132 if (SrcVTSize
.getFixedValue() != 2 * VTSize
) {
10134 dbgs() << "Reshuffle failed: result vector too small to extract\n");
10138 if (Src
.MaxElt
- Src
.MinElt
>= NumSrcElts
) {
10140 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
10144 if (Src
.MinElt
>= NumSrcElts
) {
10145 // The extraction can just take the second half
10147 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, DestVT
, Src
.ShuffleVec
,
10148 DAG
.getConstant(NumSrcElts
, dl
, MVT::i64
));
10149 Src
.WindowBase
= -NumSrcElts
;
10150 } else if (Src
.MaxElt
< NumSrcElts
) {
10151 // The extraction can just take the first half
10153 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, DestVT
, Src
.ShuffleVec
,
10154 DAG
.getConstant(0, dl
, MVT::i64
));
10156 // An actual VEXT is needed
10158 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, DestVT
, Src
.ShuffleVec
,
10159 DAG
.getConstant(0, dl
, MVT::i64
));
10161 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, DestVT
, Src
.ShuffleVec
,
10162 DAG
.getConstant(NumSrcElts
, dl
, MVT::i64
));
10163 unsigned Imm
= Src
.MinElt
* getExtFactor(VEXTSrc1
);
10165 if (!SrcVT
.is64BitVector()) {
10167 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
10168 "for SVE vectors.");
10172 Src
.ShuffleVec
= DAG
.getNode(AArch64ISD::EXT
, dl
, DestVT
, VEXTSrc1
,
10174 DAG
.getConstant(Imm
, dl
, MVT::i32
));
10175 Src
.WindowBase
= -Src
.MinElt
;
10179 // Another possible incompatibility occurs from the vector element types. We
10180 // can fix this by bitcasting the source vectors to the same type we intend
10181 // for the shuffle.
10182 for (auto &Src
: Sources
) {
10183 EVT SrcEltTy
= Src
.ShuffleVec
.getValueType().getVectorElementType();
10184 if (SrcEltTy
== SmallestEltTy
)
10186 assert(ShuffleVT
.getVectorElementType() == SmallestEltTy
);
10187 Src
.ShuffleVec
= DAG
.getNode(ISD::BITCAST
, dl
, ShuffleVT
, Src
.ShuffleVec
);
10189 SrcEltTy
.getFixedSizeInBits() / SmallestEltTy
.getFixedSizeInBits();
10190 Src
.WindowBase
*= Src
.WindowScale
;
10193 // Final check before we try to actually produce a shuffle.
10194 LLVM_DEBUG(for (auto Src
10196 assert(Src
.ShuffleVec
.getValueType() == ShuffleVT
););
10198 // The stars all align, our next step is to produce the mask for the shuffle.
10199 SmallVector
<int, 8> Mask(ShuffleVT
.getVectorNumElements(), -1);
10200 int BitsPerShuffleLane
= ShuffleVT
.getScalarSizeInBits();
10201 for (unsigned i
= 0; i
< VT
.getVectorNumElements(); ++i
) {
10202 SDValue Entry
= Op
.getOperand(i
);
10203 if (Entry
.isUndef())
10206 auto Src
= find(Sources
, Entry
.getOperand(0));
10207 int EltNo
= cast
<ConstantSDNode
>(Entry
.getOperand(1))->getSExtValue();
10209 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
10210 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
10212 EVT OrigEltTy
= Entry
.getOperand(0).getValueType().getVectorElementType();
10213 int BitsDefined
= std::min(OrigEltTy
.getScalarSizeInBits(),
10214 VT
.getScalarSizeInBits());
10215 int LanesDefined
= BitsDefined
/ BitsPerShuffleLane
;
10217 // This source is expected to fill ResMultiplier lanes of the final shuffle,
10218 // starting at the appropriate offset.
10219 int *LaneMask
= &Mask
[i
* ResMultiplier
];
10221 int ExtractBase
= EltNo
* Src
->WindowScale
+ Src
->WindowBase
;
10222 ExtractBase
+= NumElts
* (Src
- Sources
.begin());
10223 for (int j
= 0; j
< LanesDefined
; ++j
)
10224 LaneMask
[j
] = ExtractBase
+ j
;
10227 // Final check before we try to produce nonsense...
10228 if (!isShuffleMaskLegal(Mask
, ShuffleVT
)) {
10229 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
10233 SDValue ShuffleOps
[] = { DAG
.getUNDEF(ShuffleVT
), DAG
.getUNDEF(ShuffleVT
) };
10234 for (unsigned i
= 0; i
< Sources
.size(); ++i
)
10235 ShuffleOps
[i
] = Sources
[i
].ShuffleVec
;
10237 SDValue Shuffle
= DAG
.getVectorShuffle(ShuffleVT
, dl
, ShuffleOps
[0],
10238 ShuffleOps
[1], Mask
);
10239 SDValue V
= DAG
.getNode(ISD::BITCAST
, dl
, VT
, Shuffle
);
10241 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle
.dump();
10242 dbgs() << "Reshuffle, creating node: "; V
.dump(););
10247 // check if an EXT instruction can handle the shuffle mask when the
10248 // vector sources of the shuffle are the same.
10249 static bool isSingletonEXTMask(ArrayRef
<int> M
, EVT VT
, unsigned &Imm
) {
10250 unsigned NumElts
= VT
.getVectorNumElements();
10252 // Assume that the first shuffle index is not UNDEF. Fail if it is.
10258 // If this is a VEXT shuffle, the immediate value is the index of the first
10259 // element. The other shuffle indices must be the successive elements after
10261 unsigned ExpectedElt
= Imm
;
10262 for (unsigned i
= 1; i
< NumElts
; ++i
) {
10263 // Increment the expected index. If it wraps around, just follow it
10264 // back to index zero and keep going.
10266 if (ExpectedElt
== NumElts
)
10270 continue; // ignore UNDEF indices
10271 if (ExpectedElt
!= static_cast<unsigned>(M
[i
]))
10278 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
10279 // v4i32s. This is really a truncate, which we can construct out of (legal)
10280 // concats and truncate nodes.
10281 static SDValue
ReconstructTruncateFromBuildVector(SDValue V
, SelectionDAG
&DAG
) {
10282 if (V
.getValueType() != MVT::v16i8
)
10284 assert(V
.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
10286 for (unsigned X
= 0; X
< 4; X
++) {
10287 // Check the first item in each group is an extract from lane 0 of a v4i32
10289 SDValue BaseExt
= V
.getOperand(X
* 4);
10290 if (BaseExt
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
10291 (BaseExt
.getOperand(0).getValueType() != MVT::v4i16
&&
10292 BaseExt
.getOperand(0).getValueType() != MVT::v4i32
) ||
10293 !isa
<ConstantSDNode
>(BaseExt
.getOperand(1)) ||
10294 BaseExt
.getConstantOperandVal(1) != 0)
10296 SDValue Base
= BaseExt
.getOperand(0);
10297 // And check the other items are extracts from the same vector.
10298 for (unsigned Y
= 1; Y
< 4; Y
++) {
10299 SDValue Ext
= V
.getOperand(X
* 4 + Y
);
10300 if (Ext
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
10301 Ext
.getOperand(0) != Base
||
10302 !isa
<ConstantSDNode
>(Ext
.getOperand(1)) ||
10303 Ext
.getConstantOperandVal(1) != Y
)
10308 // Turn the buildvector into a series of truncates and concates, which will
10309 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
10310 // concat together to produce 2 v8i16. These are both truncated and concat
10313 SDValue Trunc
[4] = {
10314 V
.getOperand(0).getOperand(0), V
.getOperand(4).getOperand(0),
10315 V
.getOperand(8).getOperand(0), V
.getOperand(12).getOperand(0)};
10316 for (SDValue
&V
: Trunc
)
10317 if (V
.getValueType() == MVT::v4i32
)
10318 V
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::v4i16
, V
);
10320 DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, MVT::v8i16
, Trunc
[0], Trunc
[1]);
10322 DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, MVT::v8i16
, Trunc
[2], Trunc
[3]);
10323 SDValue Trunc0
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::v8i8
, Concat0
);
10324 SDValue Trunc1
= DAG
.getNode(ISD::TRUNCATE
, DL
, MVT::v8i8
, Concat1
);
10325 return DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, MVT::v16i8
, Trunc0
, Trunc1
);
10328 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
10329 /// element width than the vector lane type. If that is the case the function
10330 /// returns true and writes the value of the DUP instruction lane operand into
10332 static bool isWideDUPMask(ArrayRef
<int> M
, EVT VT
, unsigned BlockSize
,
10333 unsigned &DupLaneOp
) {
10334 assert((BlockSize
== 16 || BlockSize
== 32 || BlockSize
== 64) &&
10335 "Only possible block sizes for wide DUP are: 16, 32, 64");
10337 if (BlockSize
<= VT
.getScalarSizeInBits())
10339 if (BlockSize
% VT
.getScalarSizeInBits() != 0)
10341 if (VT
.getSizeInBits() % BlockSize
!= 0)
10344 size_t SingleVecNumElements
= VT
.getVectorNumElements();
10345 size_t NumEltsPerBlock
= BlockSize
/ VT
.getScalarSizeInBits();
10346 size_t NumBlocks
= VT
.getSizeInBits() / BlockSize
;
10348 // We are looking for masks like
10349 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
10350 // might be replaced by 'undefined'. BlockIndices will eventually contain
10351 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
10352 // for the above examples)
10353 SmallVector
<int, 8> BlockElts(NumEltsPerBlock
, -1);
10354 for (size_t BlockIndex
= 0; BlockIndex
< NumBlocks
; BlockIndex
++)
10355 for (size_t I
= 0; I
< NumEltsPerBlock
; I
++) {
10356 int Elt
= M
[BlockIndex
* NumEltsPerBlock
+ I
];
10359 // For now we don't support shuffles that use the second operand
10360 if ((unsigned)Elt
>= SingleVecNumElements
)
10362 if (BlockElts
[I
] < 0)
10363 BlockElts
[I
] = Elt
;
10364 else if (BlockElts
[I
] != Elt
)
10368 // We found a candidate block (possibly with some undefs). It must be a
10369 // sequence of consecutive integers starting with a value divisible by
10370 // NumEltsPerBlock with some values possibly replaced by undef-s.
10372 // Find first non-undef element
10373 auto FirstRealEltIter
= find_if(BlockElts
, [](int Elt
) { return Elt
>= 0; });
10374 assert(FirstRealEltIter
!= BlockElts
.end() &&
10375 "Shuffle with all-undefs must have been caught by previous cases, "
10377 if (FirstRealEltIter
== BlockElts
.end()) {
10382 // Index of FirstRealElt in BlockElts
10383 size_t FirstRealIndex
= FirstRealEltIter
- BlockElts
.begin();
10385 if ((unsigned)*FirstRealEltIter
< FirstRealIndex
)
10387 // BlockElts[0] must have the following value if it isn't undef:
10388 size_t Elt0
= *FirstRealEltIter
- FirstRealIndex
;
10390 // Check the first element
10391 if (Elt0
% NumEltsPerBlock
!= 0)
10393 // Check that the sequence indeed consists of consecutive integers (modulo
10395 for (size_t I
= 0; I
< NumEltsPerBlock
; I
++)
10396 if (BlockElts
[I
] >= 0 && (unsigned)BlockElts
[I
] != Elt0
+ I
)
10399 DupLaneOp
= Elt0
/ NumEltsPerBlock
;
10403 // check if an EXT instruction can handle the shuffle mask when the
10404 // vector sources of the shuffle are different.
10405 static bool isEXTMask(ArrayRef
<int> M
, EVT VT
, bool &ReverseEXT
,
10407 // Look for the first non-undef element.
10408 const int *FirstRealElt
= find_if(M
, [](int Elt
) { return Elt
>= 0; });
10410 // Benefit form APInt to handle overflow when calculating expected element.
10411 unsigned NumElts
= VT
.getVectorNumElements();
10412 unsigned MaskBits
= APInt(32, NumElts
* 2).logBase2();
10413 APInt ExpectedElt
= APInt(MaskBits
, *FirstRealElt
+ 1);
10414 // The following shuffle indices must be the successive elements after the
10415 // first real element.
10416 bool FoundWrongElt
= std::any_of(FirstRealElt
+ 1, M
.end(), [&](int Elt
) {
10417 return Elt
!= ExpectedElt
++ && Elt
!= -1;
10422 // The index of an EXT is the first element if it is not UNDEF.
10423 // Watch out for the beginning UNDEFs. The EXT index should be the expected
10424 // value of the first element. E.g.
10425 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
10426 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
10427 // ExpectedElt is the last mask index plus 1.
10428 Imm
= ExpectedElt
.getZExtValue();
10430 // There are two difference cases requiring to reverse input vectors.
10431 // For example, for vector <4 x i32> we have the following cases,
10432 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
10433 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
10434 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
10435 // to reverse two input vectors.
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}
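// Example of the check above (illustrative): for a v8i8 shuffle, the mask
// <7,6,5,4,3,2,1,0> is a REV64 (each 64-bit block reversed), while
// <1,0,3,2,5,4,7,6> is a REV16 (each 16-bit block reversed); undef entries
// are simply skipped.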
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != 2 * i + WhichResult)
      return false;
  }

  return true;
}
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}
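// Illustrative example (not from the original source): for v4i32, the mask
// <0, 4, 2, 6> matches with WhichResult == 0 (TRN1) and <1, 5, 3, 7> matches
// with WhichResult == 1 (TRN2).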
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
      return false;
    Idx += 1;
  }

  return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned j = 0; j != 2; ++j) {
    unsigned Idx = WhichResult;
    for (unsigned i = 0; i != Half; ++i) {
      int MIdx = M[i + j * Half];
      if (MIdx >= 0 && (unsigned)MIdx != Idx)
        return false;
      Idx += 2;
    }
  }

  return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
      return false;
  }
  return true;
}
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;

  for (int i = 0; i < NumInputElements; ++i) {
    if (M[i] == -1) {
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }

    if (M[i] == i)
      ++NumLHSMatch;
    else
      LastLHSMismatch = i;

    if (M[i] == i + NumInputElements)
      ++NumRHSMatch;
    else
      LastRHSMismatch = i;
  }

  if (NumLHSMatch == NumInputElements - 1) {
    DstIsLeft = true;
    Anomaly = LastLHSMismatch;
    return true;
  } else if (NumRHSMatch == NumInputElements - 1) {
    DstIsLeft = false;
    Anomaly = LastRHSMismatch;
    return true;
  }

  return false;
}
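// Illustrative example for isINSMask above (not from the original source):
// for two v4i32 inputs, the mask <0, 1, 6, 3> matches the left-hand source in
// every lane except lane 2, so DstIsLeft == true and Anomaly == 2; the caller
// then inserts element 6 - 4 == 2 of V2 into lane 2 of V1, i.e. a single INS.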
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  for (int I = 0, E = NumElts / 2; I != E; I++) {
    if (Mask[I] != I)
      return false;
  }

  int Offset = NumElts / 2;
  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
    if (Mask[I] != I + SplitLHS * Offset)
      return false;
  }

  return true;
}
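// Illustrative example (not from the original source): for a v4i32 shuffle of
// two 128-bit inputs, the mask <0, 1, 4, 5> satisfies isConcatMask with
// SplitLHS == true, and tryFormConcatFromShuffle below turns it into a
// CONCAT_VECTORS of the low 64-bit halves of V0 and V1.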
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
    return SDValue();

  bool SplitV0 = V0.getValueSizeInBits() == 128;

  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  if (SplitV0) {
    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  if (V1.getValueSizeInBits() == 128) {
    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
                     DAG.getConstant(0, DL, MVT::i64));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
10657 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10658 /// the specified operations to build the shuffle. ID is the perfect-shuffle
10659 //ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
10660 //table entry and LHS/RHS are the immediate inputs for this stage of the
10662 static SDValue
GeneratePerfectShuffle(unsigned ID
, SDValue V1
,
10663 SDValue V2
, unsigned PFEntry
, SDValue LHS
,
10664 SDValue RHS
, SelectionDAG
&DAG
,
10666 unsigned OpNum
= (PFEntry
>> 26) & 0x0F;
10667 unsigned LHSID
= (PFEntry
>> 13) & ((1 << 13) - 1);
10668 unsigned RHSID
= (PFEntry
>> 0) & ((1 << 13) - 1);
10671 OP_COPY
= 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
10680 OP_VUZPL
, // VUZP, left result
10681 OP_VUZPR
, // VUZP, right result
10682 OP_VZIPL
, // VZIP, left result
10683 OP_VZIPR
, // VZIP, right result
10684 OP_VTRNL
, // VTRN, left result
10685 OP_VTRNR
, // VTRN, right result
10686 OP_MOVLANE
// Move lane. RHSID is the lane to move into
10689 if (OpNum
== OP_COPY
) {
10690 if (LHSID
== (1 * 9 + 2) * 9 + 3)
10692 assert(LHSID
== ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
10696 if (OpNum
== OP_MOVLANE
) {
10697 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
10698 auto getPFIDLane
= [](unsigned ID
, int Elt
) -> int {
10699 assert(Elt
< 4 && "Expected Perfect Lanes to be less than 4");
10705 return (ID
% 9 == 8) ? -1 : ID
% 9;
10708 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
10709 // get the lane to move from from the PFID, which is always from the
10710 // original vectors (V1 or V2).
10711 SDValue OpLHS
= GeneratePerfectShuffle(
10712 LHSID
, V1
, V2
, PerfectShuffleTable
[LHSID
], LHS
, RHS
, DAG
, dl
);
10713 EVT VT
= OpLHS
.getValueType();
10714 assert(RHSID
< 8 && "Expected a lane index for RHSID!");
10715 unsigned ExtLane
= 0;
10718 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
10719 // convert into a higher type.
10721 int MaskElt
= getPFIDLane(ID
, (RHSID
& 0x01) << 1) >> 1;
10723 MaskElt
= (getPFIDLane(ID
, ((RHSID
& 0x01) << 1) + 1) - 1) >> 1;
10724 assert(MaskElt
>= 0 && "Didn't expect an undef movlane index!");
10725 ExtLane
= MaskElt
< 2 ? MaskElt
: (MaskElt
- 2);
10726 Input
= MaskElt
< 2 ? V1
: V2
;
10727 if (VT
.getScalarSizeInBits() == 16) {
10728 Input
= DAG
.getBitcast(MVT::v2f32
, Input
);
10729 OpLHS
= DAG
.getBitcast(MVT::v2f32
, OpLHS
);
10731 assert(VT
.getScalarSizeInBits() == 32 &&
10732 "Expected 16 or 32 bit shuffle elemements");
10733 Input
= DAG
.getBitcast(MVT::v2f64
, Input
);
10734 OpLHS
= DAG
.getBitcast(MVT::v2f64
, OpLHS
);
10737 int MaskElt
= getPFIDLane(ID
, RHSID
);
10738 assert(MaskElt
>= 0 && "Didn't expect an undef movlane index!");
10739 ExtLane
= MaskElt
< 4 ? MaskElt
: (MaskElt
- 4);
10740 Input
= MaskElt
< 4 ? V1
: V2
;
10741 // Be careful about creating illegal types. Use f16 instead of i16.
10742 if (VT
== MVT::v4i16
) {
10743 Input
= DAG
.getBitcast(MVT::v4f16
, Input
);
10744 OpLHS
= DAG
.getBitcast(MVT::v4f16
, OpLHS
);
10747 SDValue Ext
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
,
10748 Input
.getValueType().getVectorElementType(),
10749 Input
, DAG
.getVectorIdxConstant(ExtLane
, dl
));
10751 DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, Input
.getValueType(), OpLHS
,
10752 Ext
, DAG
.getVectorIdxConstant(RHSID
& 0x3, dl
));
10753 return DAG
.getBitcast(VT
, Ins
);
10756 SDValue OpLHS
, OpRHS
;
10757 OpLHS
= GeneratePerfectShuffle(LHSID
, V1
, V2
, PerfectShuffleTable
[LHSID
], LHS
,
10759 OpRHS
= GeneratePerfectShuffle(RHSID
, V1
, V2
, PerfectShuffleTable
[RHSID
], LHS
,
10761 EVT VT
= OpLHS
.getValueType();
10765 llvm_unreachable("Unknown shuffle opcode!");
10767 // VREV divides the vector in half and swaps within the half.
10768 if (VT
.getVectorElementType() == MVT::i32
||
10769 VT
.getVectorElementType() == MVT::f32
)
10770 return DAG
.getNode(AArch64ISD::REV64
, dl
, VT
, OpLHS
);
10771 // vrev <4 x i16> -> REV32
10772 if (VT
.getVectorElementType() == MVT::i16
||
10773 VT
.getVectorElementType() == MVT::f16
||
10774 VT
.getVectorElementType() == MVT::bf16
)
10775 return DAG
.getNode(AArch64ISD::REV32
, dl
, VT
, OpLHS
);
10776 // vrev <4 x i8> -> REV16
10777 assert(VT
.getVectorElementType() == MVT::i8
);
10778 return DAG
.getNode(AArch64ISD::REV16
, dl
, VT
, OpLHS
);
10783 EVT EltTy
= VT
.getVectorElementType();
10785 if (EltTy
== MVT::i8
)
10786 Opcode
= AArch64ISD::DUPLANE8
;
10787 else if (EltTy
== MVT::i16
|| EltTy
== MVT::f16
|| EltTy
== MVT::bf16
)
10788 Opcode
= AArch64ISD::DUPLANE16
;
10789 else if (EltTy
== MVT::i32
|| EltTy
== MVT::f32
)
10790 Opcode
= AArch64ISD::DUPLANE32
;
10791 else if (EltTy
== MVT::i64
|| EltTy
== MVT::f64
)
10792 Opcode
= AArch64ISD::DUPLANE64
;
10794 llvm_unreachable("Invalid vector element type?");
10796 if (VT
.getSizeInBits() == 64)
10797 OpLHS
= WidenVector(OpLHS
, DAG
);
10798 SDValue Lane
= DAG
.getConstant(OpNum
- OP_VDUP0
, dl
, MVT::i64
);
10799 return DAG
.getNode(Opcode
, dl
, VT
, OpLHS
, Lane
);
10804 unsigned Imm
= (OpNum
- OP_VEXT1
+ 1) * getExtFactor(OpLHS
);
10805 return DAG
.getNode(AArch64ISD::EXT
, dl
, VT
, OpLHS
, OpRHS
,
10806 DAG
.getConstant(Imm
, dl
, MVT::i32
));
10809 return DAG
.getNode(AArch64ISD::UZP1
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10812 return DAG
.getNode(AArch64ISD::UZP2
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10815 return DAG
.getNode(AArch64ISD::ZIP1
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10818 return DAG
.getNode(AArch64ISD::ZIP2
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10821 return DAG
.getNode(AArch64ISD::TRN1
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10824 return DAG
.getNode(AArch64ISD::TRN2
, dl
, DAG
.getVTList(VT
, VT
), OpLHS
,
10829 static SDValue
GenerateTBL(SDValue Op
, ArrayRef
<int> ShuffleMask
,
10830 SelectionDAG
&DAG
) {
10831 // Check to see if we can use the TBL instruction.
10832 SDValue V1
= Op
.getOperand(0);
10833 SDValue V2
= Op
.getOperand(1);
10836 EVT EltVT
= Op
.getValueType().getVectorElementType();
10837 unsigned BytesPerElt
= EltVT
.getSizeInBits() / 8;
10840 if (V1
.isUndef() || isZerosVector(V1
.getNode())) {
  // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
  // out of range values with 0s. We do need to make sure that any out-of-range
  // values are really out-of-range for a v16i8 vector.
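  // Illustrative example (not from the original source): for a v4i16 shuffle,
  // BytesPerElt == 2, so shuffle-mask value 5 expands to byte indices 10 and
  // 11 in the TBL index vector; TBL itself returns zero for any byte index
  // that is out of range of the table register(s).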
10848 bool IsUndefOrZero
= V2
.isUndef() || isZerosVector(V2
.getNode());
10849 MVT IndexVT
= MVT::v8i8
;
10850 unsigned IndexLen
= 8;
10851 if (Op
.getValueSizeInBits() == 128) {
10852 IndexVT
= MVT::v16i8
;
10856 SmallVector
<SDValue
, 8> TBLMask
;
10857 for (int Val
: ShuffleMask
) {
10858 for (unsigned Byte
= 0; Byte
< BytesPerElt
; ++Byte
) {
10859 unsigned Offset
= Byte
+ Val
* BytesPerElt
;
10861 Offset
= Offset
< IndexLen
? Offset
+ IndexLen
: Offset
- IndexLen
;
10862 if (IsUndefOrZero
&& Offset
>= IndexLen
)
10864 TBLMask
.push_back(DAG
.getConstant(Offset
, DL
, MVT::i32
));
10868 SDValue V1Cst
= DAG
.getNode(ISD::BITCAST
, DL
, IndexVT
, V1
);
10869 SDValue V2Cst
= DAG
.getNode(ISD::BITCAST
, DL
, IndexVT
, V2
);
10872 if (IsUndefOrZero
) {
10874 V1Cst
= DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, MVT::v16i8
, V1Cst
, V1Cst
);
10875 Shuffle
= DAG
.getNode(
10876 ISD::INTRINSIC_WO_CHAIN
, DL
, IndexVT
,
10877 DAG
.getConstant(Intrinsic::aarch64_neon_tbl1
, DL
, MVT::i32
), V1Cst
,
10878 DAG
.getBuildVector(IndexVT
, DL
,
10879 makeArrayRef(TBLMask
.data(), IndexLen
)));
10881 if (IndexLen
== 8) {
10882 V1Cst
= DAG
.getNode(ISD::CONCAT_VECTORS
, DL
, MVT::v16i8
, V1Cst
, V2Cst
);
10883 Shuffle
= DAG
.getNode(
10884 ISD::INTRINSIC_WO_CHAIN
, DL
, IndexVT
,
10885 DAG
.getConstant(Intrinsic::aarch64_neon_tbl1
, DL
, MVT::i32
), V1Cst
,
10886 DAG
.getBuildVector(IndexVT
, DL
,
10887 makeArrayRef(TBLMask
.data(), IndexLen
)));
10889 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
10890 // cannot currently represent the register constraints on the input
10891 // table registers.
10892 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
10893 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
10895 Shuffle
= DAG
.getNode(
10896 ISD::INTRINSIC_WO_CHAIN
, DL
, IndexVT
,
10897 DAG
.getConstant(Intrinsic::aarch64_neon_tbl2
, DL
, MVT::i32
), V1Cst
,
10898 V2Cst
, DAG
.getBuildVector(IndexVT
, DL
,
10899 makeArrayRef(TBLMask
.data(), IndexLen
)));
10902 return DAG
.getNode(ISD::BITCAST
, DL
, Op
.getValueType(), Shuffle
);
static unsigned getDUPLANEOp(EVT EltType) {
  if (EltType == MVT::i8)
    return AArch64ISD::DUPLANE8;
  if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
    return AArch64ISD::DUPLANE16;
  if (EltType == MVT::i32 || EltType == MVT::f32)
    return AArch64ISD::DUPLANE32;
  if (EltType == MVT::i64 || EltType == MVT::f64)
    return AArch64ISD::DUPLANE64;

  llvm_unreachable("Invalid vector element type?");
}
10918 static SDValue
constructDup(SDValue V
, int Lane
, SDLoc dl
, EVT VT
,
10919 unsigned Opcode
, SelectionDAG
&DAG
) {
10920 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
10921 auto getScaledOffsetDup
= [](SDValue BitCast
, int &LaneC
, MVT
&CastVT
) {
10922 // Match: dup (bitcast (extract_subv X, C)), LaneC
10923 if (BitCast
.getOpcode() != ISD::BITCAST
||
10924 BitCast
.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR
)
10927 // The extract index must align in the destination type. That may not
10928 // happen if the bitcast is from narrow to wide type.
10929 SDValue Extract
= BitCast
.getOperand(0);
10930 unsigned ExtIdx
= Extract
.getConstantOperandVal(1);
10931 unsigned SrcEltBitWidth
= Extract
.getScalarValueSizeInBits();
10932 unsigned ExtIdxInBits
= ExtIdx
* SrcEltBitWidth
;
10933 unsigned CastedEltBitWidth
= BitCast
.getScalarValueSizeInBits();
10934 if (ExtIdxInBits
% CastedEltBitWidth
!= 0)
10937 // Can't handle cases where vector size is not 128-bit
10938 if (!Extract
.getOperand(0).getValueType().is128BitVector())
10941 // Update the lane value by offsetting with the scaled extract index.
10942 LaneC
+= ExtIdxInBits
/ CastedEltBitWidth
;
10944 // Determine the casted vector type of the wide vector input.
10945 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
10947 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
10948 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
10949 unsigned SrcVecNumElts
=
10950 Extract
.getOperand(0).getValueSizeInBits() / CastedEltBitWidth
;
10951 CastVT
= MVT::getVectorVT(BitCast
.getSimpleValueType().getScalarType(),
10956 if (getScaledOffsetDup(V
, Lane
, CastVT
)) {
10957 V
= DAG
.getBitcast(CastVT
, V
.getOperand(0).getOperand(0));
10958 } else if (V
.getOpcode() == ISD::EXTRACT_SUBVECTOR
&&
10959 V
.getOperand(0).getValueType().is128BitVector()) {
10960 // The lane is incremented by the index of the extract.
10961 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
10962 Lane
+= V
.getConstantOperandVal(1);
10963 V
= V
.getOperand(0);
10964 } else if (V
.getOpcode() == ISD::CONCAT_VECTORS
) {
10965 // The lane is decremented if we are splatting from the 2nd operand.
10966 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
10967 unsigned Idx
= Lane
>= (int)VT
.getVectorNumElements() / 2;
10968 Lane
-= Idx
* VT
.getVectorNumElements() / 2;
10969 V
= WidenVector(V
.getOperand(Idx
), DAG
);
10970 } else if (VT
.getSizeInBits() == 64) {
10971 // Widen the operand to 128-bit register with undef.
10972 V
= WidenVector(V
, DAG
);
10974 return DAG
.getNode(Opcode
, dl
, VT
, V
, DAG
.getConstant(Lane
, dl
, MVT::i64
));
// Return true if we can get a new shuffle mask by checking the parameter mask
// array to test whether every pair of adjacent mask values is consecutive and
// starts from an even number.
static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
                           SmallVectorImpl<int> &NewMask) {
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts % 2 != 0)
    return false;

  for (unsigned i = 0; i < NumElts; i += 2) {
    int M0 = M[i];
    int M1 = M[i + 1];

    // If both elements are undef, new mask is undef too.
    if (M0 == -1 && M1 == -1) {
      NewMask.push_back(-1);
      continue;
    }

    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
      NewMask.push_back(M1 / 2);
      continue;
    }

    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);
      continue;
    }

    NewMask.clear();
    return false;
  }

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  return true;
}
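// Illustrative example (not from the original source): on a v4i32 shuffle,
// the mask <0, 1, -1, -1> widens to the v2i64 mask <0, -1>, while
// <1, 2, 3, 4> is rejected because the pair (1, 2) does not start at an even
// element.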
// Try to widen element type to get a new mask value for a better permutation
// sequence, so that we can use NEON shuffle instructions, such as ZIP1/2,
// UZP1/2, TRN1/2, REV, INS, etc.
// For example:
//  shufflevector <4 x i32> %a, <4 x i32> %b,
//                <4 x i32> <i32 6, i32 7, i32 2, i32 3>
// is equivalent to:
//  shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
// Finally, we can get:
//  mov v0.d[0], v1.d[1]
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ScalarVT = VT.getVectorElementType();
  unsigned ElementSize = ScalarVT.getFixedSizeInBits();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  // When combining adjacent elements, like two i16's -> i32 or two i32's ->
  // i64, the wider element type must be legal. Thus, ElementSize should not be
  // larger than 32 bits, and the i1 type should also be excluded.
  if (ElementSize > 32 || ElementSize == 1)
    return SDValue();

  SmallVector<int, 8> NewMask;
  if (isWideTypeMask(Mask, VT, NewMask)) {
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(ElementSize * 2)
                       : MVT::getIntegerVT(ElementSize * 2);
    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT,
                            DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
    }
  }

  return SDValue();
}
11057 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
11058 static SDValue
tryToConvertShuffleOfTbl2ToTbl4(SDValue Op
,
11059 ArrayRef
<int> ShuffleMask
,
11060 SelectionDAG
&DAG
) {
11061 SDValue Tbl1
= Op
->getOperand(0);
11062 SDValue Tbl2
= Op
->getOperand(1);
11065 DAG
.getTargetConstant(Intrinsic::aarch64_neon_tbl2
, dl
, MVT::i64
);
11067 EVT VT
= Op
.getValueType();
11068 if (Tbl1
->getOpcode() != ISD::INTRINSIC_WO_CHAIN
||
11069 Tbl1
->getOperand(0) != Tbl2ID
||
11070 Tbl2
->getOpcode() != ISD::INTRINSIC_WO_CHAIN
||
11071 Tbl2
->getOperand(0) != Tbl2ID
)
11074 if (Tbl1
->getValueType(0) != MVT::v16i8
||
11075 Tbl2
->getValueType(0) != MVT::v16i8
)
11078 SDValue Mask1
= Tbl1
->getOperand(3);
11079 SDValue Mask2
= Tbl2
->getOperand(3);
11080 SmallVector
<SDValue
, 16> TBLMaskParts(16, SDValue());
11081 for (unsigned I
= 0; I
< 16; I
++) {
11082 if (ShuffleMask
[I
] < 16)
11083 TBLMaskParts
[I
] = Mask1
->getOperand(ShuffleMask
[I
]);
11086 dyn_cast
<ConstantSDNode
>(Mask2
->getOperand(ShuffleMask
[I
] - 16));
11089 TBLMaskParts
[I
] = DAG
.getConstant(C
->getSExtValue() + 32, dl
, MVT::i32
);
11093 SDValue TBLMask
= DAG
.getBuildVector(VT
, dl
, TBLMaskParts
);
11095 DAG
.getTargetConstant(Intrinsic::aarch64_neon_tbl4
, dl
, MVT::i64
);
11097 return DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, dl
, MVT::v16i8
,
11098 {ID
, Tbl1
->getOperand(1), Tbl1
->getOperand(2),
11099 Tbl2
->getOperand(1), Tbl2
->getOperand(2), TBLMask
});
11102 SDValue
AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op
,
11103 SelectionDAG
&DAG
) const {
11105 EVT VT
= Op
.getValueType();
11107 ShuffleVectorSDNode
*SVN
= cast
<ShuffleVectorSDNode
>(Op
.getNode());
11109 if (useSVEForFixedLengthVectorVT(VT
,
11110 Subtarget
->forceStreamingCompatibleSVE()))
11111 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op
, DAG
);
11113 // Convert shuffles that are directly supported on NEON to target-specific
11114 // DAG nodes, instead of keeping them as shuffles and matching them again
11115 // during code selection. This is more efficient and avoids the possibility
11116 // of inconsistencies between legalization and selection.
11117 ArrayRef
<int> ShuffleMask
= SVN
->getMask();
11119 SDValue V1
= Op
.getOperand(0);
11120 SDValue V2
= Op
.getOperand(1);
11122 assert(V1
.getValueType() == VT
&& "Unexpected VECTOR_SHUFFLE type!");
11123 assert(ShuffleMask
.size() == VT
.getVectorNumElements() &&
11124 "Unexpected VECTOR_SHUFFLE mask size!");
11126 if (SDValue Res
= tryToConvertShuffleOfTbl2ToTbl4(Op
, ShuffleMask
, DAG
))
11129 if (SVN
->isSplat()) {
11130 int Lane
= SVN
->getSplatIndex();
11131 // If this is undef splat, generate it via "just" vdup, if possible.
11135 if (Lane
== 0 && V1
.getOpcode() == ISD::SCALAR_TO_VECTOR
)
11136 return DAG
.getNode(AArch64ISD::DUP
, dl
, V1
.getValueType(),
11138 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
11139 // constant. If so, we can just reference the lane's definition directly.
11140 if (V1
.getOpcode() == ISD::BUILD_VECTOR
&&
11141 !isa
<ConstantSDNode
>(V1
.getOperand(Lane
)))
11142 return DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, V1
.getOperand(Lane
));
11144 // Otherwise, duplicate from the lane of the input vector.
11145 unsigned Opcode
= getDUPLANEOp(V1
.getValueType().getVectorElementType());
11146 return constructDup(V1
, Lane
, dl
, VT
, Opcode
, DAG
);
11149 // Check if the mask matches a DUP for a wider element
11150 for (unsigned LaneSize
: {64U, 32U, 16U}) {
11152 if (isWideDUPMask(ShuffleMask
, VT
, LaneSize
, Lane
)) {
11153 unsigned Opcode
= LaneSize
== 64 ? AArch64ISD::DUPLANE64
11154 : LaneSize
== 32 ? AArch64ISD::DUPLANE32
11155 : AArch64ISD::DUPLANE16
;
11156 // Cast V1 to an integer vector with required lane size
11157 MVT NewEltTy
= MVT::getIntegerVT(LaneSize
);
11158 unsigned NewEltCount
= VT
.getSizeInBits() / LaneSize
;
11159 MVT NewVecTy
= MVT::getVectorVT(NewEltTy
, NewEltCount
);
11160 V1
= DAG
.getBitcast(NewVecTy
, V1
);
11161 // Constuct the DUP instruction
11162 V1
= constructDup(V1
, Lane
, dl
, NewVecTy
, Opcode
, DAG
);
11163 // Cast back to the original type
11164 return DAG
.getBitcast(VT
, V1
);
11168 if (isREVMask(ShuffleMask
, VT
, 64))
11169 return DAG
.getNode(AArch64ISD::REV64
, dl
, V1
.getValueType(), V1
, V2
);
11170 if (isREVMask(ShuffleMask
, VT
, 32))
11171 return DAG
.getNode(AArch64ISD::REV32
, dl
, V1
.getValueType(), V1
, V2
);
11172 if (isREVMask(ShuffleMask
, VT
, 16))
11173 return DAG
.getNode(AArch64ISD::REV16
, dl
, V1
.getValueType(), V1
, V2
);
11175 if (((VT
.getVectorNumElements() == 8 && VT
.getScalarSizeInBits() == 16) ||
11176 (VT
.getVectorNumElements() == 16 && VT
.getScalarSizeInBits() == 8)) &&
11177 ShuffleVectorInst::isReverseMask(ShuffleMask
)) {
11178 SDValue Rev
= DAG
.getNode(AArch64ISD::REV64
, dl
, VT
, V1
);
11179 return DAG
.getNode(AArch64ISD::EXT
, dl
, VT
, Rev
, Rev
,
11180 DAG
.getConstant(8, dl
, MVT::i32
));
11183 bool ReverseEXT
= false;
11185 if (isEXTMask(ShuffleMask
, VT
, ReverseEXT
, Imm
)) {
11188 Imm
*= getExtFactor(V1
);
11189 return DAG
.getNode(AArch64ISD::EXT
, dl
, V1
.getValueType(), V1
, V2
,
11190 DAG
.getConstant(Imm
, dl
, MVT::i32
));
11191 } else if (V2
->isUndef() && isSingletonEXTMask(ShuffleMask
, VT
, Imm
)) {
11192 Imm
*= getExtFactor(V1
);
11193 return DAG
.getNode(AArch64ISD::EXT
, dl
, V1
.getValueType(), V1
, V1
,
11194 DAG
.getConstant(Imm
, dl
, MVT::i32
));
11197 unsigned WhichResult
;
11198 if (isZIPMask(ShuffleMask
, VT
, WhichResult
)) {
11199 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::ZIP1
: AArch64ISD::ZIP2
;
11200 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
11202 if (isUZPMask(ShuffleMask
, VT
, WhichResult
)) {
11203 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::UZP1
: AArch64ISD::UZP2
;
11204 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
11206 if (isTRNMask(ShuffleMask
, VT
, WhichResult
)) {
11207 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::TRN1
: AArch64ISD::TRN2
;
11208 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V2
);
11211 if (isZIP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
11212 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::ZIP1
: AArch64ISD::ZIP2
;
11213 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
11215 if (isUZP_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
11216 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::UZP1
: AArch64ISD::UZP2
;
11217 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
11219 if (isTRN_v_undef_Mask(ShuffleMask
, VT
, WhichResult
)) {
11220 unsigned Opc
= (WhichResult
== 0) ? AArch64ISD::TRN1
: AArch64ISD::TRN2
;
11221 return DAG
.getNode(Opc
, dl
, V1
.getValueType(), V1
, V1
);
11224 if (SDValue Concat
= tryFormConcatFromShuffle(Op
, DAG
))
11229 int NumInputElements
= V1
.getValueType().getVectorNumElements();
11230 if (isINSMask(ShuffleMask
, NumInputElements
, DstIsLeft
, Anomaly
)) {
11231 SDValue DstVec
= DstIsLeft
? V1
: V2
;
11232 SDValue DstLaneV
= DAG
.getConstant(Anomaly
, dl
, MVT::i64
);
11234 SDValue SrcVec
= V1
;
11235 int SrcLane
= ShuffleMask
[Anomaly
];
11236 if (SrcLane
>= NumInputElements
) {
11238 SrcLane
-= VT
.getVectorNumElements();
11240 SDValue SrcLaneV
= DAG
.getConstant(SrcLane
, dl
, MVT::i64
);
11242 EVT ScalarVT
= VT
.getVectorElementType();
11244 if (ScalarVT
.getFixedSizeInBits() < 32 && ScalarVT
.isInteger())
11245 ScalarVT
= MVT::i32
;
11247 return DAG
.getNode(
11248 ISD::INSERT_VECTOR_ELT
, dl
, VT
, DstVec
,
11249 DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, dl
, ScalarVT
, SrcVec
, SrcLaneV
),
11253 if (SDValue NewSD
= tryWidenMaskForShuffle(Op
, DAG
))
11256 // If the shuffle is not directly supported and it has 4 elements, use
11257 // the PerfectShuffle-generated table to synthesize it from other shuffles.
11258 unsigned NumElts
= VT
.getVectorNumElements();
11259 if (NumElts
== 4) {
11260 unsigned PFIndexes
[4];
11261 for (unsigned i
= 0; i
!= 4; ++i
) {
11262 if (ShuffleMask
[i
] < 0)
11265 PFIndexes
[i
] = ShuffleMask
[i
];
    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
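    // Illustrative example (not from the original source): undef lanes were
    // encoded above as index 8, so each lane contributes one base-9 digit;
    // for the mask <1, 2, 3, 0> the index is 1*729 + 2*81 + 3*9 + 0 == 918.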
11271 unsigned PFEntry
= PerfectShuffleTable
[PFTableIndex
];
11272 return GeneratePerfectShuffle(PFTableIndex
, V1
, V2
, PFEntry
, V1
, V2
, DAG
,
11276 return GenerateTBL(Op
, ShuffleMask
, DAG
);
11279 SDValue
AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op
,
11280 SelectionDAG
&DAG
) const {
11281 EVT VT
= Op
.getValueType();
11283 if (useSVEForFixedLengthVectorVT(VT
))
11284 return LowerToScalableOp(Op
, DAG
);
11286 assert(VT
.isScalableVector() && VT
.getVectorElementType() == MVT::i1
&&
11287 "Unexpected vector type!");
11289 // We can handle the constant cases during isel.
11290 if (isa
<ConstantSDNode
>(Op
.getOperand(0)))
11293 // There isn't a natural way to handle the general i1 case, so we use some
11294 // trickery with whilelo.
11296 SDValue SplatVal
= DAG
.getAnyExtOrTrunc(Op
.getOperand(0), DL
, MVT::i64
);
11297 SplatVal
= DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, MVT::i64
, SplatVal
,
11298 DAG
.getValueType(MVT::i1
));
11300 DAG
.getTargetConstant(Intrinsic::aarch64_sve_whilelo
, DL
, MVT::i64
);
11301 SDValue Zero
= DAG
.getConstant(0, DL
, MVT::i64
);
11302 if (VT
== MVT::nxv1i1
)
11303 return DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, DL
, MVT::nxv1i1
,
11304 DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, MVT::nxv2i1
, ID
,
11307 return DAG
.getNode(ISD::INTRINSIC_WO_CHAIN
, DL
, VT
, ID
, Zero
, SplatVal
);
11310 SDValue
AArch64TargetLowering::LowerDUPQLane(SDValue Op
,
11311 SelectionDAG
&DAG
) const {
11314 EVT VT
= Op
.getValueType();
11315 if (!isTypeLegal(VT
) || !VT
.isScalableVector())
11318 // Current lowering only supports the SVE-ACLE types.
11319 if (VT
.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock
)
11322 // The DUPQ operation is indepedent of element type so normalise to i64s.
11323 SDValue Idx128
= Op
.getOperand(2);
11325 // DUPQ can be used when idx is in range.
11326 auto *CIdx
= dyn_cast
<ConstantSDNode
>(Idx128
);
11327 if (CIdx
&& (CIdx
->getZExtValue() <= 3)) {
11328 SDValue CI
= DAG
.getTargetConstant(CIdx
->getZExtValue(), DL
, MVT::i64
);
11329 return DAG
.getNode(AArch64ISD::DUPLANE128
, DL
, VT
, Op
.getOperand(1), CI
);
11332 SDValue V
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::nxv2i64
, Op
.getOperand(1));
11334 // The ACLE says this must produce the same result as:
11335 // svtbl(data, svadd_x(svptrue_b64(),
11336 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
11338 SDValue One
= DAG
.getConstant(1, DL
, MVT::i64
);
11339 SDValue SplatOne
= DAG
.getNode(ISD::SPLAT_VECTOR
, DL
, MVT::nxv2i64
, One
);
11341 // create the vector 0,1,0,1,...
11342 SDValue SV
= DAG
.getStepVector(DL
, MVT::nxv2i64
);
11343 SV
= DAG
.getNode(ISD::AND
, DL
, MVT::nxv2i64
, SV
, SplatOne
);
11345 // create the vector idx64,idx64+1,idx64,idx64+1,...
11346 SDValue Idx64
= DAG
.getNode(ISD::ADD
, DL
, MVT::i64
, Idx128
, Idx128
);
11347 SDValue SplatIdx64
= DAG
.getNode(ISD::SPLAT_VECTOR
, DL
, MVT::nxv2i64
, Idx64
);
11348 SDValue ShuffleMask
= DAG
.getNode(ISD::ADD
, DL
, MVT::nxv2i64
, SV
, SplatIdx64
);
11350 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
11351 SDValue TBL
= DAG
.getNode(AArch64ISD::TBL
, DL
, MVT::nxv2i64
, V
, ShuffleMask
);
11352 return DAG
.getNode(ISD::BITCAST
, DL
, VT
, TBL
);
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  EVT VT = BVN->getValueType(0);
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

    for (unsigned i = 0; i < NumSplats; ++i) {
      CnstBits <<= SplatBitSize;
      UndefBits <<= SplatBitSize;
      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
    }

    return true;
  }

  return false;
}
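// Illustrative example for resolveBuildVector above (not from the original
// source): for a v2i64 BUILD_VECTOR splat of 0x00FF00FF00FF00FF,
// isConstantSplat reports SplatBitSize == 64, so the loop runs twice and
// CnstBits ends up holding that 64-bit pattern replicated across the full
// 128-bit width of the vector.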
// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
  if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
    uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
    EVT VT = Op.getValueType();
    MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;

    if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
      Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);

      SDLoc dl(Op);
      SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                                DAG.getConstant(Value, dl, MVT::i32));
      return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
    }
  }

  return SDValue();
}
11399 // Try 32-bit splatted SIMD immediate.
11400 static SDValue
tryAdvSIMDModImm32(unsigned NewOp
, SDValue Op
, SelectionDAG
&DAG
,
11402 const SDValue
*LHS
= nullptr) {
11403 if (Bits
.getHiBits(64) == Bits
.getLoBits(64)) {
11404 uint64_t Value
= Bits
.zextOrTrunc(64).getZExtValue();
11405 EVT VT
= Op
.getValueType();
11406 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
11407 bool isAdvSIMDModImm
= false;
11410 if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType1(Value
))) {
11411 Value
= AArch64_AM::encodeAdvSIMDModImmType1(Value
);
11414 else if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType2(Value
))) {
11415 Value
= AArch64_AM::encodeAdvSIMDModImmType2(Value
);
11418 else if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType3(Value
))) {
11419 Value
= AArch64_AM::encodeAdvSIMDModImmType3(Value
);
11422 else if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType4(Value
))) {
11423 Value
= AArch64_AM::encodeAdvSIMDModImmType4(Value
);
11427 if (isAdvSIMDModImm
) {
11432 Mov
= DAG
.getNode(NewOp
, dl
, MovTy
, *LHS
,
11433 DAG
.getConstant(Value
, dl
, MVT::i32
),
11434 DAG
.getConstant(Shift
, dl
, MVT::i32
));
11436 Mov
= DAG
.getNode(NewOp
, dl
, MovTy
,
11437 DAG
.getConstant(Value
, dl
, MVT::i32
),
11438 DAG
.getConstant(Shift
, dl
, MVT::i32
));
11440 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
11447 // Try 16-bit splatted SIMD immediate.
11448 static SDValue
tryAdvSIMDModImm16(unsigned NewOp
, SDValue Op
, SelectionDAG
&DAG
,
11450 const SDValue
*LHS
= nullptr) {
11451 if (Bits
.getHiBits(64) == Bits
.getLoBits(64)) {
11452 uint64_t Value
= Bits
.zextOrTrunc(64).getZExtValue();
11453 EVT VT
= Op
.getValueType();
11454 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v8i16
: MVT::v4i16
;
11455 bool isAdvSIMDModImm
= false;
11458 if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType5(Value
))) {
11459 Value
= AArch64_AM::encodeAdvSIMDModImmType5(Value
);
11462 else if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType6(Value
))) {
11463 Value
= AArch64_AM::encodeAdvSIMDModImmType6(Value
);
11467 if (isAdvSIMDModImm
) {
11472 Mov
= DAG
.getNode(NewOp
, dl
, MovTy
, *LHS
,
11473 DAG
.getConstant(Value
, dl
, MVT::i32
),
11474 DAG
.getConstant(Shift
, dl
, MVT::i32
));
11476 Mov
= DAG
.getNode(NewOp
, dl
, MovTy
,
11477 DAG
.getConstant(Value
, dl
, MVT::i32
),
11478 DAG
.getConstant(Shift
, dl
, MVT::i32
));
11480 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
11487 // Try 32-bit splatted SIMD immediate with shifted ones.
11488 static SDValue
tryAdvSIMDModImm321s(unsigned NewOp
, SDValue Op
,
11489 SelectionDAG
&DAG
, const APInt
&Bits
) {
11490 if (Bits
.getHiBits(64) == Bits
.getLoBits(64)) {
11491 uint64_t Value
= Bits
.zextOrTrunc(64).getZExtValue();
11492 EVT VT
= Op
.getValueType();
11493 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v4i32
: MVT::v2i32
;
11494 bool isAdvSIMDModImm
= false;
11497 if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType7(Value
))) {
11498 Value
= AArch64_AM::encodeAdvSIMDModImmType7(Value
);
11501 else if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType8(Value
))) {
11502 Value
= AArch64_AM::encodeAdvSIMDModImmType8(Value
);
11506 if (isAdvSIMDModImm
) {
11508 SDValue Mov
= DAG
.getNode(NewOp
, dl
, MovTy
,
11509 DAG
.getConstant(Value
, dl
, MVT::i32
),
11510 DAG
.getConstant(Shift
, dl
, MVT::i32
));
11511 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
11518 // Try 8-bit splatted SIMD immediate.
11519 static SDValue
tryAdvSIMDModImm8(unsigned NewOp
, SDValue Op
, SelectionDAG
&DAG
,
11520 const APInt
&Bits
) {
11521 if (Bits
.getHiBits(64) == Bits
.getLoBits(64)) {
11522 uint64_t Value
= Bits
.zextOrTrunc(64).getZExtValue();
11523 EVT VT
= Op
.getValueType();
11524 MVT MovTy
= (VT
.getSizeInBits() == 128) ? MVT::v16i8
: MVT::v8i8
;
11526 if (AArch64_AM::isAdvSIMDModImmType9(Value
)) {
11527 Value
= AArch64_AM::encodeAdvSIMDModImmType9(Value
);
11530 SDValue Mov
= DAG
.getNode(NewOp
, dl
, MovTy
,
11531 DAG
.getConstant(Value
, dl
, MVT::i32
));
11532 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
11539 // Try FP splatted SIMD immediate.
11540 static SDValue
tryAdvSIMDModImmFP(unsigned NewOp
, SDValue Op
, SelectionDAG
&DAG
,
11541 const APInt
&Bits
) {
11542 if (Bits
.getHiBits(64) == Bits
.getLoBits(64)) {
11543 uint64_t Value
= Bits
.zextOrTrunc(64).getZExtValue();
11544 EVT VT
= Op
.getValueType();
11545 bool isWide
= (VT
.getSizeInBits() == 128);
11547 bool isAdvSIMDModImm
= false;
11549 if ((isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType11(Value
))) {
11550 Value
= AArch64_AM::encodeAdvSIMDModImmType11(Value
);
11551 MovTy
= isWide
? MVT::v4f32
: MVT::v2f32
;
11554 (isAdvSIMDModImm
= AArch64_AM::isAdvSIMDModImmType12(Value
))) {
11555 Value
= AArch64_AM::encodeAdvSIMDModImmType12(Value
);
11556 MovTy
= MVT::v2f64
;
11559 if (isAdvSIMDModImm
) {
11561 SDValue Mov
= DAG
.getNode(NewOp
, dl
, MovTy
,
11562 DAG
.getConstant(Value
, dl
, MVT::i32
));
11563 return DAG
.getNode(AArch64ISD::NVCAST
, dl
, VT
, Mov
);
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}
11590 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
11591 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
11592 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
11593 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
11594 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
11595 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
11596 static SDValue
tryLowerToSLI(SDNode
*N
, SelectionDAG
&DAG
) {
11597 EVT VT
= N
->getValueType(0);
11599 if (!VT
.isVector())
11607 SDValue FirstOp
= N
->getOperand(0);
11608 unsigned FirstOpc
= FirstOp
.getOpcode();
11609 SDValue SecondOp
= N
->getOperand(1);
11610 unsigned SecondOpc
= SecondOp
.getOpcode();
11612 // Is one of the operands an AND or a BICi? The AND may have been optimised to
11613 // a BICi in order to use an immediate instead of a register.
11614 // Is the other operand an shl or lshr? This will have been turned into:
11615 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
11616 if ((FirstOpc
== ISD::AND
|| FirstOpc
== AArch64ISD::BICi
) &&
11617 (SecondOpc
== AArch64ISD::VSHL
|| SecondOpc
== AArch64ISD::VLSHR
)) {
11621 } else if ((SecondOpc
== ISD::AND
|| SecondOpc
== AArch64ISD::BICi
) &&
11622 (FirstOpc
== AArch64ISD::VSHL
|| FirstOpc
== AArch64ISD::VLSHR
)) {
11628 bool IsAnd
= And
.getOpcode() == ISD::AND
;
11629 bool IsShiftRight
= Shift
.getOpcode() == AArch64ISD::VLSHR
;
11631 // Is the shift amount constant?
11632 ConstantSDNode
*C2node
= dyn_cast
<ConstantSDNode
>(Shift
.getOperand(1));
11638 // Is the and mask vector all constant?
11639 if (!isAllConstantBuildVector(And
.getOperand(1), C1
))
11642 // Reconstruct the corresponding AND immediate from the two BICi immediates.
11643 ConstantSDNode
*C1nodeImm
= dyn_cast
<ConstantSDNode
>(And
.getOperand(1));
11644 ConstantSDNode
*C1nodeShift
= dyn_cast
<ConstantSDNode
>(And
.getOperand(2));
11645 assert(C1nodeImm
&& C1nodeShift
);
11646 C1
= ~(C1nodeImm
->getZExtValue() << C1nodeShift
->getZExtValue());
11649 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
11650 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
11651 // how much one can shift elements of a particular size?
11652 uint64_t C2
= C2node
->getZExtValue();
11653 unsigned ElemSizeInBits
= VT
.getScalarSizeInBits();
11654 if (C2
> ElemSizeInBits
)
11657 APInt
C1AsAPInt(ElemSizeInBits
, C1
);
11658 APInt RequiredC1
= IsShiftRight
? APInt::getHighBitsSet(ElemSizeInBits
, C2
)
11659 : APInt::getLowBitsSet(ElemSizeInBits
, C2
);
11660 if (C1AsAPInt
!= RequiredC1
)
11663 SDValue X
= And
.getOperand(0);
11664 SDValue Y
= Shift
.getOperand(0);
11666 unsigned Inst
= IsShiftRight
? AArch64ISD::VSRI
: AArch64ISD::VSLI
;
11667 SDValue ResultSLI
= DAG
.getNode(Inst
, DL
, VT
, X
, Y
, Shift
.getOperand(1));
11669 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
11670 LLVM_DEBUG(N
->dump(&DAG
));
11671 LLVM_DEBUG(dbgs() << "into: \n");
11672 LLVM_DEBUG(ResultSLI
->dump(&DAG
));
11678 SDValue
AArch64TargetLowering::LowerVectorOR(SDValue Op
,
11679 SelectionDAG
&DAG
) const {
11680 if (useSVEForFixedLengthVectorVT(Op
.getValueType()))
11681 return LowerToScalableOp(Op
, DAG
);
11683 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
11684 if (SDValue Res
= tryLowerToSLI(Op
.getNode(), DAG
))
11687 EVT VT
= Op
.getValueType();
11689 SDValue LHS
= Op
.getOperand(0);
11690 BuildVectorSDNode
*BVN
=
11691 dyn_cast
<BuildVectorSDNode
>(Op
.getOperand(1).getNode());
11693 // OR commutes, so try swapping the operands.
11694 LHS
= Op
.getOperand(1);
11695 BVN
= dyn_cast
<BuildVectorSDNode
>(Op
.getOperand(0).getNode());
11700 APInt
DefBits(VT
.getSizeInBits(), 0);
11701 APInt
UndefBits(VT
.getSizeInBits(), 0);
11702 if (resolveBuildVector(BVN
, DefBits
, UndefBits
)) {
11705 if ((NewOp
= tryAdvSIMDModImm32(AArch64ISD::ORRi
, Op
, DAG
,
11707 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::ORRi
, Op
, DAG
,
11711 if ((NewOp
= tryAdvSIMDModImm32(AArch64ISD::ORRi
, Op
, DAG
,
11712 UndefBits
, &LHS
)) ||
11713 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::ORRi
, Op
, DAG
,
11718 // We can always fall back to a non-immediate OR.
11722 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
11723 // be truncated to fit element width.
11724 static SDValue
NormalizeBuildVector(SDValue Op
,
11725 SelectionDAG
&DAG
) {
11726 assert(Op
.getOpcode() == ISD::BUILD_VECTOR
&& "Unknown opcode!");
11728 EVT VT
= Op
.getValueType();
11729 EVT EltTy
= VT
.getVectorElementType();
11731 if (EltTy
.isFloatingPoint() || EltTy
.getSizeInBits() > 16)
11734 SmallVector
<SDValue
, 16> Ops
;
11735 for (SDValue Lane
: Op
->ops()) {
11736 // For integer vectors, type legalization would have promoted the
11737 // operands already. Otherwise, if Op is a floating-point splat
11738 // (with operands cast to integers), then the only possibilities
11739 // are constants and UNDEFs.
11740 if (auto *CstLane
= dyn_cast
<ConstantSDNode
>(Lane
)) {
11741 APInt
LowBits(EltTy
.getSizeInBits(),
11742 CstLane
->getZExtValue());
11743 Lane
= DAG
.getConstant(LowBits
.getZExtValue(), dl
, MVT::i32
);
11744 } else if (Lane
.getNode()->isUndef()) {
11745 Lane
= DAG
.getUNDEF(MVT::i32
);
11747 assert(Lane
.getValueType() == MVT::i32
&&
11748 "Unexpected BUILD_VECTOR operand type");
11750 Ops
.push_back(Lane
);
11752 return DAG
.getBuildVector(VT
, dl
, Ops
);
11755 static SDValue
ConstantBuildVector(SDValue Op
, SelectionDAG
&DAG
) {
11756 EVT VT
= Op
.getValueType();
11758 APInt
DefBits(VT
.getSizeInBits(), 0);
11759 APInt
UndefBits(VT
.getSizeInBits(), 0);
11760 BuildVectorSDNode
*BVN
= cast
<BuildVectorSDNode
>(Op
.getNode());
11761 if (resolveBuildVector(BVN
, DefBits
, UndefBits
)) {
11763 if ((NewOp
= tryAdvSIMDModImm64(AArch64ISD::MOVIedit
, Op
, DAG
, DefBits
)) ||
11764 (NewOp
= tryAdvSIMDModImm32(AArch64ISD::MOVIshift
, Op
, DAG
, DefBits
)) ||
11765 (NewOp
= tryAdvSIMDModImm321s(AArch64ISD::MOVImsl
, Op
, DAG
, DefBits
)) ||
11766 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::MOVIshift
, Op
, DAG
, DefBits
)) ||
11767 (NewOp
= tryAdvSIMDModImm8(AArch64ISD::MOVI
, Op
, DAG
, DefBits
)) ||
11768 (NewOp
= tryAdvSIMDModImmFP(AArch64ISD::FMOV
, Op
, DAG
, DefBits
)))
11771 DefBits
= ~DefBits
;
11772 if ((NewOp
= tryAdvSIMDModImm32(AArch64ISD::MVNIshift
, Op
, DAG
, DefBits
)) ||
11773 (NewOp
= tryAdvSIMDModImm321s(AArch64ISD::MVNImsl
, Op
, DAG
, DefBits
)) ||
11774 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::MVNIshift
, Op
, DAG
, DefBits
)))
11777 DefBits
= UndefBits
;
11778 if ((NewOp
= tryAdvSIMDModImm64(AArch64ISD::MOVIedit
, Op
, DAG
, DefBits
)) ||
11779 (NewOp
= tryAdvSIMDModImm32(AArch64ISD::MOVIshift
, Op
, DAG
, DefBits
)) ||
11780 (NewOp
= tryAdvSIMDModImm321s(AArch64ISD::MOVImsl
, Op
, DAG
, DefBits
)) ||
11781 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::MOVIshift
, Op
, DAG
, DefBits
)) ||
11782 (NewOp
= tryAdvSIMDModImm8(AArch64ISD::MOVI
, Op
, DAG
, DefBits
)) ||
11783 (NewOp
= tryAdvSIMDModImmFP(AArch64ISD::FMOV
, Op
, DAG
, DefBits
)))
11786 DefBits
= ~UndefBits
;
11787 if ((NewOp
= tryAdvSIMDModImm32(AArch64ISD::MVNIshift
, Op
, DAG
, DefBits
)) ||
11788 (NewOp
= tryAdvSIMDModImm321s(AArch64ISD::MVNImsl
, Op
, DAG
, DefBits
)) ||
11789 (NewOp
= tryAdvSIMDModImm16(AArch64ISD::MVNIshift
, Op
, DAG
, DefBits
)))
11796 SDValue
AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op
,
11797 SelectionDAG
&DAG
) const {
11798 EVT VT
= Op
.getValueType();
11800 if (useSVEForFixedLengthVectorVT(VT
,
11801 Subtarget
->forceStreamingCompatibleSVE())) {
11802 if (auto SeqInfo
= cast
<BuildVectorSDNode
>(Op
)->isConstantSequence()) {
11804 EVT ContainerVT
= getContainerForFixedLengthVector(DAG
, VT
);
11805 SDValue Start
= DAG
.getConstant(SeqInfo
->first
, DL
, ContainerVT
);
11806 SDValue Steps
= DAG
.getStepVector(DL
, ContainerVT
, SeqInfo
->second
);
11807 SDValue Seq
= DAG
.getNode(ISD::ADD
, DL
, ContainerVT
, Start
, Steps
);
11808 return convertFromScalableVector(DAG
, Op
.getValueType(), Seq
);
11811 // Revert to common legalisation for all other variants.
11815 // Try to build a simple constant vector.
11816 Op
= NormalizeBuildVector(Op
, DAG
);
11817 if (VT
.isInteger()) {
11818 // Certain vector constants, used to express things like logical NOT and
11819 // arithmetic NEG, are passed through unmodified. This allows special
11820 // patterns for these operations to match, which will lower these constants
11821 // to whatever is proven necessary.
11822 BuildVectorSDNode
*BVN
= cast
<BuildVectorSDNode
>(Op
.getNode());
11823 if (BVN
->isConstant())
11824 if (ConstantSDNode
*Const
= BVN
->getConstantSplatNode()) {
11825 unsigned BitSize
= VT
.getVectorElementType().getSizeInBits();
11827 Const
->getAPIntValue().zextOrTrunc(BitSize
).getZExtValue());
11828 if (Val
.isZero() || Val
.isAllOnes())
11833 if (SDValue V
= ConstantBuildVector(Op
, DAG
))
11836 // Scan through the operands to find some interesting properties we can
11838 // 1) If only one value is used, we can use a DUP, or
11839 // 2) if only the low element is not undef, we can just insert that, or
11840 // 3) if only one constant value is used (w/ some non-constant lanes),
11841 // we can splat the constant value into the whole vector then fill
11842 // in the non-constant lanes.
11843 // 4) FIXME: If different constant values are used, but we can intelligently
11844 // select the values we'll be overwriting for the non-constant
11845 // lanes such that we can directly materialize the vector
11846 // some other way (MOVI, e.g.), we can be sneaky.
11847 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
11849 unsigned NumElts
= VT
.getVectorNumElements();
11850 bool isOnlyLowElement
= true;
11851 bool usesOnlyOneValue
= true;
11852 bool usesOnlyOneConstantValue
= true;
11853 bool isConstant
= true;
11854 bool AllLanesExtractElt
= true;
11855 unsigned NumConstantLanes
= 0;
11856 unsigned NumDifferentLanes
= 0;
11857 unsigned NumUndefLanes
= 0;
11859 SDValue ConstantValue
;
11860 for (unsigned i
= 0; i
< NumElts
; ++i
) {
11861 SDValue V
= Op
.getOperand(i
);
11862 if (V
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
11863 AllLanesExtractElt
= false;
11869 isOnlyLowElement
= false;
11870 if (!isIntOrFPConstant(V
))
11871 isConstant
= false;
11873 if (isIntOrFPConstant(V
)) {
11874 ++NumConstantLanes
;
11875 if (!ConstantValue
.getNode())
11877 else if (ConstantValue
!= V
)
11878 usesOnlyOneConstantValue
= false;
11881 if (!Value
.getNode())
11883 else if (V
!= Value
) {
11884 usesOnlyOneValue
= false;
11885 ++NumDifferentLanes
;
11889 if (!Value
.getNode()) {
11891 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
11892 return DAG
.getUNDEF(VT
);
11895 // Convert BUILD_VECTOR where all elements but the lowest are undef into
11896 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
11897 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
11898 if (isOnlyLowElement
&& !(NumElts
== 1 && isIntOrFPConstant(Value
))) {
11899 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
11900 "SCALAR_TO_VECTOR node\n");
11901 return DAG
.getNode(ISD::SCALAR_TO_VECTOR
, dl
, VT
, Value
);
11904 if (AllLanesExtractElt
) {
11905 SDNode
*Vector
= nullptr;
11908 // Check whether the extract elements match the Even pattern <0,2,4,...> or
11909 // the Odd pattern <1,3,5,...>.
11910 for (unsigned i
= 0; i
< NumElts
; ++i
) {
11911 SDValue V
= Op
.getOperand(i
);
11912 const SDNode
*N
= V
.getNode();
11913 if (!isa
<ConstantSDNode
>(N
->getOperand(1)))
11915 SDValue N0
= N
->getOperand(0);
11917 // All elements are extracted from the same vector.
11919 Vector
= N0
.getNode();
11920 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
11922 if (VT
.getVectorElementType() !=
11923 N0
.getValueType().getVectorElementType())
11925 } else if (Vector
!= N0
.getNode()) {
11931 // Extracted values are either at Even indices <0,2,4,...> or at Odd
11932 // indices <1,3,5,...>.
11933 uint64_t Val
= N
->getConstantOperandVal(1);
11934 if (Val
== 2 * i
) {
11938 if (Val
- 1 == 2 * i
) {
11943 // Something does not match: abort.
11950 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, VT
, SDValue(Vector
, 0),
11951 DAG
.getConstant(0, dl
, MVT::i64
));
11953 DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, dl
, VT
, SDValue(Vector
, 0),
11954 DAG
.getConstant(NumElts
, dl
, MVT::i64
));
11957 return DAG
.getNode(AArch64ISD::UZP1
, dl
, DAG
.getVTList(VT
, VT
), LHS
,
11960 return DAG
.getNode(AArch64ISD::UZP2
, dl
, DAG
.getVTList(VT
, VT
), LHS
,
11965 // Use DUP for non-constant splats. For f32 constant splats, reduce to
11966 // i32 and try again.
11967 if (usesOnlyOneValue
) {
11969 if (Value
.getOpcode() != ISD::EXTRACT_VECTOR_ELT
||
11970 Value
.getValueType() != VT
) {
11972 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
11973 return DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, Value
);
11976 // This is actually a DUPLANExx operation, which keeps everything vectory.
11978 SDValue Lane
= Value
.getOperand(1);
11979 Value
= Value
.getOperand(0);
11980 if (Value
.getValueSizeInBits() == 64) {
11982 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
11984 Value
= WidenVector(Value
, DAG
);
11987 unsigned Opcode
= getDUPLANEOp(VT
.getVectorElementType());
11988 return DAG
.getNode(Opcode
, dl
, VT
, Value
, Lane
);
11991 if (VT
.getVectorElementType().isFloatingPoint()) {
11992 SmallVector
<SDValue
, 8> Ops
;
11993 EVT EltTy
= VT
.getVectorElementType();
11994 assert ((EltTy
== MVT::f16
|| EltTy
== MVT::bf16
|| EltTy
== MVT::f32
||
11995 EltTy
== MVT::f64
) && "Unsupported floating-point vector type");
11997 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
11998 "BITCASTS, and try again\n");
11999 MVT NewType
= MVT::getIntegerVT(EltTy
.getSizeInBits());
12000 for (unsigned i
= 0; i
< NumElts
; ++i
)
12001 Ops
.push_back(DAG
.getNode(ISD::BITCAST
, dl
, NewType
, Op
.getOperand(i
)));
12002 EVT VecVT
= EVT::getVectorVT(*DAG
.getContext(), NewType
, NumElts
);
12003 SDValue Val
= DAG
.getBuildVector(VecVT
, dl
, Ops
);
12004 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
12006 Val
= LowerBUILD_VECTOR(Val
, DAG
);
12008 return DAG
.getNode(ISD::BITCAST
, dl
, VT
, Val
);
12012 // If we need to insert a small number of different non-constant elements and
12013 // the vector width is sufficiently large, prefer using DUP with the common
12014 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
12015 // skip the constant lane handling below.
12016 bool PreferDUPAndInsert
=
12017 !isConstant
&& NumDifferentLanes
>= 1 &&
12018 NumDifferentLanes
< ((NumElts
- NumUndefLanes
) / 2) &&
12019 NumDifferentLanes
>= NumConstantLanes
;
12021 // If there was only one constant value used and for more than one lane,
12022 // start by splatting that value, then replace the non-constant lanes. This
12023 // is better than the default, which will perform a separate initialization
12025 if (!PreferDUPAndInsert
&& NumConstantLanes
> 0 && usesOnlyOneConstantValue
) {
12026 // Firstly, try to materialize the splat constant.
12027 SDValue Vec
= DAG
.getSplatBuildVector(VT
, dl
, ConstantValue
),
12028 Val
= ConstantBuildVector(Vec
, DAG
);
12030 // Otherwise, materialize the constant and splat it.
12031 Val
= DAG
.getNode(AArch64ISD::DUP
, dl
, VT
, ConstantValue
);
12032 DAG
.ReplaceAllUsesWith(Vec
.getNode(), &Val
);
12035 // Now insert the non-constant lanes.
12036 for (unsigned i
= 0; i
< NumElts
; ++i
) {
12037 SDValue V
= Op
.getOperand(i
);
12038 SDValue LaneIdx
= DAG
.getConstant(i
, dl
, MVT::i64
);
12039 if (!isIntOrFPConstant(V
))
12040 // Note that type legalization likely mucked about with the VT of the
12041 // source operand, so we may have to convert it here before inserting.
12042 Val
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VT
, Val
, V
, LaneIdx
);
12047 // This will generate a load from the constant pool.
12050 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
12055 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12056 // v4i32s. This is really a truncate, which we can construct out of (legal)
12057 // concats and truncate nodes.
12058 if (SDValue M
= ReconstructTruncateFromBuildVector(Op
, DAG
))
12061 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
12062 if (NumElts
>= 4) {
12063 if (SDValue shuffle
= ReconstructShuffle(Op
, DAG
))
12067 if (PreferDUPAndInsert
) {
12068 // First, build a constant vector with the common element.
12069 SmallVector
<SDValue
, 8> Ops(NumElts
, Value
);
12070 SDValue NewVector
= LowerBUILD_VECTOR(DAG
.getBuildVector(VT
, dl
, Ops
), DAG
);
12071 // Next, insert the elements that do not match the common value.
12072 for (unsigned I
= 0; I
< NumElts
; ++I
)
12073 if (Op
.getOperand(I
) != Value
)
12075 DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VT
, NewVector
,
12076 Op
.getOperand(I
), DAG
.getConstant(I
, dl
, MVT::i64
));
12081 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
12082 // know the default expansion would otherwise fall back on something even
12083 // worse. For a vector with one or two non-undef values, that's
12084 // scalar_to_vector for the elements followed by a shuffle (provided the
12085 // shuffle is valid for the target) and materialization element by element
12086 // on the stack followed by a load for everything else.
12087 if (!isConstant
&& !usesOnlyOneValue
) {
12089 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
12090 "of INSERT_VECTOR_ELT\n");
12092 SDValue Vec
= DAG
.getUNDEF(VT
);
12093 SDValue Op0
= Op
.getOperand(0);
12096 // Use SCALAR_TO_VECTOR for lane zero to
12097 // a) Avoid a RMW dependency on the full vector register, and
12098 // b) Allow the register coalescer to fold away the copy if the
12099 // value is already in an S or D register, and we're forced to emit an
12100 // INSERT_SUBREG that we can't fold anywhere.
12102 // We also allow types like i8 and i16 which are illegal scalar but legal
12103 // vector element types. After type-legalization the inserted value is
12104 // extended (i32) and it is safe to cast them to the vector type by ignoring
12105 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
12106 if (!Op0
.isUndef()) {
12107 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
12108 Vec
= DAG
.getNode(ISD::SCALAR_TO_VECTOR
, dl
, VT
, Op0
);
12111 LLVM_DEBUG(if (i
< NumElts
) dbgs()
12112 << "Creating nodes for the other vector elements:\n";);
12113 for (; i
< NumElts
; ++i
) {
12114 SDValue V
= Op
.getOperand(i
);
12117 SDValue LaneIdx
= DAG
.getConstant(i
, dl
, MVT::i64
);
12118 Vec
= DAG
.getNode(ISD::INSERT_VECTOR_ELT
, dl
, VT
, Vec
, V
, LaneIdx
);
12124 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
12125 "better alternative\n");
12129 SDValue
AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op
,
12130 SelectionDAG
&DAG
) const {
12131 if (useSVEForFixedLengthVectorVT(Op
.getValueType()))
12132 return LowerFixedLengthConcatVectorsToSVE(Op
, DAG
);
12134 assert(Op
.getValueType().isScalableVector() &&
12135 isTypeLegal(Op
.getValueType()) &&
12136 "Expected legal scalable vector type!");
12138 if (isTypeLegal(Op
.getOperand(0).getValueType())) {
12139 unsigned NumOperands
= Op
->getNumOperands();
12140 assert(NumOperands
> 1 && isPowerOf2_32(NumOperands
) &&
12141 "Unexpected number of operands in CONCAT_VECTORS");
12143 if (NumOperands
== 2)
12146 // Concat each pair of subvectors and pack into the lower half of the array.
12147 SmallVector
<SDValue
> ConcatOps(Op
->op_begin(), Op
->op_end());
12148 while (ConcatOps
.size() > 1) {
12149 for (unsigned I
= 0, E
= ConcatOps
.size(); I
!= E
; I
+= 2) {
12150 SDValue V1
= ConcatOps
[I
];
12151 SDValue V2
= ConcatOps
[I
+ 1];
12152 EVT SubVT
= V1
.getValueType();
12153 EVT PairVT
= SubVT
.getDoubleNumVectorElementsVT(*DAG
.getContext());
12155 DAG
.getNode(ISD::CONCAT_VECTORS
, SDLoc(Op
), PairVT
, V1
, V2
);
12157 ConcatOps
.resize(ConcatOps
.size() / 2);
12159 return ConcatOps
[0];
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
    return LowerFixedLengthInsertVectorElt(Op, DAG);

  // Check for non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue ExtendedVector =
        DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
    SDValue ExtendedValue =
        DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
                             VectorVT.getScalarType().getSizeInBits() < 32
                                 ? MVT::i32
                                 : VectorVT.getScalarType());
    ExtendedVector =
        DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
                    ExtendedValue, Op.getOperand(2));
    return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
  }

  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16 || VT == MVT::v8bf16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
      VT != MVT::v4bf16)
    return SDValue();

  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and perform the insertion on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
                             Op.getOperand(1), Op.getOperand(2));
  // Re-narrow the resultant vector.
  return NarrowVector(Node, DAG);
}
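/// Lower EXTRACT_VECTOR_ELT, extending i1 predicate vectors and widening V64
/// vectors to V128 before performing the extraction.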
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
  EVT VT = Op.getOperand(0).getValueType();

  if (VT.getScalarType() == MVT::i1) {
    // We can't directly extract from an SVE predicate; extend it first.
    // (This isn't the only possible lowering, but it's straightforward.)
    EVT VectorVT = getPromotedVTForPredicate(VT);
    SDLoc DL(Op);
    SDValue Extend =
        DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
    MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
                                  Extend, Op.getOperand(1));
    return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
  }

  if (useSVEForFixedLengthVectorVT(VT))
    return LowerFixedLengthExtractVectorElt(Op, DAG);

  // Check for non-constant or out of range lane.
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16 || VT == MVT::v8bf16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
      VT != MVT::v4bf16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and perform the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}
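/// Lower EXTRACT_SUBVECTOR of a fixed length result, either leaving it to be
/// matched directly in ISel or rotating an SVE container with VECTOR_SPLICE.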
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getValueType().isFixedLengthVector() &&
         "Only cases that extract a fixed length vector are supported!");

  EVT InVT = Op.getOperand(0).getValueType();
  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Size = Op.getValueSizeInBits();

  // If we don't have legal types yet, do nothing
  if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  if (InVT.isScalableVector()) {
    // This will be matched by custom code during ISelDAGToDAG.
    if (Idx == 0 && isPackedVectorType(InVT, DAG))
      return Op;

    return SDValue();
  }

  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
  if (Idx == 0 && InVT.getSizeInBits() <= 128)
    return Op;

  // If this is extracting the upper 64-bits of a 128-bit vector, we match
  // that directly.
  if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
      InVT.getSizeInBits() == 128)
    return Op;

  if (useSVEForFixedLengthVectorVT(InVT)) {
    SDLoc DL(Op);

    EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
    SDValue NewInVec =
        convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));

    SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
                                 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
    return convertFromScalableVector(DAG, Op.getValueType(), Splice);
  }

  return SDValue();
}
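/// Lower INSERT_SUBVECTOR into scalable vectors, splitting predicate vectors
/// and using UUNPK/UZP1 sequences to replace the low or high half.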
SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  assert(Op.getValueType().isScalableVector() &&
         "Only expect to lower inserts into scalable vectors!");

  EVT InVT = Op.getOperand(1).getValueType();
  unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();

  SDValue Vec0 = Op.getOperand(0);
  SDValue Vec1 = Op.getOperand(1);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (InVT.isScalableVector()) {
    if (!isTypeLegal(VT))
      return SDValue();

    // Break down insert_subvector into simpler parts.
    if (VT.getVectorElementType() == MVT::i1) {
      unsigned NumElts = VT.getVectorMinNumElements();
      EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());

      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
                       DAG.getVectorIdxConstant(0, DL));
      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
                       DAG.getVectorIdxConstant(NumElts / 2, DL));
      if (Idx < (NumElts / 2)) {
        SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
                                    DAG.getVectorIdxConstant(Idx, DL));
        return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
      }

      SDValue NewHi =
          DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
                      DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
      return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
    }

    // Ensure the subvector is half the size of the main vector.
    if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
      return SDValue();

    // Here narrow and wide refers to the vector element types. After "casting"
    // both vectors must have the same bit length and so because the subvector
    // has fewer elements, those elements need to be bigger.
    EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
    EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());

    // NOP cast operands to the largest legal vector of the same element count.
    if (VT.isFloatingPoint()) {
      Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
      Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
    } else {
      // Legal integer vectors are already their largest so Vec0 is fine as is.
      Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
    }

    // To replace the top/bottom half of vector V with vector SubV we widen the
    // preserved half of V, concatenate this to SubV (the order depending on the
    // half being replaced) and then narrow the result.
    SDValue Narrow;
    if (Idx == 0) {
      SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
    } else {
      assert(Idx == InVT.getVectorMinNumElements() &&
             "Invalid subvector index!");
      SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
      Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
    }

    return getSVESafeBitCast(VT, Narrow, DAG);
  }

  if (Idx == 0 && isPackedVectorType(VT, DAG)) {
    // This will be matched by custom code during ISelDAGToDAG.
    if (Vec0.isUndef())
      return Op;

    Optional<unsigned> PredPattern =
        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
    auto PredTy = VT.changeVectorElementType(MVT::i1);
    SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
    SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
    return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
  }

  return SDValue();
}
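/// Return true if Op is a splat of a (possibly negated) power-of-two
/// constant, reporting the absolute value in SplatVal and the sign in Negated.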
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
  if (Op.getOpcode() != AArch64ISD::DUP &&
      Op.getOpcode() != ISD::SPLAT_VECTOR &&
      Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;

  if (Op.getOpcode() == ISD::BUILD_VECTOR &&
      !isAllConstantBuildVector(Op, SplatVal))
    return false;

  if (Op.getOpcode() != ISD::BUILD_VECTOR &&
      !isa<ConstantSDNode>(Op->getOperand(0)))
    return false;

  SplatVal = Op->getConstantOperandVal(0);
  if (Op.getValueType().getVectorElementType() != MVT::i64)
    SplatVal = (int32_t)SplatVal;

  Negated = false;
  if (isPowerOf2_64(SplatVal))
    return true;

  Negated = true;
  if (isPowerOf2_64(-SplatVal)) {
    SplatVal = -SplatVal;
    return true;
  }

  return false;
}
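/// Lower vector SDIV/UDIV for SVE, using predicated DIV nodes for 32/64-bit
/// elements and widening i8/i16 element divides to 32 bits.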
SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
    return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
                    DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);

    return Res;
  }

  if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
  // operations, and truncate the result.
  EVT WidenedVT;
  if (VT == MVT::nxv16i8)
    WidenedVT = MVT::nxv8i16;
  else if (VT == MVT::nxv8i16)
    WidenedVT = MVT::nxv4i32;
  else
    llvm_unreachable("Unexpected Custom DIV operation");

  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
  return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
  // Currently no fixed length shuffles that require SVE are legal.
  if (useSVEForFixedLengthVectorVT(VT))
    return false;

  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned Cost = getPerfectShuffleCost(M);
    if (Cost <= 1)
      return true;
  }

  bool DummyBool;
  int DummyInt;
  unsigned DummyUnsigned;

  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
          isZIPMask(M, VT, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
          isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
                                                   EVT VT) const {
  // Just delegate to the generic legality, clear masks aren't special.
  return isShuffleMaskLegal(M, VT);
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
///   1 <= Value <= ElementBits for a right shift; or
///   1 <= Value <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  int64_t ElementBits = VT.getScalarSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
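/// Lower TRUNCATE: i1 results become an AND/SETNE against 1, and fixed length
/// vector truncates are redirected to the SVE lowering.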
SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT.getScalarType() == MVT::i1) {
    // Lower i1 truncate to `(x & 1) != 0`.
    SDLoc dl(Op);
    EVT OpVT = Op.getOperand(0).getValueType();
    SDValue Zero = DAG.getConstant(0, dl, OpVT);
    SDValue One = DAG.getConstant(1, dl, OpVT);
    SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
    return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
  }

  if (!VT.isVector() || VT.isScalableVector())
    return SDValue();

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
    return LowerFixedLengthVectorTruncateToSVE(Op, DAG);

  return SDValue();
}
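/// Lower vector shifts, preferring the immediate-form VSHL/VASHR/VLSHR nodes
/// and falling back to the (u|s)shl intrinsics with a negated shift amount.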
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getScalarSizeInBits();

  switch (Op.getOpcode()) {
  case ISD::SHL:
    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
      return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);

    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
                                       MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
      unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
                                                : AArch64ISD::SRL_PRED;
      return LowerToPredicatedOp(Op, DAG, Opc);
    }

    // Right shift immediate
    if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, DL, MVT::i32));
    }

    // Right shift register.  Note, there is not a shift right register
    // instruction, but the shift left register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // negate the shift amount
    SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
                    NegShift);
    return NegShiftLeft;
  }

  llvm_unreachable("unexpected shift opcode");
}
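/// Emit the AArch64 vector comparison node (FCM*/CM*) that implements the
/// given condition code, using the compare-against-zero forms when RHS is a
/// constant zero vector.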
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    const SDLoc &dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNOT(dl, Fcmeq, VT);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LE:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the LS implementation.
      [[fallthrough]];
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      [[fallthrough]];
    case AArch64CC::MI:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
    }
  }

  switch (CC) {
  default:
    return SDValue();
  case AArch64CC::NE: {
    SDValue Cmeq;
    if (IsZero)
      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    else
      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
    return DAG.getNOT(dl, Cmeq, VT);
  }
  case AArch64CC::EQ:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  case AArch64CC::GE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
  case AArch64CC::GT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
  case AArch64CC::LE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  case AArch64CC::LS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
  case AArch64CC::LO:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
  case AArch64CC::LT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
  case AArch64CC::HI:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
  case AArch64CC::HS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
  }
}
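/// Lower vector SETCC, emitting one or two native comparisons and inverting
/// the result when the FP condition code requires it.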
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isScalableVector())
    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);

  if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
    return LowerFixedLengthVectorSetccToSVE(Op, DAG);

  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc dl(Op);

  if (LHS.getValueType().getVectorElementType().isInteger()) {
    assert(LHS.getValueType() == RHS.getValueType());
    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
    SDValue Cmp =
        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  }

  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();

  // Make v4f16 (only) fcmp operations utilise vector instructions
  // v8f16 support will be a little more complicated
  if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
    if (LHS.getValueType().getVectorNumElements() == 4) {
      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
      SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
      DAG.ReplaceAllUsesWith(Op, NewSetcc);
      CmpVT = MVT::v4i32;
    } else
      return SDValue();
  }

  assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
         LHS.getValueType().getVectorElementType() != MVT::f128);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean.  Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs =
      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
  SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
  if (!Cmp.getNode())
    return SDValue();

  if (CC2 != AArch64CC::AL) {
    SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

  if (ShouldInvert)
    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());

  return Cmp;
}
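/// Build an AArch64 across-vector reduction node and extract lane 0 as the
/// scalar result.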
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                  SelectionDAG &DAG) {
  SDValue VecOp = ScalarOp.getOperand(0);
  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
                     DAG.getConstant(0, DL, MVT::i64));
}
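/// Lower VECREDUCE_* either to SVE predicated reductions or to the NEON
/// across-vector reduction nodes/intrinsics.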
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);

  // Try to lower fixed length reductions to SVE.
  EVT SrcVT = Src.getValueType();
  bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
                      Op.getOpcode() == ISD::VECREDUCE_OR ||
                      Op.getOpcode() == ISD::VECREDUCE_XOR ||
                      Op.getOpcode() == ISD::VECREDUCE_FADD ||
                      (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                       SrcVT.getVectorElementType() == MVT::i64);
  if (SrcVT.isScalableVector() ||
      useSVEForFixedLengthVectorVT(
          SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {

    if (SrcVT.getVectorElementType() == MVT::i1)
      return LowerPredReductionToSVE(Op, DAG);

    switch (Op.getOpcode()) {
    case ISD::VECREDUCE_ADD:
      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
    case ISD::VECREDUCE_AND:
      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
    case ISD::VECREDUCE_OR:
      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
    case ISD::VECREDUCE_SMAX:
      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
    case ISD::VECREDUCE_SMIN:
      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
    case ISD::VECREDUCE_UMAX:
      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
    case ISD::VECREDUCE_UMIN:
      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
    case ISD::VECREDUCE_XOR:
      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
    case ISD::VECREDUCE_FADD:
      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMAX:
      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMIN:
      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
    default:
      llvm_unreachable("Unhandled fixed length reduction");
    }
  }

  // Lower NEON reductions.
  SDLoc dl(Op);
  switch (Op.getOpcode()) {
  case ISD::VECREDUCE_ADD:
    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
  case ISD::VECREDUCE_SMAX:
    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
  case ISD::VECREDUCE_SMIN:
    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
  case ISD::VECREDUCE_UMAX:
    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
  case ISD::VECREDUCE_UMIN:
    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
  case ISD::VECREDUCE_FMAX: {
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
        DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
        Src);
  }
  case ISD::VECREDUCE_FMIN: {
    return DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
        DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
        Src);
  }
  default:
    llvm_unreachable("Unhandled reduction");
  }
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
                                                    SelectionDAG &DAG) const {
  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
    return SDValue();

  // LSE has an atomic load-add instruction, but not a load-sub.
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue RHS = Op.getOperand(2);
  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
  RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
                       Op.getOperand(0), Op.getOperand(1), RHS,
                       AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                    SelectionDAG &DAG) const {
  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
    return SDValue();

  // LSE has an atomic load-clear instruction, but not a load-and.
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue RHS = Op.getOperand(2);
  AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
  RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
  return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
                       Op.getOperand(0), Op.getOperand(1), RHS,
                       AN->getMemOperand());
}
SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
                                               PtrVT, 0);

  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
  if (Subtarget->hasCustomCallingConv())
    TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);

  Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
                     DAG.getConstant(4, dl, MVT::i64));
  Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  // To match the actual intent better, we should read the output from X15 here
  // again (instead of potentially spilling it to the stack), but rereading Size
  // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
  // here.

  Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                     DAG.getConstant(4, dl, MVT::i64));
  return Chain;
}
SDValue
AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Subtarget->isTargetWindows() &&
         "Only Windows alloca probing supported");
  SDLoc dl(Op);
  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  MaybeAlign Align =
      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
  EVT VT = Node->getValueType(0);

  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
          "no-stack-arg-probe")) {
    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
    Chain = SP.getValue(1);
    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
    if (Align)
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
    SDValue Ops[2] = {SP, Chain};
    return DAG.getMergeValues(Ops, dl);
  }

  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);

  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
  Chain = SP.getValue(1);
  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
  if (Align)
    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);

  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

  SDValue Ops[2] = {SP, Chain};
  return DAG.getMergeValues(Ops, dl);
}
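/// Lower VSCALE of an illegal result type by materialising the scaled vscale
/// value as i64 and truncating it to the requested type.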
SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT != MVT::i64 && "Expected illegal VSCALE node");

  SDLoc DL(Op);
  APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
  return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
                            VT);
}
/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
template <unsigned NumVecs>
static bool
setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
  Info.opc = ISD::INTRINSIC_VOID;
  // Retrieve EC from first vector argument.
  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
  ElementCount EC = VT.getVectorElementCount();
#ifndef NDEBUG
  // Check the assumption that all input vectors are the same type.
  for (unsigned I = 0; I < NumVecs; ++I)
    assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
           "Invalid type.");
#endif
  // memVT is `NumVecs * VT`.
  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
                                EC * NumVecs);
  Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
  Info.offset = 0;
  Info.align.reset();
  Info.flags = MachineMemOperand::MOStore;
  return true;
}
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                               const CallInst &I,
                                               MachineFunction &MF,
                                               unsigned Intrinsic) const {
  auto &DL = I.getModule()->getDataLayout();
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_st2:
    return setInfoSVEStN<2>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st3:
    return setInfoSVEStN<3>(*this, DL, Info, I);
  case Intrinsic::aarch64_sve_st4:
    return setInfoSVEStN<4>(*this, DL, Info, I);
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4:
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile loads with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4:
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    unsigned NumElts = 0;
    for (const Value *Arg : I.args()) {
      Type *ArgTy = Arg->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
    Info.offset = 0;
    Info.align.reset();
    // volatile stores with NEON intrinsics not supported
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    Type *ValTy = I.getParamElementType(0);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    Type *ValTy = I.getParamElementType(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(ValTy);
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ValTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::aarch64_sve_ldnt1: {
    Type *ElTy = cast<VectorType>(I.getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
    return true;
  }
  case Intrinsic::aarch64_sve_stnt1: {
    Type *ElTy =
        cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(I.getOperand(0)->getType());
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = DL.getABITypeAlign(ElTy);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
    return true;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    Value *Dst = I.getArgOperand(0);
    Value *Val = I.getArgOperand(1);
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(Val->getType());
    Info.ptrVal = Dst;
    Info.offset = 0;
    Info.align = I.getParamAlign(0).valueOrOne();
    Info.flags = MachineMemOperand::MOStore;
    // The size of the memory being operated on is unknown at this point
    Info.size = MemoryLocation::UnknownSize;
    return true;
  }
  default:
    break;
  }

  return false;
}
bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                  ISD::LoadExtType ExtTy,
                                                  EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
    return false;

  // If we're reducing the load width in order to avoid having to use an extra
  // instruction to do extension then it's probably a good idea.
  if (ExtTy != ISD::NON_EXTLOAD)
    return true;
  // Don't reduce load width if it would prevent us from combining a shift into
  // the offset.
  MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
  assert(Mem);
  const SDValue &Base = Mem->getBasePtr();
  if (Base.getOpcode() == ISD::ADD &&
      Base.getOperand(1).getOpcode() == ISD::SHL &&
      Base.getOperand(1).hasOneUse() &&
      Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
    // It's unknown whether a scalable vector has a power-of-2 bitwidth.
    if (Mem->getMemoryVT().isScalableVector())
      return false;
    // The shift can be combined if it matches the size of the value being
    // loaded (and so reducing the width would make it not match).
    uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
    uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
    if (ShiftAmount == Log2_32(LoadBytes))
      return false;
  }
  // We have no reason to disallow reducing the load width, so allow it.
  return true;
}
// Truncations from 64-bit GPR to 32-bit GPR are free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
  uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
  return NumBits1 > NumBits2;
}

bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  uint64_t NumBits1 = VT1.getFixedSizeInBits();
  uint64_t NumBits2 = VT2.getFixedSizeInBits();
  return NumBits1 > NumBits2;
}
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and its user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
  if (I->getOpcode() != Instruction::FMul)
    return true;

  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();

  if (!(User->getOpcode() == Instruction::FSub ||
        User->getOpcode() == Instruction::FAdd))
    return true;

  const TargetOptions &Options = getTargetMachine().Options;
  const Function *F = I->getFunction();
  const DataLayout &DL = F->getParent()->getDataLayout();
  Type *Ty = User->getOperand(0)->getType();

  return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
           isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
            Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2)) {
    return true;
  }

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
  if (isa<FPExtInst>(Ext))
    return false;

  // Vector types are not free.
  if (Ext->getType()->isVectorTy())
    return false;

  for (const Use &U : Ext->uses()) {
    // The extension is free if we can fold it with a left shift in an
    // addressing mode or an arithmetic operation: add, sub, and cmp.

    // Is there a shift?
    const Instruction *Instr = cast<Instruction>(U.getUser());

    // Is this a constant shift?
    switch (Instr->getOpcode()) {
    case Instruction::Shl:
      if (!isa<ConstantInt>(Instr->getOperand(1)))
        return false;
      break;
    case Instruction::GetElementPtr: {
      gep_type_iterator GTI = gep_type_begin(Instr);
      auto &DL = Ext->getModule()->getDataLayout();
      std::advance(GTI, U.getOperandNo()-1);
      Type *IdxTy = GTI.getIndexedType();
      // This extension will end up with a shift because of the scaling factor.
      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
      // Get the shift amount based on the scaling factor:
      // log2(sizeof(IdxTy)) - log2(8).
      uint64_t ShiftAmt =
          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) -
          3;
      // Is the constant foldable in the shift of the addressing mode?
      // I.e., shift amount is between 1 and 4 inclusive.
      if (ShiftAmt == 0 || ShiftAmt > 4)
        return false;
      break;
    }
    case Instruction::Trunc:
      // Check if this is a noop.
      // trunc(sext ty1 to ty2) to ty1.
      if (Instr->getType() == Ext->getOperand(0)->getType())
        continue;
      [[fallthrough]];
    default:
      return false;
    }

    // At this point we can use the bfm family, so this extension is free
    // for them.
  }
  return true;
}
/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
  auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
    auto *FullTy = FullV->getType();
    auto *HalfTy = HalfV->getType();
    return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
           2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
  };

  auto extractHalf = [](Value *FullV, Value *HalfV) {
    auto *FullVT = cast<FixedVectorType>(FullV->getType());
    auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
    return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
  };

  ArrayRef<int> M1, M2;
  Value *S1Op1, *S2Op1;
  if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
      !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
    return false;

  // Check that the operands are half as wide as the result and we extract
  // half of the elements of the input vectors.
  if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
      !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
    return false;

  // Check the mask extracts either the lower or upper half of vector
  // elements.
  int M1Start = 0;
  int M2Start = 0;
  int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
  if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
      !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
      M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
    return false;

  return true;
}
/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
/// of the vector elements.
static bool areExtractExts(Value *Ext1, Value *Ext2) {
  auto areExtDoubled = [](Instruction *Ext) {
    return Ext->getType()->getScalarSizeInBits() ==
           2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
  };

  if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
      !match(Ext2, m_ZExtOrSExt(m_Value())) ||
      !areExtDoubled(cast<Instruction>(Ext1)) ||
      !areExtDoubled(cast<Instruction>(Ext2)))
    return false;

  return true;
}
/// Check if Op could be used with vmull_high_p64 intrinsic.
static bool isOperandOfVmullHighP64(Value *Op) {
  Value *VectorOperand = nullptr;
  ConstantInt *ElementIndex = nullptr;
  return match(Op, m_ExtractElt(m_Value(VectorOperand),
                                m_ConstantInt(ElementIndex))) &&
         ElementIndex->getValue() == 1 &&
         isa<FixedVectorType>(VectorOperand->getType()) &&
         cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
}

/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
  return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
}
static bool isSplatShuffle(Value *V) {
  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
    return all_equal(Shuf->getShuffleMask());
  return false;
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
bool AArch64TargetLowering::shouldSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::aarch64_neon_smull:
    case Intrinsic::aarch64_neon_umull:
      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
        Ops.push_back(&II->getOperandUse(0));
        Ops.push_back(&II->getOperandUse(1));
        return true;
      }
      [[fallthrough]];
    case Intrinsic::fma:
      if (isa<VectorType>(I->getType()) &&
          cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
          !Subtarget->hasFullFP16())
        return false;
      [[fallthrough]];
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_sqdmulh:
    case Intrinsic::aarch64_neon_sqrdmulh:
      // Sink splats for index lane variants
      if (isSplatShuffle(II->getOperand(0)))
        Ops.push_back(&II->getOperandUse(0));
      if (isSplatShuffle(II->getOperand(1)))
        Ops.push_back(&II->getOperandUse(1));
      return !Ops.empty();
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
          Ops.push_back(&II->getOperandUse(0));
      return !Ops.empty();
    case Intrinsic::aarch64_sme_write_horiz:
    case Intrinsic::aarch64_sme_write_vert:
    case Intrinsic::aarch64_sme_writeq_horiz:
    case Intrinsic::aarch64_sme_writeq_vert: {
      auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(1));
      return true;
    }
    case Intrinsic::aarch64_sme_read_horiz:
    case Intrinsic::aarch64_sme_read_vert:
    case Intrinsic::aarch64_sme_readq_horiz:
    case Intrinsic::aarch64_sme_readq_vert:
    case Intrinsic::aarch64_sme_ld1b_vert:
    case Intrinsic::aarch64_sme_ld1h_vert:
    case Intrinsic::aarch64_sme_ld1w_vert:
    case Intrinsic::aarch64_sme_ld1d_vert:
    case Intrinsic::aarch64_sme_ld1q_vert:
    case Intrinsic::aarch64_sme_st1b_vert:
    case Intrinsic::aarch64_sme_st1h_vert:
    case Intrinsic::aarch64_sme_st1w_vert:
    case Intrinsic::aarch64_sme_st1d_vert:
    case Intrinsic::aarch64_sme_st1q_vert:
    case Intrinsic::aarch64_sme_ld1b_horiz:
    case Intrinsic::aarch64_sme_ld1h_horiz:
    case Intrinsic::aarch64_sme_ld1w_horiz:
    case Intrinsic::aarch64_sme_ld1d_horiz:
    case Intrinsic::aarch64_sme_ld1q_horiz:
    case Intrinsic::aarch64_sme_st1b_horiz:
    case Intrinsic::aarch64_sme_st1h_horiz:
    case Intrinsic::aarch64_sme_st1w_horiz:
    case Intrinsic::aarch64_sme_st1d_horiz:
    case Intrinsic::aarch64_sme_st1q_horiz: {
      auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
      if (!Idx || Idx->getOpcode() != Instruction::Add)
        return false;
      Ops.push_back(&II->getOperandUse(3));
      return true;
    }
    case Intrinsic::aarch64_neon_pmull:
      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
        return false;
      Ops.push_back(&II->getOperandUse(0));
      Ops.push_back(&II->getOperandUse(1));
      return true;
    case Intrinsic::aarch64_neon_pmull64:
      if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
                                     II->getArgOperand(1)))
        return false;
      Ops.push_back(&II->getArgOperandUse(0));
      Ops.push_back(&II->getArgOperandUse(1));
      return true;
    default:
      return false;
    }
  }

  if (!I->getType()->isVectorTy())
    return false;

  switch (I->getOpcode()) {
  case Instruction::Sub:
  case Instruction::Add: {
    if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
      return false;

    // If the exts' operands extract either the lower or upper elements, we
    // can sink them too.
    auto Ext1 = cast<Instruction>(I->getOperand(0));
    auto Ext2 = cast<Instruction>(I->getOperand(1));
    if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
      Ops.push_back(&Ext1->getOperandUse(0));
      Ops.push_back(&Ext2->getOperandUse(0));
    }

    Ops.push_back(&I->getOperandUse(0));
    Ops.push_back(&I->getOperandUse(1));

    return true;
  }
  case Instruction::Mul: {
    bool IsProfitable = false;
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand
      if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
        continue;

      ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);

      // If the Shuffle is a splat and the operand is a zext/sext, sinking the
      // operand and the s/zext can help create indexed s/umull. This is
      // especially useful to prevent i64 mul being scalarized.
      if (Shuffle && isSplatShuffle(Shuffle) &&
          match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
        Ops.push_back(&Shuffle->getOperandUse(0));
        Ops.push_back(&Op);
        IsProfitable = true;
        continue;
      }

      if (!Shuffle || !Shuffle->isZeroEltSplat())
        continue;

      Value *ShuffleOperand = Shuffle->getOperand(0);
      InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
      if (!Insert)
        continue;

      Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
      if (!OperandInstr)
        continue;

      ConstantInt *ElementConstant =
          dyn_cast<ConstantInt>(Insert->getOperand(2));
      // Check that the insertelement is inserting into element 0
      if (!ElementConstant || ElementConstant->getZExtValue() != 0)
        continue;

      unsigned Opcode = OperandInstr->getOpcode();
      if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
        continue;

      Ops.push_back(&Shuffle->getOperandUse(0));
      Ops.push_back(&Op);
      IsProfitable = true;
    }

    return IsProfitable;
  }
  default:
    return false;
  }
}
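/// Replace a vector zext from i8 to i32 with a shuffle that can later be
/// selected as a set of tbl instructions.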
static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
  Value *Op = ZExt->getOperand(0);
  auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType());
  auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType());
  unsigned NumElts = SrcTy->getNumElements();
  IRBuilder<> Builder(ZExt);
  SmallVector<int> Mask(4 * NumElts, NumElts);
  // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
  // replace the original ZExt. This can later be lowered to a set of tbl
  // instructions.
  for (unsigned i = 0; i < NumElts; i++) {
    if (IsLittleEndian)
      Mask[i * 4] = i;
    else
      Mask[i * 4 + 3] = i;
  }

  auto *FirstEltZero = Builder.CreateInsertElement(
      PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
  Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
  Result = Builder.CreateBitCast(Result, DstTy);
  ZExt->replaceAllUsesWith(Result);
  ZExt->eraseFromParent();
}
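/// Replace a vector truncate from i32 to i8 with a call to the NEON tbl2/tbl4
/// intrinsic that selects the low byte of each lane.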
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
  IRBuilder<> Builder(TI);
  SmallVector<Value *> Parts;
  Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
  Parts.push_back(Builder.CreateBitCast(
      Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
  Parts.push_back(Builder.CreateBitCast(
      Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));

  Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
  unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
  if (NumElements == 16) {
    Parts.push_back(Builder.CreateBitCast(
        Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
    Parts.push_back(Builder.CreateBitCast(
        Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
        VecTy));
    TblID = Intrinsic::aarch64_neon_tbl4;
  }
  SmallVector<Constant *, 16> MaskConst;
  for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
    MaskConst.push_back(
        ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));

  for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
    MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));

  Parts.push_back(ConstantVector::get(MaskConst));
  auto *F =
      Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
  Value *Res = Builder.CreateCall(F, Parts);

  if (NumElements == 8)
    Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
  TI->replaceAllUsesWith(Res);
  TI->eraseFromParent();
}
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
                                                               Loop *L) const {
  // Try to optimize conversions using tbl. This requires materializing constant
  // index vectors, which can increase code size and add loads. Skip the
  // transform unless the conversion is in a loop block guaranteed to execute
  // and we are not optimizing for size.
  Function *F = I->getParent()->getParent();
  if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
      F->hasOptSize())
    return false;

  auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
  auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
  if (!SrcTy || !DstTy)
    return false;

  // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be
  // lowered to either 2 or 4 tbl instructions to insert the original i8
  // elements into i32 lanes.
  auto *ZExt = dyn_cast<ZExtInst>(I);
  if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
      SrcTy->getElementType()->isIntegerTy(8) &&
      DstTy->getElementType()->isIntegerTy(32)) {
    createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
    return true;
  }

  auto *UIToFP = dyn_cast<UIToFPInst>(I);
  if (UIToFP &&
      (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
      SrcTy->getElementType()->isIntegerTy(8) &&
      DstTy->getElementType()->isFloatTy()) {
    IRBuilder<> Builder(I);
    auto *ZExt = cast<ZExtInst>(
        Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
    auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
    I->replaceAllUsesWith(UI);
    I->eraseFromParent();
    createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
    return true;
  }

  // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
  // followed by a truncate lowered to using tbl.4.
  auto *FPToUI = dyn_cast<FPToUIInst>(I);
  if (FPToUI &&
      (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
      SrcTy->getElementType()->isFloatTy() &&
      DstTy->getElementType()->isIntegerTy(8)) {
    IRBuilder<> Builder(I);
    auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
                                          VectorType::getInteger(SrcTy));
    auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
    I->replaceAllUsesWith(TruncI);
    I->eraseFromParent();
    createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
    return true;
  }

  // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
  // instruction selecting the lowest 8 bits per lane of the input interpreted
  // as 2 or 4 <4 x i32> vectors.
  auto *TI = dyn_cast<TruncInst>(I);
  if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
      SrcTy->getElementType()->isIntegerTy(32) &&
      DstTy->getElementType()->isIntegerTy(8)) {
    createTblForTrunc(TI, Subtarget->isLittleEndian());
    return true;
  }

  return false;
}
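// Illustrative example (assumed, not from the original source): inside a
// qualifying loop header an IR conversion such as
//   %w = zext <16 x i8> %v to <16 x i32>
// is rewritten by createTblShuffleForZExt into a byte shuffle that the backend
// can select as a small number of TBL instructions rather than a chain of
// lane-widening extends; the uitofp, fptoui and trunc cases matched above are
// reduced to the same two helpers after inserting the extra zext/fptoui/trunc.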
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          Align &RequiredAligment) const {
  if (!LoadedType.isSimple() ||
      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = Align(1);
  unsigned NumBits = LoadedType.getSizeInBits();
  return NumBits == 32 || NumBits == 64;
}
/// A helper function for determining the number of interleaved accesses we
/// will generate when lowering accesses of the given type.
unsigned AArch64TargetLowering::getNumInterleavedAccesses(
    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
  unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
  return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
}
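// Worked example (illustrative): with NEON (UseScalable == false) the access
// unit is 128 bits, so a <16 x i32> vector of 512 bits needs
// (512 + 127) / 128 = 4 interleaved accesses, while a 64-bit <8 x i8> vector
// still rounds up to a single access.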
MachineMemOperand::Flags
AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
    return MOStridedAccess;
  return MachineMemOperand::MONone;
}
bool AArch64TargetLowering::isLegalInterleavedAccessType(
    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();

  UseScalable = false;

  // Ensure the number of vector elements is greater than 1.
  if (NumElements < 2)
    return false;

  // Ensure the element type is legal.
  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
    return false;

  if (Subtarget->useSVEForFixedLengthVectors() &&
      (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
       (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
        isPowerOf2_32(NumElements) && VecSize > 128))) {
    UseScalable = true;
    return true;
  }

  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
  // 128 will be split into multiple interleaved accesses.
  return VecSize == 64 || VecSize % 128 == 0;
}
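// Worked example (illustrative): <2 x i32> (64 bits) and <4 x i32> (128 bits)
// are accepted directly, <16 x i32> (512 bits) is accepted because 512 is a
// multiple of 128 and will later be split into several accesses, while
// <3 x i32> (96 bits) is rejected. With useSVEForFixedLengthVectors() and a
// 256-bit minimum SVE width, a 256-bit fixed vector takes the UseScalable
// path instead.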
static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
  if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 2);

  if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 4);

  if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 2);

  if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 4);

  if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 8);

  if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
    return ScalableVectorType::get(VTy->getElementType(), 16);

  llvm_unreachable("Cannot handle input vector type");
}
/// Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
///
///      Into:
///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  const DataLayout &DL = LI->getModule()->getDataLayout();

  VectorType *VTy = Shuffles[0]->getType();

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  bool UseScalable;
  if (!Subtarget->hasNEON() ||
      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
    return false;

  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);

  auto *FVTy = cast<FixedVectorType>(VTy);

  // A pointer vector can not be the return type of the ldN intrinsics. Need to
  // load integer vectors first and then convert to pointer vectors.
  Type *EltTy = FVTy->getElementType();
  if (EltTy->isPointerTy())
    FVTy =
        FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());

  // If we're going to generate more than one load, reset the sub-vector type
  // to something legal.
  FVTy = FixedVectorType::get(FVTy->getElementType(),
                              FVTy->getNumElements() / NumLoads);

  auto *LDVTy =
      UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;

  IRBuilder<> Builder(LI);

  // The base address of the load.
  Value *BaseAddr = LI->getPointerOperand();

  if (NumLoads > 1) {
    // We will compute the pointer operand of each load from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
  }

  Type *PtrTy =
      UseScalable
          ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
          : LDVTy->getPointerTo(LI->getPointerAddressSpace());
  Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
                                 LDVTy->getElementCount());

  static const Intrinsic::ID SVELoadIntrs[3] = {
      Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
      Intrinsic::aarch64_sve_ld4_sret};
  static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
                                                 Intrinsic::aarch64_neon_ld3,
                                                 Intrinsic::aarch64_neon_ld4};
  Function *LdNFunc;
  if (UseScalable)
    LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
                                        SVELoadIntrs[Factor - 2], {LDVTy});
  else
    LdNFunc = Intrinsic::getDeclaration(
        LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});

  // Holds sub-vectors extracted from the load intrinsic return values. The
  // sub-vectors are associated with the shufflevector instructions they will
  // replace.
  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;

  Value *PTrue = nullptr;
  if (UseScalable) {
    Optional<unsigned> PgPattern =
        getSVEPredPatternFromNumElements(FVTy->getNumElements());
    if (Subtarget->getMinSVEVectorSizeInBits() ==
            Subtarget->getMaxSVEVectorSizeInBits() &&
        Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
      PgPattern = AArch64SVEPredPattern::all;

    auto *PTruePat =
        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                    {PTruePat});
  }

  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

    // If we're generating more than one load, compute the base address of
    // subsequent loads as an offset from the previous.
    if (LoadCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
                                            FVTy->getNumElements() * Factor);

    CallInst *LdN;
    if (UseScalable)
      LdN = Builder.CreateCall(
          LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
    else
      LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
                               "ldN");

    // Extract and store the sub-vectors returned by the load intrinsic.
    for (unsigned i = 0; i < Shuffles.size(); i++) {
      ShuffleVectorInst *SVI = Shuffles[i];
      unsigned Index = Indices[i];

      Value *SubVec = Builder.CreateExtractValue(LdN, Index);

      if (UseScalable)
        SubVec = Builder.CreateExtractVector(
            FVTy, SubVec,
            ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));

      // Convert the integer vector to pointer vector if the element is pointer.
      if (EltTy->isPointerTy())
        SubVec = Builder.CreateIntToPtr(
            SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
                                         FVTy->getNumElements()));

      SubVecs[SVI].push_back(SubVec);
    }
  }

  // Replace uses of the shufflevector instructions with the sub-vectors
  // returned by the load intrinsic. If a shufflevector instruction is
  // associated with more than one sub-vector, those sub-vectors will be
  // concatenated into a single wide vector.
  for (ShuffleVectorInst *SVI : Shuffles) {
    auto &SubVec = SubVecs[SVI];
    auto *WideVec =
        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
    SVI->replaceAllUsesWith(WideVec);
  }

  return true;
}
/// Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
///        store <12 x i32> %i.vec, <12 x i32>* %ptr
///
///      Into:
///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                  ShuffleVectorInst *SVI,
                                                  unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  auto *VecTy = cast<FixedVectorType>(SVI->getType());
  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

  unsigned LaneLen = VecTy->getNumElements() / Factor;
  Type *EltTy = VecTy->getElementType();
  auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

  const DataLayout &DL = SI->getModule()->getDataLayout();
  bool UseScalable;

  // Skip if we do not have NEON and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long as
  // the vector types are divisible by 128.
  if (!Subtarget->hasNEON() ||
      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
    return false;

  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);

  Value *Op0 = SVI->getOperand(0);
  Value *Op1 = SVI->getOperand(1);
  IRBuilder<> Builder(SI);

  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
  // vectors to integer vectors.
  if (EltTy->isPointerTy()) {
    Type *IntTy = DL.getIntPtrType(EltTy);
    unsigned NumOpElts =
        cast<FixedVectorType>(Op0->getType())->getNumElements();

    // Convert to the corresponding integer vector.
    auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

    SubVecTy = FixedVectorType::get(IntTy, LaneLen);
  }

  // If we're going to generate more than one store, reset the lane length
  // and sub-vector type to something legal.
  LaneLen /= NumStores;
  SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

  auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
                            : SubVecTy;

  // The base address of the store.
  Value *BaseAddr = SI->getPointerOperand();

  if (NumStores > 1) {
    // We will compute the pointer operand of each store from the original base
    // address using GEPs. Cast the base address to a pointer to the scalar
    // element type.
    BaseAddr = Builder.CreateBitCast(
        BaseAddr,
        SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
  }

  auto Mask = SVI->getShuffleMask();

  // Sanity check if all the indices are NOT in range.
  // If mask is `undef` or `poison`, `Mask` may be a vector of -1s.
  // If all of them are `undef`, OOB read will happen later.
  if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
    return false;
  }

  Type *PtrTy =
      UseScalable
          ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
          : STVTy->getPointerTo(SI->getPointerAddressSpace());
  Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
                                 STVTy->getElementCount());

  static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
                                                 Intrinsic::aarch64_sve_st3,
                                                 Intrinsic::aarch64_sve_st4};
  static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
                                                  Intrinsic::aarch64_neon_st3,
                                                  Intrinsic::aarch64_neon_st4};
  Function *StNFunc;
  if (UseScalable)
    StNFunc = Intrinsic::getDeclaration(SI->getModule(),
                                        SVEStoreIntrs[Factor - 2], {STVTy});
  else
    StNFunc = Intrinsic::getDeclaration(
        SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});

  Value *PTrue = nullptr;
  if (UseScalable) {
    Optional<unsigned> PgPattern =
        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
    if (Subtarget->getMinSVEVectorSizeInBits() ==
            Subtarget->getMaxSVEVectorSizeInBits() &&
        Subtarget->getMinSVEVectorSizeInBits() ==
            DL.getTypeSizeInBits(SubVecTy))
      PgPattern = AArch64SVEPredPattern::all;

    auto *PTruePat =
        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
                                    {PTruePat});
  }

  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

    SmallVector<Value *, 5> Ops;

    // Split the shufflevector operands into sub vectors for the new stN call.
    for (unsigned i = 0; i < Factor; i++) {
      Value *Shuffle;
      unsigned IdxI = StoreCount * LaneLen * Factor + i;
      if (Mask[IdxI] >= 0) {
        Shuffle = Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
      } else {
        unsigned StartMask = 0;
        for (unsigned j = 1; j < LaneLen; j++) {
          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
          if (Mask[IdxJ] >= 0) {
            StartMask = Mask[IdxJ] - j;
            break;
          }
        }
        // Note: Filling undef gaps with random elements is ok, since
        // those elements were being written anyway (with undefs).
        // In the case of all undefs we're defaulting to using elems from 0
        // Note: StartMask cannot be negative, it's checked in
        // isReInterleaveMask
        Shuffle = Builder.CreateShuffleVector(
            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
      }

      if (UseScalable)
        Shuffle = Builder.CreateInsertVector(
            STVTy, UndefValue::get(STVTy), Shuffle,
            ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));

      Ops.push_back(Shuffle);
    }

    if (UseScalable)
      Ops.push_back(PTrue);

    // If we're generating more than one store, we compute the base address of
    // subsequent stores as an offset from the previous.
    if (StoreCount > 0)
      BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
                                            BaseAddr, LaneLen * Factor);

    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
    Builder.CreateCall(StNFunc, Ops);
  }

  return true;
}
EVT AArch64TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
  // taken one instruction to materialize the v2i64 zero and one store (with
  // restrictive addressing mode). Just do i64 stores.
  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
    if (Op.isAligned(AlignCheck))
      return true;
    bool Fast;
    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
                                          MachineMemOperand::MONone, &Fast) &&
           Fast;
  };

  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
      AlignmentIsAcceptable(MVT::v16i8, Align(16)))
    return MVT::v16i8;
  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
    return MVT::f128;
  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
    return MVT::i64;
  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
    return MVT::i32;
  return MVT::Other;
}
LLT AArch64TargetLowering::getOptimalMemOpLLT(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
  bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
  bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memset of 32-byte and above. It would have
  // taken one instruction to materialize the v2i64 zero and one store (with
  // restrictive addressing mode). Just do i64 stores.
  bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
  auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
    if (Op.isAligned(AlignCheck))
      return true;
    bool Fast;
    return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
                                          MachineMemOperand::MONone, &Fast) &&
           Fast;
  };

  if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
      AlignmentIsAcceptable(MVT::v2i64, Align(16)))
    return LLT::fixed_vector(2, 64);
  if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
    return LLT::scalar(128);
  if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
    return LLT::scalar(64);
  if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
    return LLT::scalar(32);
  return LLT();
}
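// Worked example (illustrative): for a 32-byte memset with 16-byte alignment
// in a function that allows implicit floating point, IsSmallMemset is false
// and the first check fires, so the chosen type is LLT::fixed_vector(2, 64)
// (a v2i64 store); the same memset under the noimplicitfloat attribute falls
// through to LLT::scalar(64), i.e. plain i64 stores.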
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if (Immed == std::numeric_limits<int64_t>::min()) {
    LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
                      << ": avoid UB for INT64_MIN\n");
    return false;
  }
  // Same encoding for add/sub, just flip the sign.
  Immed = std::abs(Immed);
  bool IsLegal = ((Immed >> 12) == 0 ||
                  ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
  LLVM_DEBUG(dbgs() << "Is " << Immed
                    << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
  return IsLegal;
}
// Return false to prevent folding
// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
// if the folding leads to worse code.
bool AArch64TargetLowering::isMulAddWithConstProfitable(
    SDValue AddNode, SDValue ConstNode) const {
  // Let the DAGCombiner decide for vector types and large types.
  const EVT VT = AddNode.getValueType();
  if (VT.isVector() || VT.getScalarSizeInBits() > 64)
    return true;

  // It is worse if c1 is a legal add immediate while c1*c2 is not,
  // and c1*c2 has to be composed by at least two instructions.
  const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
  const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
  const int64_t C1 = C1Node->getSExtValue();
  const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
  if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
    return true;
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
  if (Insn.size() > 1)
    return false;

  // Default to true and let the DAGCombiner decide.
  return true;
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  return isLegalAddImmediate(Immed);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                  const AddrMode &AM, Type *Ty,
                                                  unsigned AS,
                                                  Instruction *I) const {
  // AArch64 has five basic addressing modes:
  //  reg
  //  reg + 9-bit signed offset
  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //  reg1 + reg2
  //  reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // FIXME: Update this method to support scalable addressing modes.
  if (isa<ScalableVectorType>(Ty)) {
    uint64_t VecElemNumBytes =
        DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
    return AM.HasBaseReg && !AM.BaseOffs &&
           (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
  }

  // check reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  if (!AM.Scale) {
    int64_t Offset = AM.BaseOffs;

    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> shift) << shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2

  return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
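// Worked example (illustrative): for an i32 access (NumBytes == 4) the mode
// [reg + 16380] is accepted because 16380 / 4 == 4095 fits the scaled uimm12
// form and the offset is a multiple of 4; [reg + 255] and [reg - 256] fit the
// 9-bit signed form; [reg + reg, lsl #2] is accepted because Scale equals
// NumBytes. [reg + 16384] matches none of these and is rejected.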
bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
  // Consider splitting large offset of struct or array.
  return true;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
    const MachineFunction &MF, EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f16:
    return Subtarget->hasFullFP16();
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
                                                       Type *Ty) const {
  switch (Ty->getScalarType()->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  default:
    return false;
  }
}
bool AArch64TargetLowering::generateFMAsInMachineCombiner(
    EVT VT, CodeGenOpt::Level OptLevel) const {
  return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
         !useSVEForFixedLengthVectorVT(VT);
}
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                     CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  SDValue ShiftLHS = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
  // combine it with shift 'N' to let it be lowered to UBFX.
  if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
    uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
    if (isMask_64(TruncMask) &&
        ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
        isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
      return false;
  }
  return true;
}
bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
    const SDNode *N) const {
  assert(N->getOpcode() == ISD::XOR &&
         (N->getOperand(0).getOpcode() == ISD::SHL ||
          N->getOperand(0).getOpcode() == ISD::SRL) &&
         "Expected XOR(SHIFT) pattern");

  // Only commute if the entire NOT mask is a hidden shifted mask.
  auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
  if (XorC && ShiftC) {
    unsigned MaskIdx, MaskLen;
    if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
      unsigned ShiftAmt = ShiftC->getZExtValue();
      unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
      if (N->getOperand(0).getOpcode() == ISD::SHL)
        return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
      return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
    }
  }

  return false;
}
bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N, CombineLevel Level) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // Don't allow multiuse shift folding with the same shift amount.
  if (!N->getOperand(0)->hasOneUse())
    return false;

  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
  }

  return true;
}
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                              Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return false;

  int64_t Val = Imm.getSExtValue();
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
    return true;

  if ((int64_t)Val < 0)
    Val = ~Val;
  if (BitSize == 32)
    Val &= (1LL << 32) - 1;

  unsigned LZ = countLeadingZeros((uint64_t)Val);
  unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free so return true for one or fewer MOVK.
  return Shift < 3;
}
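// Worked example (illustrative): Imm == 0x12345678 has 35 leading zeros in a
// 64-bit value, so Shift == (63 - 35) / 16 == 1 and the constant can be
// materialized with one MOVZ plus one MOVK, which is preferred over a
// constant-pool load; a constant whose significant bits span all four 16-bit
// chunks gives Shift == 3 and keeps the load instead.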
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                    unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorMinNumElements());
}
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   cmge X, X, #0
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!Subtarget->hasNEON() || !VT.isVector())
    return SDValue();

  // There must be a shift right algebraic before the xor, and the xor must be a
  // 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
//
// i32 vecreduce_add(
//  v16i32 abs(
//   v16i32 sub(
//    v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
// =================>
// i32 vecreduce_add(
//  v4i32 UADDLP(
//   v8i16 add(
//    v8i16 zext(v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b),
//    v8i16 zext(v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b))))
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
                                                    SelectionDAG &DAG) {
  // Assumed i32 vecreduce_add
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SDValue VecReduceOp0 = N->getOperand(0);
  unsigned Opcode = VecReduceOp0.getOpcode();
  // Assumed v16i32 abs
  if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
    return SDValue();

  SDValue ABS = VecReduceOp0;
  // Assumed v16i32 sub
  if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
      ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
    return SDValue();

  SDValue SUB = ABS->getOperand(0);
  unsigned Opcode0 = SUB->getOperand(0).getOpcode();
  unsigned Opcode1 = SUB->getOperand(1).getOpcode();
  // Assumed v16i32 type
  if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
      SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
    return SDValue();

  // Assumed zext or sext
  bool IsZExt = false;
  if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
    IsZExt = true;
  } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
    IsZExt = false;
  } else
    return SDValue();

  SDValue EXT0 = SUB->getOperand(0);
  SDValue EXT1 = SUB->getOperand(1);
  // Assumed zext's operand has v16i8 type
  if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
      EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
    return SDValue();

  // Pattern is detected. Let's convert it to sequence of nodes.
  SDLoc DL(N);

  // First, create the node pattern of UABD/SABD.
  SDValue UABDHigh8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
                  DAG.getConstant(8, DL, MVT::i64));
  SDValue UABDHigh8Op1 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
                  DAG.getConstant(8, DL, MVT::i64));
  SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
                                  UABDHigh8Op0, UABDHigh8Op1);
  SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);

  // Second, create the node pattern of UABAL.
  SDValue UABDLo8Op0 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
                  DAG.getConstant(0, DL, MVT::i64));
  SDValue UABDLo8Op1 =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
                  DAG.getConstant(0, DL, MVT::i64));
  SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
                                UABDLo8Op0, UABDLo8Op1);
  SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
  SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);

  // Third, create the node of UADDLP.
  SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);

  // Fourth, create the node of VECREDUCE_ADD.
  return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
}
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                          const AArch64Subtarget *ST) {
  if (!ST->hasDotProd())
    return performVecReduceAddCombineWithUADDLP(N, DAG);

  SDValue Op0 = N->getOperand(0);
  if (N->getValueType(0) != MVT::i32 ||
      Op0.getValueType().getVectorElementType() != MVT::i32)
    return SDValue();

  unsigned ExtOpcode = Op0.getOpcode();
  SDValue A = Op0;
  SDValue B;
  if (ExtOpcode == ISD::MUL) {
    A = Op0.getOperand(0);
    B = Op0.getOperand(1);
    if (A.getOpcode() != B.getOpcode() ||
        A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
      return SDValue();
    ExtOpcode = A.getOpcode();
  }
  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
    return SDValue();

  EVT Op0VT = A.getOperand(0).getValueType();
  if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
    return SDValue();

  SDLoc DL(Op0);
  // For non-mla reductions B can be set to 1. For MLA we take the operand of
  // the extend B.
  if (!B)
    B = DAG.getConstant(1, DL, Op0VT);
  else
    B = B.getOperand(0);

  SDValue Zeros =
      DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
  auto DotOpcode =
      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
  SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
                            A.getOperand(0), B);
  return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
}
// Given an (integer) vecreduce, we know the order of the inputs does not
// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  auto DetectAddExtract = [&](SDValue A) {
    // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
    // UADDLP(x) if found.
    if (A.getOpcode() != ISD::ADD)
      return SDValue();
    EVT VT = A.getValueType();
    SDValue Op0 = A.getOperand(0);
    SDValue Op1 = A.getOperand(1);
    if (Op0.getOpcode() != Op1.getOpcode() ||
        (Op0.getOpcode() != ISD::ZERO_EXTEND &&
         Op0.getOpcode() != ISD::SIGN_EXTEND))
      return SDValue();
    SDValue Ext0 = Op0.getOperand(0);
    SDValue Ext1 = Op1.getOperand(0);
    if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        Ext0.getOperand(0) != Ext1.getOperand(0))
      return SDValue();
    // Check that the type is twice the add types, and the extracts are from
    // upper/lower parts of the same source.
    if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
        VT.getVectorNumElements() * 2)
      return SDValue();
    if ((Ext0.getConstantOperandVal(1) != 0 &&
         Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
        (Ext1.getConstantOperandVal(1) != 0 &&
         Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
      return SDValue();
    unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
                                                          : AArch64ISD::SADDLP;
    return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
  };

  SDValue A = N->getOperand(0);
  if (SDValue R = DetectAddExtract(A))
    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
  if (A.getOpcode() == ISD::ADD) {
    if (SDValue R = DetectAddExtract(A.getOperand(0)))
      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                                     A.getOperand(1)));
    if (SDValue R = DetectAddExtract(A.getOperand(1)))
      return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
                         DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
                                     A.getOperand(0)));
  }
  return SDValue();
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
}
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     SmallVectorImpl<SDNode *> &Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(N->getValueType(0), Attr))
    return SDValue(N, 0); // Lower SDIV as SDIV

  EVT VT = N->getValueType(0);

  // For scalable and fixed types, mark them as cheap so we can handle it much
  // later. This allows us to handle larger than legal types.
  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
    return SDValue(N, 0);

  // fold (sdiv X, pow2)
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  unsigned Lg2 = Divisor.countTrailingZeros();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);

  // Add (N0 < 0) ? Pow2 - 1 : 0;
  SDValue CCVal;
  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

  Created.push_back(Cmp.getNode());
  Created.push_back(Add.getNode());
  Created.push_back(CSel.getNode());

  SDValue SRA =
      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (Divisor.isNonNegative())
    return SRA;

  Created.push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
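// Illustrative lowering (assumed, not taken from the original source): for
// `sdiv i32 %x, 8` the nodes built above correspond to roughly
//   add  w8, w0, #7
//   cmp  w0, #0
//   csel w8, w8, w0, lt
//   asr  w0, w8, #3
// i.e. add (2^3 - 1) only when the dividend is negative, then shift right
// arithmetically; a negative power-of-two divisor additionally negates the
// result via the final SUB.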
SDValue
AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     SmallVectorImpl<SDNode *> &Created) const {
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isIntDivCheap(N->getValueType(0), Attr))
    return SDValue(N, 0); // Lower SREM as SREM

  EVT VT = N->getValueType(0);

  // For scalable and fixed types, mark them as cheap so we can handle it much
  // later. This allows us to handle larger than legal types.
  if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
    return SDValue(N, 0);

  // fold (srem X, pow2)
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
    return SDValue();

  unsigned Lg2 = Divisor.countTrailingZeros();
  if (Lg2 == 0)
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue CCVal, CSNeg;
  if (Lg2 == 1) {
    SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
    SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);

    Created.push_back(Cmp.getNode());
    Created.push_back(And.getNode());
  } else {
    SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);

    SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
    SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
    SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
    CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
                        Negs.getValue(1));

    Created.push_back(Negs.getNode());
    Created.push_back(AndPos.getNode());
    Created.push_back(AndNeg.getNode());
  }

  return CSNeg;
}
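// Illustrative lowering (assumed, not taken from the original source): for
// `srem i64 %x, 16` the SUBS/AND/CSNEG sequence built above corresponds to
// roughly
//   negs  x8, x0
//   and   x9, x0, #0xf
//   and   x8, x8, #0xf
//   csneg x0, x9, x8, mi
// so a positive dividend uses its masked low bits directly, while a negative
// dividend masks the negated value and negates it back.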
static bool IsSVECntIntrinsic(SDValue S) {
  switch (getIntrinsicID(S.getNode())) {
  default:
    break;
  case Intrinsic::aarch64_sve_cntb:
  case Intrinsic::aarch64_sve_cnth:
  case Intrinsic::aarch64_sve_cntw:
  case Intrinsic::aarch64_sve_cntd:
    return true;
  }
  return false;
}
/// Calculates what the pre-extend type is, based on the extension
/// operation node provided by \p Extend.
///
/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
/// pre-extend type is pulled directly from the operand, while other extend
/// operations need a bit more inspection to get this information.
///
/// \param Extend The SDNode from the DAG that represents the extend operation
///
/// \returns The type representing the \p Extend source type, or \p MVT::Other
/// if no valid type can be determined
static EVT calculatePreExtendType(SDValue Extend) {
  switch (Extend.getOpcode()) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    return Extend.getOperand(0).getValueType();
  case ISD::AssertSext:
  case ISD::AssertZext:
  case ISD::SIGN_EXTEND_INREG: {
    VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
    if (!TypeNode)
      return MVT::Other;
    return TypeNode->getVT();
  }
  case ISD::AND: {
    ConstantSDNode *Constant =
        dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
    if (!Constant)
      return MVT::Other;

    uint32_t Mask = Constant->getZExtValue();

    if (Mask == UCHAR_MAX)
      return MVT::i8;
    else if (Mask == USHRT_MAX)
      return MVT::i16;
    else if (Mask == UINT_MAX)
      return MVT::i32;

    return MVT::Other;
  }
  default:
    return MVT::Other;
  }
}
/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
/// SExt/ZExt rather than the scalar SExt/ZExt
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
  EVT VT = BV.getValueType();
  if (BV.getOpcode() != ISD::BUILD_VECTOR &&
      BV.getOpcode() != ISD::VECTOR_SHUFFLE)
    return SDValue();

  // Use the first item in the buildvector/shuffle to get the size of the
  // extend, and make sure it looks valid.
  SDValue Extend = BV->getOperand(0);
  unsigned ExtendOpcode = Extend.getOpcode();
  bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
                ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
                ExtendOpcode == ISD::AssertSext;
  if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
      ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
    return SDValue();
  // Shuffle inputs are vectors, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
  if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
      ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
    return SDValue();

  // Restrict valid pre-extend data type
  EVT PreExtendType = calculatePreExtendType(Extend);
  if (PreExtendType == MVT::Other ||
      PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
    return SDValue();

  // Make sure all other operands are equally extended.
  for (SDValue Op : drop_begin(BV->ops())) {
    if (Op.isUndef())
      continue;
    unsigned Opc = Op.getOpcode();
    bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
                     Opc == ISD::AssertSext;
    if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
      return SDValue();
  }

  SDValue NBV;
  SDLoc DL(BV);
  if (BV.getOpcode() == ISD::BUILD_VECTOR) {
    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
    EVT PreExtendLegalType =
        PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
    SmallVector<SDValue, 8> NewOps;
    for (SDValue Op : BV->ops())
      NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
                                    : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
                                                           PreExtendLegalType));
    NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
  } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
    EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
    NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
                               BV.getOperand(1).isUndef()
                                   ? DAG.getUNDEF(PreExtendVT)
                                   : BV.getOperand(1).getOperand(0),
                               cast<ShuffleVectorSDNode>(BV)->getMask());
  }
  return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
}
/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
  // If the value type isn't a vector, none of the operands are going to be dups.
  EVT VT = Mul->getValueType(0);
  if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
    return SDValue();

  SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
  SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);

  // Neither operand has been changed, don't make any further changes.
  if (!Op0 && !Op1)
    return SDValue();

  SDLoc DL(Mul);
  return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
                     Op1 ? Op1 : Mul->getOperand(1));
}
// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
// Same for other types with equivalent constants.
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
      VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
    return SDValue();
  if (N->getOperand(0).getOpcode() != ISD::AND ||
      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();

  SDValue And = N->getOperand(0);
  SDValue Srl = And.getOperand(0);

  APInt V1, V2, V3;
  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
    return SDValue();

  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
      V3 != (HalfSize - 1))
    return SDValue();

  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
                                VT.getVectorElementCount() * 2);

  SDLoc DL(N);
  SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
  SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
}
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {

  if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
    return Ext;
  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
    return Ext;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
  // and in MachineCombiner pass, add+mul will be combined into madd.
  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue MulOper;
  unsigned AddSubOpc;

  auto IsAddSubWith1 = [&](SDValue V) -> bool {
    AddSubOpc = V->getOpcode();
    if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
      SDValue Opnd = V->getOperand(1);
      MulOper = V->getOperand(0);
      if (AddSubOpc == ISD::SUB)
        std::swap(Opnd, MulOper);
      if (auto C = dyn_cast<ConstantSDNode>(Opnd))
        return C->isOne();
    }
    return false;
  };

  if (IsAddSubWith1(N0)) {
    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
    return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
  }

  if (IsAddSubWith1(N1)) {
    SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
    return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
  }

  // The below optimizations require a constant RHS.
  if (!isa<ConstantSDNode>(N1))
    return SDValue();

  ConstantSDNode *C = cast<ConstantSDNode>(N1);
  const APInt &ConstValue = C->getAPIntValue();

  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern match.
  if (IsSVECntIntrinsic(N0) ||
      (N0->getOpcode() == ISD::TRUNCATE &&
       (IsSVECntIntrinsic(N0->getOperand(0)))))
    if (ConstValue.sge(1) && ConstValue.sle(16))
      return SDValue();

  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  // More aggressively, some multiplications N0 * C can be lowered to
  // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
  // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
  // TODO: lower more cases.

  // TrailingZeroes is used to test if the mul can be lowered to
  // shift+add+shift.
  unsigned TrailingZeroes = ConstValue.countTrailingZeros();
  if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smul or umul.
    if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
                            isZeroExtended(N0.getNode(), DAG)))
      return SDValue();
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into madd or msub.
    if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
                           N->use_begin()->getOpcode() == ISD::SUB))
      return SDValue();
  }
  // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
  // and shift+add+shift.
  APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

  unsigned ShiftAmt;

  auto Shl = [&](SDValue N0, unsigned N1) {
    SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
    return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
  };
  auto Add = [&](SDValue N0, SDValue N1) {
    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
  };
  auto Sub = [&](SDValue N0, SDValue N1) {
    return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
  };
  auto Negate = [&](SDValue N) {
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
  };

  // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
  // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
  // the (2^N - 1) can't be executed via a single instruction.
  auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
    unsigned BitWidth = C.getBitWidth();
    for (unsigned i = 1; i < BitWidth / 2; i++) {
      APInt Rem;
      APInt X(BitWidth, (1 << i) + 1);
      APInt::sdivrem(C, X, N, Rem);
      APInt NVMinus1 = N - 1;
      if (Rem == 0 && NVMinus1.isPowerOf2()) {
        M = X;
        return true;
      }
    }
    return false;
  };

  if (ConstValue.isNonNegative()) {
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
    // (mul x, (2^M + 1) * (2^N + 1))
    //     =>  MV = (add (shl x, M), x); (add (shl MV, N), MV)
    APInt SCVMinus1 = ShiftedConstValue - 1;
    APInt SCVPlus1 = ShiftedConstValue + 1;
    APInt CVPlus1 = ConstValue + 1;
    APInt CVM, CVN;
    if (SCVMinus1.isPowerOf2()) {
      ShiftAmt = SCVMinus1.logBase2();
      return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
    } else if (CVPlus1.isPowerOf2()) {
      ShiftAmt = CVPlus1.logBase2();
      return Sub(Shl(N0, ShiftAmt), N0);
    } else if (SCVPlus1.isPowerOf2()) {
      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
      return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
    } else if (Subtarget->hasLSLFast() &&
               isPowPlusPlusConst(ConstValue, CVM, CVN)) {
      APInt CVMMinus1 = CVM - 1;
      APInt CVNMinus1 = CVN - 1;
      unsigned ShiftM1 = CVMMinus1.logBase2();
      unsigned ShiftN1 = CVNMinus1.logBase2();
      // LSLFast implies that shifts of up to 3 places are fast.
      if (ShiftM1 <= 3 && ShiftN1 <= 3) {
        SDValue MVal = Add(Shl(N0, ShiftM1), N0);
        return Add(Shl(MVal, ShiftN1), MVal);
      }
    }
  } else {
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
    // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
    // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
    APInt SCVPlus1 = -ShiftedConstValue + 1;
    APInt CVNegPlus1 = -ConstValue + 1;
    APInt CVNegMinus1 = -ConstValue - 1;
    if (CVNegPlus1.isPowerOf2()) {
      ShiftAmt = CVNegPlus1.logBase2();
      return Sub(N0, Shl(N0, ShiftAmt));
    } else if (CVNegMinus1.isPowerOf2()) {
      ShiftAmt = CVNegMinus1.logBase2();
      return Negate(Add(Shl(N0, ShiftAmt), N0));
    } else if (SCVPlus1.isPowerOf2()) {
      ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
      return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
    }
  }

  return SDValue();
}
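// Worked examples (illustrative) of the decompositions above:
//   mul x, 7   -> sub (shl x, 3), x                 ; 2^3 - 1
//   mul x, 10  -> shl (add (shl x, 2), x), 1        ; (2^2 + 1) * 2
//   mul x, 45  -> MV = add (shl x, 2), x;
//                 add (shl MV, 3), MV               ; (1+4)*(1+8), LSLFast only
//   mul x, -7  -> sub x, (shl x, 3)                 ; -(2^3 - 1)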
static SDValue
performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }

  return SDValue();
}
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
    return Res;

  EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use a fp load instead and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue N0 = N->getOperand(0);
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      // Do not change the width of a volatile load.
      !cast<LoadSDNode>(N0)->isVolatile()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                               LN0->getPointerInfo(), LN0->getAlign(),
                               LN0->getMemOperand()->getFlags());

    // Make sure successors of the original load stay after it by updating them
    // to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));

    unsigned Opcode =
        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                            : AArch64ISD::UITOF;
    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
  }

  return SDValue();
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  if (!N->getValueType(0).isSimple())
    return SDValue();

  SDValue Op = N->getOperand(0);
  if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
    return SDValue();

  if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
    return SDValue();

  SDValue ConstVec = Op->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
  uint32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64 &&
      (FloatBits != 16 || !Subtarget->hasFullFP16()))
    return SDValue();

  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
  uint32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
  if (IntBits > FloatBits)
    return SDValue();

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t Bits = IntBits == 64 ? 64 : 32;
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
  if (C == -1 || C == 0 || C > Bits)
    return SDValue();

  EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
    return SDValue();

  if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
      N->getOpcode() == ISD::FP_TO_UINT_SAT) {
    EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
    if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
      return SDValue();
  }

  SDLoc DL(N);
  bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
                   N->getOpcode() == ISD::FP_TO_SINT_SAT);
  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
  SDValue FixConv =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
  // We can handle smaller integers by generating an extra trunc.
  if (IntBits < FloatBits)
    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

  return FixConv;
}

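// A concrete instance of the fold above (illustrative sketch): with a splat
// multiplier of 2^4 = 16.0,
//   (v4i32 (fp_to_sint (fmul v4f32 X, splat(16.0))))
//     --> aarch64_neon_vcvtfp2fxs(X, #4)
// i.e. a single fixed-point fcvtzs with four fractional bits replaces the
// fmul + fcvtzs pair.
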
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  if (!Subtarget->hasNEON())
    return SDValue();

  SDValue Op = N->getOperand(0);
  unsigned Opc = Op->getOpcode();
  if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
      !Op.getOperand(0).getValueType().isSimple() ||
      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
    return SDValue();

  SDValue ConstVec = N->getOperand(1);
  if (!isa<BuildVectorSDNode>(ConstVec))
    return SDValue();

  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
  int32_t IntBits = IntTy.getSizeInBits();
  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
    return SDValue();

  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
  int32_t FloatBits = FloatTy.getSizeInBits();
  if (FloatBits != 32 && FloatBits != 64)
    return SDValue();

  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
  if (IntBits > FloatBits)
    return SDValue();

  BitVector UndefElements;
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
  if (C == -1 || C == 0 || C > FloatBits)
    return SDValue();

  MVT ResTy;
  unsigned NumLanes = Op.getValueType().getVectorNumElements();
  switch (NumLanes) {
  default:
    return SDValue();
  case 2:
    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
    break;
  case 4:
    ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
    break;
  }

  if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc DL(N);
  SDValue ConvInput = Op.getOperand(0);
  bool IsSigned = Opc == ISD::SINT_TO_FP;
  if (IntBits < FloatBits)
    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                            ResTy, ConvInput);

  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                     DAG.getConstant(C, DL, MVT::i32));
}

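// A concrete instance of the fold above (illustrative sketch): with a splat
// divisor of 2^3 = 8.0,
//   (v4f32 (fdiv (sint_to_fp v4i32 X), splat(8.0)))
//     --> aarch64_neon_vcvtfxs2fp(X, #3)
// i.e. a single fixed-point scvtf with three fractional bits replaces the
// scvtf + fdiv pair.
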
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}

/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
/// with an EXTR. Can't quite be done in TableGen because the two immediates
/// aren't independent.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the high part of the register, they're
  // not really an EXTR.
  if (LHSFromHi == RHSFromHi)
    return SDValue();

  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
    return SDValue();

  if (LHSFromHi) {
    std::swap(LHS, RHS);
    std::swap(ShiftLHS, ShiftRHS);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
                     DAG.getConstant(ShiftRHS, DL, MVT::i64));
}

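// A concrete instance of the fold above (illustrative sketch): for i32,
//   (or (shl x, 8), (srl y, 24))  -->  (AArch64ISD::EXTR x, y, 24)
// which is valid because the two shift amounts sum to the register width
// (8 + 24 == 32).
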
static SDValue tryCombineToBSL(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  // The combining code currently only works for NEON vectors. In particular,
  // it does not work for SVE when dealing with vectors wider than 128 bits.
  if (!VT.is64BitVector() && !VT.is128BitVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  // InstCombine does (not (neg a)) => (add a -1).
  // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
  // Loop over all combinations of AND operands.
  for (int i = 1; i >= 0; --i) {
    for (int j = 1; j >= 0; --j) {
      SDValue O0 = N0->getOperand(i);
      SDValue O1 = N1->getOperand(j);
      SDValue Sub, Add, SubSibling, AddSibling;

      // Find a SUB and an ADD operand, one from each AND.
      if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
        Sub = O0;
        Add = O1;
        SubSibling = N0->getOperand(1 - i);
        AddSibling = N1->getOperand(1 - j);
      } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
        Add = O0;
        Sub = O1;
        AddSibling = N0->getOperand(1 - i);
        SubSibling = N1->getOperand(1 - j);
      } else
        continue;

      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
        continue;

      // Constant ones is always righthand operand of the Add.
      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
        continue;

      if (Sub.getOperand(1) != Add.getOperand(0))
        continue;

      return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
    }
  }

  // (or (and a b) (and (not a) c)) => (bsl a b c)
  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  unsigned Bits = VT.getScalarSizeInBits();
  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
      if (!BVN0 || !BVN1)
        continue;

      bool FoundMatch = true;
      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
        if (!CN0 || !CN1 ||
            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
          FoundMatch = false;
          break;
        }
      }

      if (FoundMatch)
        return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
                           N0->getOperand(1 - i), N1->getOperand(1 - j));
    }

  return SDValue();
}

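// A concrete instance of the constant-mask case above (illustrative sketch),
// shown per 16-bit lane:
//   (or (and a, #0x00ff), (and c, #0xff00))  -->  (AArch64ISD::BSP #0x00ff, a, c)
// The fold only fires because the two masks are exact bitwise complements.
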
// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
// convert to csel(ccmp(.., cc0)), depending on cc1:
//
// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
// =>
// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
//
// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
// =>
// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  SDValue CSel0 = N->getOperand(0);
  SDValue CSel1 = N->getOperand(1);

  if (CSel0.getOpcode() != AArch64ISD::CSEL ||
      CSel1.getOpcode() != AArch64ISD::CSEL)
    return SDValue();

  if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
    return SDValue();

  if (!isNullConstant(CSel0.getOperand(0)) ||
      !isOneConstant(CSel0.getOperand(1)) ||
      !isNullConstant(CSel1.getOperand(0)) ||
      !isOneConstant(CSel1.getOperand(1)))
    return SDValue();

  SDValue Cmp0 = CSel0.getOperand(3);
  SDValue Cmp1 = CSel1.getOperand(3);
  AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
  AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
  if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
    return SDValue();
  if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
      Cmp0.getOpcode() == AArch64ISD::SUBS) {
    std::swap(Cmp0, Cmp1);
    std::swap(CC0, CC1);
  }

  if (Cmp1.getOpcode() != AArch64ISD::SUBS)
    return SDValue();

  SDLoc DL(N);
  SDValue CCmp, Condition;
  unsigned NZCV;

  if (N->getOpcode() == ISD::AND) {
    AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
    Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
    NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
  } else {
    AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
    Condition = DAG.getConstant(CC0, DL, MVT_CC);
    NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
  }

  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);

  auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
  if (Op1 && Op1->getAPIntValue().isNegative() &&
      Op1->getAPIntValue().sgt(-32)) {
    // CCMP accepts a constant in the range [0, 31]; if Op1 is a constant in
    // the range [-31, -1], we can select CCMN instead to avoid the extra mov.
    SDValue AbsOp1 =
        DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
    CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
                       NZCVOp, Condition, Cmp0);
  } else {
    CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
                       Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
  }
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
                     CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
                     CCmp);
}

static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (SDValue R = performANDORCSELCombine(N, DAG))
    return R;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
  if (SDValue Res = tryCombineToEXTR(N, DCI))
    return Res;

  if (SDValue Res = tryCombineToBSL(N, DCI))
    return Res;

  return SDValue();
}

static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
  if (!MemVT.getVectorElementType().isSimple())
    return false;

  uint64_t MaskForTy = 0ull;
  switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
  case MVT::i8:
    MaskForTy = 0xffull;
    break;
  case MVT::i16:
    MaskForTy = 0xffffull;
    break;
  case MVT::i32:
    MaskForTy = 0xffffffffull;
    break;
  default:
    return false;
  }

  if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
    if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
      return Op0->getAPIntValue().getLimitedValue() == MaskForTy;

  return false;
}

static SDValue performSVEAndCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  unsigned Opc = Src->getOpcode();

  // Zero/any extend of an unsigned unpack
  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
    SDValue UnpkOp = Src->getOperand(0);
    SDValue Dup = N->getOperand(1);

    if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
      return SDValue();

    SDLoc DL(N);
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
    if (!C)
      return SDValue();

    uint64_t ExtVal = C->getZExtValue();

    // If the mask is fully covered by the unpack, we don't need to push
    // a new AND onto the operand
    EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
    if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
        (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
        (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
      return Src;

    // Truncate to prevent a DUP with an over wide constant
    APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());

    // Otherwise, make sure we propagate the AND to the operand
    // of the unpack
    Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
                      DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));

    SDValue And = DAG.getNode(ISD::AND, DL,
                              UnpkOp->getValueType(0), UnpkOp, Dup);

    return DAG.getNode(Opc, DL, N->getValueType(0), And);
  }

  if (!EnableCombineMGatherIntrinsics)
    return SDValue();

  SDValue Mask = N->getOperand(1);

  if (!Src.hasOneUse())
    return SDValue();

  EVT MemVT;

  // SVE load instructions perform an implicit zero-extend, which makes them
  // perfect candidates for combining.
  switch (Opc) {
  case AArch64ISD::LD1_MERGE_ZERO:
  case AArch64ISD::LDNF1_MERGE_ZERO:
  case AArch64ISD::LDFF1_MERGE_ZERO:
    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
    break;
  case AArch64ISD::GLD1_MERGE_ZERO:
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  case AArch64ISD::GLDFF1_MERGE_ZERO:
  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
  case AArch64ISD::GLDNT1_MERGE_ZERO:
    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
    break;
  default:
    return SDValue();
  }

  if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
    return Src;

  return SDValue();
}

static SDValue performANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (SDValue R = performANDORCSELCombine(N, DAG))
    return R;

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  if (VT.isScalableVector())
    return performSVEAndCombine(N, DCI);

  // The combining code below works only for NEON vectors. In particular, it
  // does not work for SVE when dealing with vectors wider than 128 bits.
  if (!VT.is64BitVector() && !VT.is128BitVector())
    return SDValue();

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  if (!BVN)
    return SDValue();

  // AND does not accept an immediate, so check if we can use a BIC immediate
  // instruction instead. We do this here instead of using a (and x, (mvni imm))
  // pattern in isel, because some immediates may be lowered to the preferred
  // (and x, (movi imm)) form, even though an mvni representation also exists.
  APInt DefBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, DefBits, UndefBits)) {
    SDValue NewOp;

    DefBits = ~DefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    DefBits, &LHS)))
      return NewOp;

    UndefBits = ~UndefBits;
    if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)) ||
        (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
                                    UndefBits, &LHS)))
      return NewOp;
  }

  return SDValue();
}

, EVT VT
, bool FullFP16
) {
15828 case ISD::STRICT_FADD
:
15830 return (FullFP16
&& VT
== MVT::f16
) || VT
== MVT::f32
|| VT
== MVT::f64
;
15832 return VT
== MVT::i64
;
15838 static SDValue
getPTest(SelectionDAG
&DAG
, EVT VT
, SDValue Pg
, SDValue Op
,
15839 AArch64CC::CondCode Cond
);
15841 static bool isPredicateCCSettingOp(SDValue N
) {
15842 if ((N
.getOpcode() == ISD::SETCC
) ||
15843 (N
.getOpcode() == ISD::INTRINSIC_WO_CHAIN
&&
15844 (N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege
||
15845 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt
||
15846 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi
||
15847 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs
||
15848 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele
||
15849 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo
||
15850 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels
||
15851 N
.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt
||
15852 // get_active_lane_mask is lowered to a whilelo instruction.
15853 N
.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask
)))
// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
// ... into: "ptrue p, all" + PTEST
static SDValue
performFirstTrueTestVectorCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  // Make sure PTEST can be legalised with illegal types.
  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N0.getValueType();

  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
      !isNullConstant(N->getOperand(1)))
    return SDValue();

  // Restrict the DAG combine to only cases where we're extracting from a
  // flag-setting operation.
  if (!isPredicateCCSettingOp(N0))
    return SDValue();

  // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
  SelectionDAG &DAG = DCI.DAG;
  SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
}

// Materialize : Idx = (add (mul vscale, NumEls), -1)
//               i1 = extract_vector_elt t37, Constant:i64<Idx>
// ... into: "ptrue p, all" + PTEST
static SDValue
performLastTrueTestVectorCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  // Make sure PTEST is legal for these types.
  if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT OpVT = N0.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Idx == (add (mul vscale, NumEls), -1)
  SDValue Idx = N->getOperand(1);
  if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
    return SDValue();

  SDValue VS = Idx.getOperand(0);
  if (VS.getOpcode() != ISD::VSCALE)
    return SDValue();

  unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
  if (VS.getConstantOperandVal(0) != NumEls)
    return SDValue();

  // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
  SelectionDAG &DAG = DCI.DAG;
  SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
  return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
}

static SDValue
performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               const AArch64Subtarget *Subtarget) {
  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
  if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
    return Res;
  if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
    return Res;

  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);

  EVT VT = N->getValueType(0);
  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
  bool IsStrict = N0->isStrictFPOpcode();

  // extract(dup x) -> x
  if (N0.getOpcode() == AArch64ISD::DUP)
    return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);

  // Rewrite for pairwise fadd pattern
  //   (f32 (extract_vector_elt
  //           (fadd (vXf32 Other)
  //                 (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
  // ->
  //   (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
  //              (extract_vector_elt (vXf32 Other) 1))
  // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
  // we can only do this when it's used only by the extract_vector_elt.
  if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
      hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
      (!IsStrict || N0.hasOneUse())) {
    SDLoc DL(N0);
    SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
    SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);

    ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
    SDValue Other = N00;

    // And handle the commutative case.
    if (!Shuffle) {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
      Other = N01;
    }

    if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
        Other == Shuffle->getOperand(0)) {
      SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
                                     DAG.getConstant(0, DL, MVT::i64));
      SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
                                     DAG.getConstant(1, DL, MVT::i64));
      if (!IsStrict)
        return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);

      // For strict_fadd we need uses of the final extract_vector to be replaced
      // with the strict_fadd, but we also need uses of the chain output of the
      // original strict_fadd to use the chain output of the new strict_fadd as
      // otherwise it may not be deleted.
      SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
                                {VT, MVT::Other},
                                {N0->getOperand(0), Extract1, Extract2});
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

static SDValue performConcatVectorsCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();

  if (VT.isScalableVector())
    return SDValue();

  // Optimize concat_vectors of truncated vectors, where the intermediate
  // type is illegal, to avoid said illegality, e.g.,
  //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
  //                          (v2i16 (truncate (v2i64)))))
  // ->
  //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
  //                                    (v4i32 (bitcast (v2i64))),
  //                                    <0, 2, 4, 6>)))
  // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
  // on both input and result type, so we might generate worse code.
  // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
  if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
      N1Opc == ISD::TRUNCATE) {
    SDValue N00 = N0->getOperand(0);
    SDValue N10 = N1->getOperand(0);
    EVT N00VT = N00.getValueType();

    if (N00VT == N10.getValueType() &&
        (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
        N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
      MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
      for (size_t i = 0; i < Mask.size(); ++i)
        Mask[i] = i * 2;
      return DAG.getNode(ISD::TRUNCATE, dl, VT,
                         DAG.getVectorShuffle(
                             MidVT, dl,
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                             DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
    }
  }

  if (N->getOperand(0).getValueType() == MVT::v4i8) {
    // If we have a concat of v4i8 loads, convert them to a buildvector of f32
    // loads to prevent having to go through the v4i8 load legalization that
    // needs to extend each element into a larger type.
    if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
          if (V.getValueType() != MVT::v4i8)
            return false;
          if (V.isUndef())
            return true;
          LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
          return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
                 LD->getExtensionType() == ISD::NON_EXTLOAD;
        })) {
      EVT NVT =
          EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
      SmallVector<SDValue> Ops;

      for (unsigned i = 0; i < N->getNumOperands(); i++) {
        SDValue V = N->getOperand(i);
        if (V.isUndef())
          Ops.push_back(DAG.getUNDEF(MVT::f32));
        else {
          LoadSDNode *LD = cast<LoadSDNode>(V);
          SDValue NewLoad =
              DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
                          LD->getMemOperand());
          DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
          Ops.push_back(NewLoad);
        }
      }
      return DAG.getBitcast(N->getValueType(0),
                            DAG.getBuildVector(NVT, dl, Ops));
    }
  }

  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
  // extracted subvectors from the same original vectors. Combine these into a
  // single avg that operates on the two original vectors.
  // avgceil is the target independent name for rhadd, avgfloor is a hadd.
  //
  //  (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
  //                                   extract_subvector (v16i8 OpB, <0>))),
  //                  (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
  //                                   extract_subvector (v16i8 OpB, <8>)))))
  // ->
  //  (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
  if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
      (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
       N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
    SDValue N00 = N0->getOperand(0);
    SDValue N01 = N0->getOperand(1);
    SDValue N10 = N1->getOperand(0);
    SDValue N11 = N1->getOperand(1);

    EVT N00VT = N00.getValueType();
    EVT N10VT = N10.getValueType();

    if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
      SDValue N00Source = N00->getOperand(0);
      SDValue N01Source = N01->getOperand(0);
      SDValue N10Source = N10->getOperand(0);
      SDValue N11Source = N11->getOperand(0);

      if (N00Source == N10Source && N01Source == N11Source &&
          N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
        assert(N0.getValueType() == N1.getValueType());

        uint64_t N00Index = N00.getConstantOperandVal(1);
        uint64_t N01Index = N01.getConstantOperandVal(1);
        uint64_t N10Index = N10.getConstantOperandVal(1);
        uint64_t N11Index = N11.getConstantOperandVal(1);

        if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
            N10Index == N00VT.getVectorNumElements())
          return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
      }
    }
  }

  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  // canonicalise to that.
  if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
    assert(VT.getScalarSizeInBits() == 64);
    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
                       DAG.getConstant(0, dl, MVT::i64));
  }

  // Canonicalise concat_vectors so that the right-hand vector has as few
  // bit-casts as possible before its real operation. The primary matching
  // destination for these operations will be the narrowing "2" instructions,
  // which depend on the operation being performed on this right-hand vector.
  // For example,
  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
  // becomes
  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

  if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
    return SDValue();
  SDValue RHS = N1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();
  // If the RHS is not a vector, this is not the pattern we're looking for.
  if (!RHSTy.isVector())
    return SDValue();

  LLVM_DEBUG(
      dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                  RHSTy.getVectorNumElements() * 2);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                                 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
                                 RHS));
}

static SDValue
performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                               SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue V = N->getOperand(0);

  // NOTE: This combine exists in DAGCombiner, but that version's legality check
  // blocks this combine because the non-const case requires custom lowering.
  //
  // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
  if (V.getOpcode() == ISD::SPLAT_VECTOR)
    if (isa<ConstantSDNode>(V.getOperand(0)))
      return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));

  return SDValue();
}

static SDValue
performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  uint64_t IdxVal = N->getConstantOperandVal(2);
  EVT VecVT = Vec.getValueType();
  EVT SubVT = SubVec.getValueType();

  // Only do this for legal fixed vector types.
  if (!VecVT.isFixedLengthVector() ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
      !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
    return SDValue();

  // Ignore widening patterns.
  if (IdxVal == 0 && Vec.isUndef())
    return SDValue();

  // Subvector must be half the width and an "aligned" insertion.
  unsigned NumSubElts = SubVT.getVectorNumElements();
  if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
      (IdxVal != 0 && IdxVal != NumSubElts))
    return SDValue();

  // Fold insert_subvector -> concat_vectors
  // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
  // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
  SDValue Lo, Hi;
  if (IdxVal == 0) {
    Lo = SubVec;
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                     DAG.getVectorIdxConstant(NumSubElts, DL));
  } else {
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
                     DAG.getVectorIdxConstant(0, DL));
    Hi = SubVec;
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
}

static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait until after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Transform a scalar conversion of a value from a lane extract into a
  // lane extract of a vector conversion. E.g., from foo1 to foo2:
  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  //
  // The second form interacts better with instruction selection and the
  // register allocator to avoid cross-class register copies that aren't
  // coalescable due to a lane reference.

  // Check the operand and see if it originates from a lane extract.
  SDValue Op1 = N->getOperand(1);
  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  // Yep, no additional predication needed. Perform the transform.
  SDValue IID = N->getOperand(0);
  SDValue Shift = N->getOperand(2);
  SDValue Vec = Op1.getOperand(0);
  SDValue Lane = Op1.getOperand(1);
  EVT ResTy = N->getValueType(0);
  EVT VecResTy;
  SDLoc DL(N);

  // The vector width should be 128 bits by the time we get here, even
  // if it started as 64 bits (the extract_vector handling will have
  // done so). Bail if it is not.
  if (Vec.getValueSizeInBits() != 128)
    return SDValue();

  if (Vec.getValueType() == MVT::v4i32)
    VecResTy = MVT::v4f32;
  else if (Vec.getValueType() == MVT::v2i64)
    VecResTy = MVT::v2f64;
  else
    return SDValue();

  SDValue Convert =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}

// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
//  (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  MVT VT = N.getSimpleValueType();
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      N.getConstantOperandVal(1) == 0)
    N = N.getOperand(0);

  switch (N.getOpcode()) {
  case AArch64ISD::DUP:
  case AArch64ISD::DUPLANE8:
  case AArch64ISD::DUPLANE16:
  case AArch64ISD::DUPLANE32:
  case AArch64ISD::DUPLANE64:
  case AArch64ISD::MOVI:
  case AArch64ISD::MOVIshift:
  case AArch64ISD::MOVIedit:
  case AArch64ISD::MOVImsl:
  case AArch64ISD::MVNIshift:
  case AArch64ISD::MVNImsl:
    break;
  default:
    // FMOV could be supported, but isn't very useful, as it would only occur
    // if you passed a bitcast'd floating point immediate to an eligible long
    // integer op (addl, smull, ...).
    return SDValue();
  }

  if (!VT.is64BitVector())
    return SDValue();

  SDLoc DL(N);
  unsigned NumElems = VT.getVectorNumElements();
  if (N.getValueType().is64BitVector()) {
    MVT ElementTy = VT.getVectorElementType();
    MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
    N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
  }

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
                     DAG.getConstant(NumElems, DL, MVT::i64));
}

static bool isEssentiallyExtractHighSubvector(SDValue N) {
  if (N.getOpcode() == ISD::BITCAST)
    N = N.getOperand(0);
  if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
    return false;
  if (N.getOperand(0).getValueType().isScalableVector())
    return false;
  return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
         N.getOperand(0).getValueType().getVectorNumElements() / 2;
}

/// Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
  const SDValue *Opnd0;
  const SDValue *Opnd1;
  ISD::CondCode CC;
};

/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
  const SDValue *Cmp;
  AArch64CC::CondCode CC;
};

/// Helper structure to keep track of SetCC information.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};

/// Helper structure to be able to read SetCC information. If the IsAArch64
/// field is set to true, Info is an AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};

/// Check whether or not \p Op is a SET_CC operation, either a generic or
/// an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straightforward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }

  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  // - csel 1, 0, cc
  // - csel 0, 1, !cc
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Check that the operands match the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isZero();
}

// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
          isSetCC(Op->getOperand(0), Info));
}

// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
//   -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If both operands are a SET_CC, then we don't want to perform this
  // folding and create another csel as this results in more instructions
  // (and higher register usage).
  if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
      isSetCCOrZExtSetCC(RHS, InfoAndKind))
    return SDValue();

  // If neither operand is a SET_CC, give up.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
        MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    Cmp = getAArch64Cmp(
        *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
        dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}

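// A concrete instance of the fold above (illustrative sketch):
//   (i32 (add x, (zext (setcc eq a, b))))
//     --> (AArch64ISD::CSEL x, (add x, 1), ne, (cmp a, b))
// which instruction selection matches as "cmp a, b; csinc wD, wX, wX, ne".
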
// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  // Only scalar integer and vector types.
  if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
    return SDValue();

  auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
  auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
  if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
    return SDValue();

  SDValue Op1 = LHS->getOperand(0);
  SDValue Op2 = RHS->getOperand(0);
  EVT OpVT1 = Op1.getValueType();
  EVT OpVT2 = Op2.getValueType();
  if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
      Op2.getOpcode() != AArch64ISD::UADDV ||
      OpVT1.getVectorElementType() != VT)
    return SDValue();

  SDLoc DL(N);
  SDValue Val1 = Op1.getOperand(0);
  SDValue Val2 = Op2.getOperand(0);
  EVT ValVT = Val1->getValueType(0);
  SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                     DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
                     DAG.getConstant(0, DL, MVT::i64));
}

/// Perform the scalar expression combine in the form of:
///   CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
///   CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Handle commutativity.
  if (LHS.getOpcode() != AArch64ISD::CSEL &&
      LHS.getOpcode() != AArch64ISD::CSNEG) {
    std::swap(LHS, RHS);
    if (LHS.getOpcode() != AArch64ISD::CSEL &&
        LHS.getOpcode() != AArch64ISD::CSNEG) {
      return SDValue();
    }
  }

  if (!LHS.hasOneUse())
    return SDValue();

  AArch64CC::CondCode AArch64CC =
      static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));

  // The CSEL should include a const one operand, and the CSNEG should include
  // One or NegOne operand.
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
  if (!CTVal || !CFVal)
    return SDValue();

  if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
        (CTVal->isOne() || CFVal->isOne())) &&
      !(LHS.getOpcode() == AArch64ISD::CSNEG &&
        (CTVal->isOne() || CFVal->isAllOnes())))
    return SDValue();

  SDLoc DL(N);
  // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
  if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
      !CFVal->isOne()) {
    std::swap(CTVal, CFVal);
    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  }

  // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
  if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
      !CFVal->isAllOnes()) {
    APInt C = -1 * CFVal->getAPIntValue();
    CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
    CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
    AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
  }

  // It might be neutral for larger constants, as the immediate needs to be
  // materialized in a register.
  APInt ADDC = CTVal->getAPIntValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
    return SDValue();

  assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
          (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
         "Unexpected constant value");

  SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
  SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
  SDValue Cmp = LHS.getOperand(3);

  return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
}

// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (N->getOpcode() != ISD::ADD)
    return SDValue();

  SDValue Dot = N->getOperand(0);
  SDValue A = N->getOperand(1);
  // Handle commutativity
  auto isZeroDot = [](SDValue Dot) {
    return (Dot.getOpcode() == AArch64ISD::UDOT ||
            Dot.getOpcode() == AArch64ISD::SDOT) &&
           isZerosVector(Dot.getOperand(0).getNode());
  };
  if (!isZeroDot(Dot))
    std::swap(Dot, A);
  if (!isZeroDot(Dot))
    return SDValue();

  return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
                     Dot.getOperand(2));
}

static bool isNegatedInteger(SDValue Op) {
  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
}

static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, VT);
  return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
}

// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
//
// The folding helps csel to be matched with csneg without generating
// redundant neg instruction, which includes negation of the csel expansion
// of abs node lowered by lowerABS.
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
  if (!isNegatedInteger(SDValue(N, 0)))
    return SDValue();

  SDValue CSel = N->getOperand(1);
  if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
    return SDValue();

  SDValue N0 = CSel.getOperand(0);
  SDValue N1 = CSel.getOperand(1);

  // If neither operand is a negation, the folding isn't worthwhile, as it
  // would introduce two additional negations while removing only one.
  if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
    return SDValue();

  SDValue N0N = getNegatedInteger(N0, DAG);
  SDValue N1N = getNegatedInteger(N1, DAG);

  SDLoc DL(N);
  EVT VT = CSel.getValueType();
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
                     CSel.getOperand(3));
}

// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
//      (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT VT = N->getSimpleValueType(0);
  if (!VT.is128BitVector()) {
    if (N->getOpcode() == ISD::ADD)
      return performSetccAddFolding(N, DAG);
    return SDValue();
  }

  // Make sure both branches are extended in the same way.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
      LHS.getOpcode() != RHS.getOpcode())
    return SDValue();

  unsigned ExtType = LHS.getOpcode();

  // It's not worth doing if at least one of the inputs isn't already an
  // extract, but we don't know which it'll be so we have to try both.
  if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
    if (!RHS.getNode())
      return SDValue();

    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
  } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
    if (!LHS.getNode())
      return SDValue();

    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
  }

  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}

static bool isCMP(SDValue Op) {
  return Op.getOpcode() == AArch64ISD::SUBS &&
         !Op.getNode()->hasAnyUseOfValue(0);
}

// (CSEL 1 0 CC Cond) => CC
// (CSEL 0 1 CC Cond) => !CC
static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return None;
  auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
  if (CC == AArch64CC::AL || CC == AArch64CC::NV)
    return None;
  SDValue OpLHS = Op.getOperand(0);
  SDValue OpRHS = Op.getOperand(1);
  if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
    return CC;
  if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
    return getInvertedCondCode(CC);

  return None;
}

// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
  SDValue CmpOp = Op->getOperand(2);
  if (!isCMP(CmpOp))
    return SDValue();

  if (IsAdd) {
    if (!isOneConstant(CmpOp.getOperand(1)))
      return SDValue();
  } else {
    if (!isNullConstant(CmpOp.getOperand(0)))
      return SDValue();
  }

  SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
  auto CC = getCSETCondCode(CsetOp);
  if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
    return SDValue();

  return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
                     Op->getOperand(0), Op->getOperand(1),
                     CsetOp.getOperand(3));
}

// (ADC x 0 cond) => (CINC x HS cond)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Cond = N->getOperand(2);

  if (!isNullConstant(RHS))
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // (CINC x cc cond) <=> (CSINC x x !cc cond)
  SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}

// Transform vector add(zext i8 to i32, zext i8 to i32)
//  into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows extra uses of saddl/uaddl at the lower vector widths, and less
// extends.
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
      (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
       N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
      (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
       N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
      N->getOperand(0).getOperand(0).getValueType() !=
          N->getOperand(1).getOperand(0).getValueType())
    return SDValue();

  SDValue N0 = N->getOperand(0).getOperand(0);
  SDValue N1 = N->getOperand(1).getOperand(0);
  EVT InVT = N0.getValueType();

  EVT S1 = InVT.getScalarType();
  EVT S2 = VT.getScalarType();
  if ((S2 == MVT::i32 && S1 == MVT::i8) ||
      (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
    SDLoc DL(N);
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
                                  S2.getHalfSizedIntegerVT(*DAG.getContext()),
                                  VT.getVectorElementCount());
    SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
    SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
    SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
  }

  return SDValue();
}

static SDValue performBuildVectorCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // A build vector of two extracted elements is equivalent to an
  // extract subvector where the inner vector is any-extended to the
  // extract_vector_elt VT.
  //    (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
  //                  (extract_elt_iXX_to_i32 vec Idx+1))
  // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
  //
  // For now, only consider the v2i32 case, which arises as a result of
  // legalization.
  if (VT != MVT::v2i32)
    return SDValue();

  SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
  // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
  if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      // Constant index.
      isa<ConstantSDNode>(Elt0->getOperand(1)) &&
      isa<ConstantSDNode>(Elt1->getOperand(1)) &&
      // Both EXTRACT_VECTOR_ELT from same vector...
      Elt0->getOperand(0) == Elt1->getOperand(0) &&
      // ... and contiguous. First element's index +1 == second element's index.
      Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
      // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
      // ResultType's known minimum vector length.
      Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
    SDValue VecToExtend = Elt0->getOperand(0);
    EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
    if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
      return SDValue();

    SDValue SubvectorIdx =
        DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);

    SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
                       SubvectorIdx);
  }

  return SDValue();
}

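// A concrete instance of the fold above (illustrative sketch):
//   (v2i32 (build_vector (extract_elt v4i16 V, 2), (extract_elt v4i16 V, 3)))
//     --> (extract_subvector (v4i32 (any_extend V)), 2)
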
static SDValue performAddCombineForShiftedOperands(SDNode *N,
                                                   SelectionDAG &DAG) {
  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
  // commutative.
  if (N->getOpcode() != ISD::ADD)
    return SDValue();

  // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
  // shifted register is only available for i32 and i64.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  uint64_t LHSImm = 0, RHSImm = 0;
  // If both operands are shifted by imm and the shift amount is not greater
  // than 4 for one operand, swap LHS and RHS to put the operand with the
  // smaller shift amount on RHS.
  //
  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
  // with LSL (shift > 4). For the rest of processors, this is a no-op for
  // performance or correctness.
  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
      RHSImm > 4 && LHS.hasOneUse())
    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);

  return SDValue();
}

static SDValue performAddSubCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  // Try to change sum of two reductions.
  if (SDValue Val = performAddUADDVCombine(N, DAG))
    return Val;
  if (SDValue Val = performAddDotCombine(N, DAG))
    return Val;
  if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
    return Val;
  if (SDValue Val = performNegCSelCombine(N, DAG))
    return Val;
  if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
    return Val;
  if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
    return Val;

  return performAddSubLongCombine(N, DCI, DAG);
}

// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
//   (aarch64_neon_umull (extract_high (v2i64 vec)))
//                       (extract_high (v2i64 (dup128 scalar)))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       SelectionDAG &DAG) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
  SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
  assert(LHS.getValueType().is64BitVector() &&
         RHS.getValueType().is64BitVector() &&
         "unexpected shape for long operation");

  // Either node could be a DUP, but it's not worth doing both of them (you'd
  // just as well use the non-high version) so look for a corresponding extract
  // operation on the other "wing".
  if (isEssentiallyExtractHighSubvector(LHS)) {
    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
    if (!RHS.getNode())
      return SDValue();
  } else if (isEssentiallyExtractHighSubvector(RHS)) {
    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
    if (!LHS.getNode())
      return SDValue();
  } else
    return SDValue();

  if (IID == Intrinsic::not_intrinsic)
    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), LHS, RHS);
}

static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
  unsigned ElemBits = ElemTy.getSizeInBits();

  int64_t ShiftAmount;
  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, ElemBits) ||
        SplatBitSize != ElemBits)
      return SDValue();

    ShiftAmount = SplatValue.getSExtValue();
  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    ShiftAmount = CVN->getSExtValue();
  } else
    return SDValue();

  unsigned Opcode;
  bool IsRightShift;
  switch (IID) {
  default:
    llvm_unreachable("Unknown shift intrinsic");
  case Intrinsic::aarch64_neon_sqshl:
    Opcode = AArch64ISD::SQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_uqshl:
    Opcode = AArch64ISD::UQSHL_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_srshl:
    Opcode = AArch64ISD::SRSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_urshl:
    Opcode = AArch64ISD::URSHR_I;
    IsRightShift = true;
    break;
  case Intrinsic::aarch64_neon_sqshlu:
    Opcode = AArch64ISD::SQSHLU_I;
    IsRightShift = false;
    break;
  case Intrinsic::aarch64_neon_sshl:
  case Intrinsic::aarch64_neon_ushl:
    // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
    // left shift for positive shift amounts. Below, we only replace the current
    // node with VSHL, if this condition is met.
    Opcode = AArch64ISD::VSHL;
    IsRightShift = false;
    break;
  }

  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
    SDLoc dl(N);
    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  return SDValue();
}

// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
  SDValue AndN = N->getOperand(2);
  if (AndN.getOpcode() != ISD::AND)
    return SDValue();

  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
  if (!CMask || CMask->getZExtValue() != Mask)
    return SDValue();

  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}

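// Lower an across-lanes reduction intrinsic to the corresponding AArch64ISD
// reduction node, then extract lane 0 as the scalar result.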
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                           SelectionDAG &DAG) {
  SDLoc dl(N);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
                     DAG.getNode(Opc, dl,
                                 N->getOperand(1).getSimpleValueType(),
                                 N->getOperand(1)),
                     DAG.getConstant(0, dl, MVT::i64));
}

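// Lower the SVE index intrinsic: index(base, step) becomes
// mul(step_vector, splat(step)) + splat(base).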
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op1 = N->getOperand(1);
  SDValue Op2 = N->getOperand(2);
  EVT ScalarTy = Op2.getValueType();
  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    ScalarTy = MVT::i32;

  // Lower index_vector(base, step) to mul(step step_vector(1)) + splat(base).
  SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
  SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
  SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
  SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
  return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
}

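// Lower the predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, any-extending
// i8/i16 scalars to i32 first.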
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  SDValue Scalar = N->getOperand(3);
  EVT ScalarTy = Scalar.getValueType();

  if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
    Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);

  SDValue Passthru = N->getOperand(1);
  SDValue Pred = N->getOperand(2);
  return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
                     Pred, Scalar, Passthru);
}

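// Lower the SVE ext intrinsic by bitcasting the operands to bytes, scaling the
// index accordingly, and bitcasting the result back to the original type.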
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
  SDLoc dl(N);
  LLVMContext &Ctx = *DAG.getContext();
  EVT VT = N->getValueType(0);

  assert(VT.isScalableVector() && "Expected a scalable vector.");

  // Current lowering only supports the SVE-ACLE types.
  if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
    return SDValue();

  unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
  unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
  EVT ByteVT =
      EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));

  // Convert everything to the domain of EXT (i.e bytes).
  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
  SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
                            DAG.getConstant(ElemSize, dl, MVT::i32));

  SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
  return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}

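// Convert an SVE wide-compare intrinsic whose comparator is a splatted
// constant in range into a SETCC_MERGE_ZERO against a splat of that immediate.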
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDValue Comparator = N->getOperand(3);
  if (Comparator.getOpcode() == AArch64ISD::DUP ||
      Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
    unsigned IID = getIntrinsicID(N);
    EVT VT = N->getValueType(0);
    EVT CmpVT = N->getOperand(2).getValueType();
    SDValue Pred = N->getOperand(1);
    SDValue Imm;
    SDLoc DL(N);

    switch (IID) {
    default:
      llvm_unreachable("Called with wrong intrinsic!");
      break;

    // Signed comparisons
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide: {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        int64_t ImmVal = CN->getSExtValue();
        if (ImmVal >= -16 && ImmVal <= 15)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    // Unsigned comparisons
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide: {
      if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
        uint64_t ImmVal = CN->getZExtValue();
        if (ImmVal <= 127)
          Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
        else
          return SDValue();
      }
      break;
    }
    }

    if (!Imm)
      return SDValue();

    SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
                       N->getOperand(2), Splat, DAG.getCondCode(CC));
  }

  return SDValue();
}

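// Emit a PTEST of Op under the predicate Pg and materialise the requested
// condition code as a 0/1 value of type VT via CSEL.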
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
                        AArch64CC::CondCode Cond) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDLoc DL(Op);

  assert(Op.getValueType().isScalableVector() &&
         TLI.isTypeLegal(Op.getValueType()) &&
         "Expected legal scalable vector type!");
  assert(Op.getValueType() == Pg.getValueType() &&
         "Expected same type for PTEST operands");

  // Ensure target specific opcodes are using legal type.
  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  SDValue TVal = DAG.getConstant(1, DL, OutVT);
  SDValue FVal = DAG.getConstant(0, DL, OutVT);

  // Ensure operands have type nxv16i1.
  if (Op.getValueType() != MVT::nxv16i1) {
    if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
        isZeroingInactiveLanes(Op))
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
    else
      Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
  }

  // Set condition code (CC) flags.
  SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);

  // Convert CC to integer based on requested condition.
  // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
  SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
  SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
  return DAG.getZExtOrTrunc(Res, DL, VT);
}

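// Lower an SVE integer reduction intrinsic to its predicated reduction node
// and extract lane 0 of the (packed) result vector.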
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
                                      SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  // NOTE: The integer reduction's result type is not always linked to the
  // operand's element type so we construct it from the intrinsic's result type.
  EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
                                     SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue VecToReduce = N->getOperand(2);

  EVT ReduceVT = VecToReduce.getValueType();
  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                                            SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Pred = N->getOperand(1);
  SDValue InitVal = N->getOperand(2);
  SDValue VecToReduce = N->getOperand(3);
  EVT ReduceVT = VecToReduce.getValueType();

  // Ordered reductions use the first lane of the result vector as the
  // reduction's initial value.
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
                        DAG.getUNDEF(ReduceVT), InitVal, Zero);

  SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);

  // SVE reductions set the whole vector register with the first element
  // containing the reduction result, which we'll now extract.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
                     Zero);
}

static bool isAllInactivePredicate(SDValue N) {
  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
    N = N.getOperand(0);

  return ISD::isConstantSplatVectorAllZeros(N.getNode());
}

static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
  unsigned NumElts = N.getValueType().getVectorMinNumElements();

  // Look through cast.
  while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
    N = N.getOperand(0);
    // When reinterpreting from a type with fewer elements the "new" elements
    // are not active, so bail if they're likely to be used.
    if (N.getValueType().getVectorMinNumElements() < NumElts)
      return false;
  }

  if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
    return true;

  // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
  // or smaller than the implicit element type represented by N.
  // NOTE: A larger element count implies a smaller element type.
  if (N.getOpcode() == AArch64ISD::PTRUE &&
      N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
    return N.getValueType().getVectorMinNumElements() >= NumElts;

  // If we're compiling for a specific vector-length, we can check if the
  // pattern's VL equals that of the scalable vector at runtime.
  if (N.getOpcode() == AArch64ISD::PTRUE) {
    const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
    unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
    unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
    if (MaxSVESize && MinSVESize == MaxSVESize) {
      unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
      unsigned PatNumElts =
          getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
      return PatNumElts == (NumElts * VScale);
    }
  }

  return false;
}

// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
                                       SelectionDAG &DAG, bool UnpredOp = false,
                                       bool SwapOperands = false) {
  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
  assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
  SDValue Pg = N->getOperand(1);
  SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
  SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);

  // ISD way to specify an all active predicate.
  if (isAllActivePredicate(DAG, Pg)) {
    if (UnpredOp)
      return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);

    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
  }

  // FUTURE: SplatVector(true)
  return SDValue();
}

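// Combine or lower NEON and SVE intrinsic nodes into plain ISD or AArch64ISD
// nodes where a direct mapping exists.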
static SDValue performIntrinsicCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const AArch64Subtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IID = getIntrinsicID(N);
  switch (IID) {
  default:
    break;
  case Intrinsic::get_active_lane_mask: {
    SDValue Res = SDValue();
    EVT VT = N->getValueType(0);
    if (VT.isFixedLengthVector()) {
      // We can use the SVE whilelo instruction to lower this intrinsic by
      // creating the appropriate sequence of scalable vector operations and
      // then extracting a fixed-width subvector from the scalable vector.

      SDLoc DL(N);
      SDValue ID =
          DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);

      EVT WhileVT = EVT::getVectorVT(
          *DAG.getContext(), MVT::i1,
          ElementCount::getScalable(VT.getVectorNumElements()));

      // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
      EVT PromVT = getPromotedVTForPredicate(WhileVT);

      // Get the fixed-width equivalent of PromVT for extraction.
      EVT ExtVT =
          EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
                           VT.getVectorElementCount());

      Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
                        N->getOperand(1), N->getOperand(2));
      Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
                        DAG.getConstant(0, DL, MVT::i64));
      Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
    }
    return Res;
  }
  case Intrinsic::aarch64_neon_vcvtfxs2fp:
  case Intrinsic::aarch64_neon_vcvtfxu2fp:
    return tryCombineFixedPointConvert(N, DCI, DAG);
  case Intrinsic::aarch64_neon_saddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
  case Intrinsic::aarch64_neon_uaddv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
  case Intrinsic::aarch64_neon_sminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
  case Intrinsic::aarch64_neon_uminv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
  case Intrinsic::aarch64_neon_smaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
  case Intrinsic::aarch64_neon_umaxv:
    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
  case Intrinsic::aarch64_neon_fmax:
    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmin:
    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fmaxnm:
    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_fminnm:
    return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_smull:
    return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_umull:
    return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_pmull:
    return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_neon_sqdmull:
    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
  case Intrinsic::aarch64_neon_sqshl:
  case Intrinsic::aarch64_neon_uqshl:
  case Intrinsic::aarch64_neon_sqshlu:
  case Intrinsic::aarch64_neon_srshl:
  case Intrinsic::aarch64_neon_urshl:
  case Intrinsic::aarch64_neon_sshl:
  case Intrinsic::aarch64_neon_ushl:
    return tryCombineShiftImm(IID, N, DAG);
  case Intrinsic::aarch64_crc32b:
  case Intrinsic::aarch64_crc32cb:
    return tryCombineCRC32(0xff, N, DAG);
  case Intrinsic::aarch64_crc32h:
  case Intrinsic::aarch64_crc32ch:
    return tryCombineCRC32(0xffff, N, DAG);
  case Intrinsic::aarch64_sve_saddv:
    // There is no i64 version of SADDV because the sign is irrelevant.
    if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
      return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
    else
      return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_uaddv:
    return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_smaxv:
    return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_umaxv:
    return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_sminv:
    return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_uminv:
    return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_orv:
    return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
  case Intrinsic::aarch64_sve_eorv:
    return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
  case Intrinsic::aarch64_sve_andv:
    return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
  case Intrinsic::aarch64_sve_index:
    return LowerSVEIntrinsicIndex(N, DAG);
  case Intrinsic::aarch64_sve_dup:
    return LowerSVEIntrinsicDUP(N, DAG);
  case Intrinsic::aarch64_sve_dup_x:
    return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
                       N->getOperand(1));
  case Intrinsic::aarch64_sve_ext:
    return LowerSVEIntrinsicEXT(N, DAG);
  case Intrinsic::aarch64_sve_mul:
    return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
  case Intrinsic::aarch64_sve_smulh:
    return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
  case Intrinsic::aarch64_sve_umulh:
    return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
  case Intrinsic::aarch64_sve_smin:
    return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
  case Intrinsic::aarch64_sve_umin:
    return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
  case Intrinsic::aarch64_sve_smax:
    return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
  case Intrinsic::aarch64_sve_umax:
    return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
  case Intrinsic::aarch64_sve_lsl:
    return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
  case Intrinsic::aarch64_sve_lsr:
    return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
  case Intrinsic::aarch64_sve_asr:
    return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
  case Intrinsic::aarch64_sve_fadd:
    return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
  case Intrinsic::aarch64_sve_fsub:
    return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
  case Intrinsic::aarch64_sve_fmul:
    return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
  case Intrinsic::aarch64_sve_add:
    return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
  case Intrinsic::aarch64_sve_sub:
    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
  case Intrinsic::aarch64_sve_subr:
    return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
  case Intrinsic::aarch64_sve_and:
    return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
  case Intrinsic::aarch64_sve_bic:
    return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
  case Intrinsic::aarch64_sve_eor:
    return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
  case Intrinsic::aarch64_sve_orr:
    return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
  case Intrinsic::aarch64_sve_sabd:
    return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
  case Intrinsic::aarch64_sve_uabd:
    return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
  case Intrinsic::aarch64_sve_sqadd:
    return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
  case Intrinsic::aarch64_sve_sqsub:
    return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
  case Intrinsic::aarch64_sve_uqadd:
    return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
  case Intrinsic::aarch64_sve_uqsub:
    return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
  case Intrinsic::aarch64_sve_sqadd_x:
    return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_sqsub_x:
    return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_uqadd_x:
    return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_uqsub_x:
    return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2));
  case Intrinsic::aarch64_sve_asrd:
    return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_cmphs:
    if (!N->getOperand(2).getValueType().isFloatingPoint())
      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
    break;
  case Intrinsic::aarch64_sve_cmphi:
    if (!N->getOperand(2).getValueType().isFloatingPoint())
      return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                         N->getValueType(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
    break;
  case Intrinsic::aarch64_sve_fcmpge:
  case Intrinsic::aarch64_sve_cmpge:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETGE));
    break;
  case Intrinsic::aarch64_sve_fcmpgt:
  case Intrinsic::aarch64_sve_cmpgt:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETGT));
    break;
  case Intrinsic::aarch64_sve_fcmpeq:
  case Intrinsic::aarch64_sve_cmpeq:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
    break;
  case Intrinsic::aarch64_sve_fcmpne:
  case Intrinsic::aarch64_sve_cmpne:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETNE));
    break;
  case Intrinsic::aarch64_sve_fcmpuo:
    return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
                       N->getValueType(0), N->getOperand(1), N->getOperand(2),
                       N->getOperand(3), DAG.getCondCode(ISD::SETUO));
    break;
  case Intrinsic::aarch64_sve_fadda:
    return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
  case Intrinsic::aarch64_sve_faddv:
    return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
  case Intrinsic::aarch64_sve_fmaxnmv:
    return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
  case Intrinsic::aarch64_sve_fmaxv:
    return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
  case Intrinsic::aarch64_sve_fminnmv:
    return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
  case Intrinsic::aarch64_sve_fminv:
    return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
  case Intrinsic::aarch64_sve_sel:
    return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2), N->getOperand(3));
  case Intrinsic::aarch64_sve_cmpeq_wide:
    return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpne_wide:
    return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpge_wide:
    return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpgt_wide:
    return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmplt_wide:
    return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmple_wide:
    return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmphs_wide:
    return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
  case Intrinsic::aarch64_sve_cmphi_wide:
    return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmplo_wide:
    return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
  case Intrinsic::aarch64_sve_cmpls_wide:
    return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
  case Intrinsic::aarch64_sve_ptest_any:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::ANY_ACTIVE);
  case Intrinsic::aarch64_sve_ptest_first:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::FIRST_ACTIVE);
  case Intrinsic::aarch64_sve_ptest_last:
    return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
                    AArch64CC::LAST_ACTIVE);
  }
  return SDValue();
}

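// Returns true if extending N is expected to be cheap, e.g. loads can become
// extending loads and splats of zero stay splats of zero.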
static bool isCheapToExtend(const SDValue &N) {
  unsigned OC = N->getOpcode();
  return OC == ISD::LOAD || OC == ISD::MLOAD ||
         ISD::isConstantSplatVectorAllZeros(N.getNode());
}

static SDValue
performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  // If we have (sext (setcc A B)) and A and B are cheap to extend,
  // we can move the sext into the arguments and have the same result. For
  // example, if A and B are both loads, we can make those extending loads and
  // avoid an extra instruction. This pattern appears often in VLS code
  // generation where the inputs to the setcc have a different size to the
  // instruction that wants to use the result of the setcc.
  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         N->getOperand(0)->getOpcode() == ISD::SETCC);
  const SDValue SetCC = N->getOperand(0);

  const SDValue CCOp0 = SetCC.getOperand(0);
  const SDValue CCOp1 = SetCC.getOperand(1);
  if (!CCOp0->getValueType(0).isInteger() ||
      !CCOp1->getValueType(0).isInteger())
    return SDValue();

  ISD::CondCode Code =
      cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();

  ISD::NodeType ExtType =
      isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

  if (isCheapToExtend(SetCC.getOperand(0)) &&
      isCheapToExtend(SetCC.getOperand(1))) {
    const SDValue Ext1 =
        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
    const SDValue Ext2 =
        DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);

    return DAG.getSetCC(
        SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
        cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
  }

  return SDValue();
}

static SDValue performExtendCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
  // we can convert that DUP into another extract_high (of a bigger DUP), which
  // helps the backend to decide that an sabdl2 would be useful, saving a real
  // extract_high operation.
  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
      (N->getOperand(0).getOpcode() == ISD::ABDU ||
       N->getOperand(0).getOpcode() == ISD::ABDS)) {
    SDNode *ABDNode = N->getOperand(0).getNode();
    SDValue NewABD =
        tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
    if (!NewABD.getNode())
      return SDValue();

    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
  }

  if (N->getValueType(0).isFixedLengthVector() &&
      N->getOpcode() == ISD::SIGN_EXTEND &&
      N->getOperand(0)->getOpcode() == ISD::SETCC)
    return performSignExtendSetCCCombine(N, DCI, DAG);

  return SDValue();
}

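// Emit NumVecElts scalar stores of SplatVal starting at St's address; the
// load/store optimizer is expected to merge them into store-pair instructions.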
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
                               SDValue SplatVal, unsigned NumVecElts) {
  assert(!St.isTruncatingStore() && "cannot split truncating vector store");
  Align OrigAlignment = St.getAlign();
  unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

  // Create scalar stores. This is at least as good as the code sequence for a
  // split unaligned store which is a dup.s, ext.b, and two stores.
  // Most of the time the three stores should be replaced by store pair
  // instructions (stp).
  SDLoc DL(&St);
  SDValue BasePtr = St.getBasePtr();
  uint64_t BaseOffset = 0;

  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
  SDValue NewST1 =
      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                   OrigAlignment, St.getMemOperand()->getFlags());

  // As this in ISel, we will not merge this add which may degrade results.
  if (BasePtr->getOpcode() == ISD::ADD &&
      isa<ConstantSDNode>(BasePtr->getOperand(1))) {
    BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
    BasePtr = BasePtr->getOperand(0);
  }

  unsigned Offset = EltOffset;
  while (--NumVecElts) {
    Align Alignment = commonAlignment(OrigAlignment, Offset);
    SDValue OffsetPtr =
        DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                    DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
                          PtrInfo.getWithOffset(Offset), Alignment,
                          St.getMemOperand()->getFlags());
    Offset += EltOffset;
  }
  return NewST1;
}

// Returns an SVE type that ContentTy can be trivially sign or zero extended
// into.
static MVT getSVEContainerType(EVT ContentTy) {
  assert(ContentTy.isSimple() && "No SVE containers for extended types");

  switch (ContentTy.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("No known SVE container for this MVT type");
  case MVT::nxv2i8:
  case MVT::nxv2i16:
  case MVT::nxv2i32:
  case MVT::nxv2i64:
  case MVT::nxv2f32:
  case MVT::nxv2f64:
    return MVT::nxv2i64;
  case MVT::nxv4i8:
  case MVT::nxv4i16:
  case MVT::nxv4i32:
  case MVT::nxv4f32:
    return MVT::nxv4i32;
  case MVT::nxv8i8:
  case MVT::nxv8i16:
  case MVT::nxv8f16:
  case MVT::nxv8bf16:
    return MVT::nxv8i16;
  case MVT::nxv16i8:
    return MVT::nxv16i8;
  }
}

static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  EVT ContainerVT = VT;
  if (ContainerVT.isInteger())
    ContainerVT = getSVEContainerType(ContainerVT);

  SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
  SDValue Ops[] = { N->getOperand(0), // Chain
                    N->getOperand(2), // Pg
                    N->getOperand(3), // Base
                    DAG.getValueType(VT) };

  SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (ContainerVT.isInteger() && (VT != ContainerVT))
    Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({ Load, LoadChain }, DL);
}

static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT PtrTy = N->getOperand(3).getValueType();

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
  SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
                                MINode->getOperand(3), DAG.getUNDEF(PtrTy),
                                MINode->getOperand(2), PassThru,
                                MINode->getMemoryVT(), MINode->getMemOperand(),
                                ISD::UNINDEXED, ISD::NON_EXTLOAD, false);

  if (VT.isFloatingPoint()) {
    SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
    return DAG.getMergeValues(Ops, DL);
  }

  return L;
}

template <unsigned Opcode>
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
  static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
                    Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
                "Unsupported opcode.");
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  EVT LoadVT = VT;
  if (VT.isFloatingPoint())
    LoadVT = VT.changeTypeToInteger();

  SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
  SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (VT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}

static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT HwSrcVt = getSVEContainerType(DataVT);
  SDValue InputVT = DAG.getValueType(DataVT);

  if (DataVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDValue SrcNew;
  if (Data.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);

  SDValue Ops[] = { N->getOperand(0), // Chain
                    SrcNew,
                    N->getOperand(4), // Base
                    N->getOperand(3), // Pg
                    InputVT
                  };

  return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}

static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);

  SDValue Data = N->getOperand(2);
  EVT DataVT = Data.getValueType();
  EVT PtrTy = N->getOperand(4).getValueType();

  if (DataVT.isFloatingPoint())
    Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);

  auto *MINode = cast<MemIntrinsicSDNode>(N);
  return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
                            DAG.getUNDEF(PtrTy), MINode->getOperand(3),
                            MINode->getMemoryVT(), MINode->getMemOperand(),
                            ISD::UNINDEXED, false, false);
}

/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instructions and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
///   stp xzr, xzr, [x0]
///
/// instead of:
///
///   movi v0.2d, #0
///   str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Avoid scalarizing zero splat stores for scalable vectors.
  if (VT.isScalableVector())
    return SDValue();

  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
  // 2, 3 or 4 i32 elements.
  int NumVecElts = VT.getVectorNumElements();
  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
         VT.getVectorElementType().getSizeInBits() == 64) ||
        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
         VT.getVectorElementType().getSizeInBits() == 32)))
    return SDValue();

  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // If the zero constant has more than one use then the vector store could be
  // better since the constant mov will be amortized and stp q instructions
  // should be able to be formed.
  if (!StVal.hasOneUse())
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // If the immediate offset of the address operand is too large for the stp
  // instruction, then bail out.
  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
    if (Offset < -512 || Offset > 504)
      return SDValue();
  }

  for (int I = 0; I < NumVecElts; ++I) {
    SDValue EltVal = StVal.getOperand(I);
    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
      return SDValue();
  }

  // Use a CopyFromReg WZR/XZR here to prevent
  // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
  SDLoc DL(&St);
  unsigned ZeroReg;
  EVT ZeroVT;
  if (VT.getVectorElementType().getSizeInBits() == 32) {
    ZeroReg = AArch64::WZR;
    ZeroVT = MVT::i32;
  } else {
    ZeroReg = AArch64::XZR;
    ZeroVT = MVT::i64;
  }
  SDValue SplatVal =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
  SDValue StVal = St.getValue();
  EVT VT = StVal.getValueType();

  // Don't replace floating point stores, they possibly won't be transformed to
  // stp because of the store pair suppress pass.
  if (VT.isFloatingPoint())
    return SDValue();

  // We can express a splat as store pair(s) for 2 or 4 elements.
  unsigned NumVecElts = VT.getVectorNumElements();
  if (NumVecElts != 4 && NumVecElts != 2)
    return SDValue();

  // If the store is truncating then it's going down to i16 or smaller, which
  // means it can be implemented in a single store anyway.
  if (St.isTruncatingStore())
    return SDValue();

  // Check that this is a splat.
  // Make sure that each of the relevant vector element locations are inserted
  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
  std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
  SDValue SplatVal;
  for (unsigned I = 0; I < NumVecElts; ++I) {
    // Check for insert vector elements.
    if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
      return SDValue();

    // Check that same value is inserted at each vector element.
    if (I == 0)
      SplatVal = StVal.getOperand(1);
    else if (StVal.getOperand(1) != SplatVal)
      return SDValue();

    // Check insert element index.
    ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
    if (!CIndex)
      return SDValue();
    uint64_t IndexVal = CIndex->getZExtValue();
    if (IndexVal >= NumVecElts)
      return SDValue();
    IndexNotInserted.reset(IndexVal);

    StVal = StVal.getOperand(0);
  }
  // Check that all vector element locations were inserted to.
  if (IndexNotInserted.any())
    return SDValue();

  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}

static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG,
                           const AArch64Subtarget *Subtarget) {

  StoreSDNode *S = cast<StoreSDNode>(N);
  if (S->isVolatile() || S->isIndexed())
    return SDValue();

  SDValue StVal = S->getValue();
  EVT VT = StVal.getValueType();

  if (!VT.isFixedLengthVector())
    return SDValue();

  // If we get a splat of zeros, convert this vector store to a store of
  // scalars. They will be merged into store pairs of xzr thereby removing one
  // instruction and one register.
  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
    return ReplacedZeroSplat;

  // FIXME: The logic for deciding if an unaligned store should be split should
  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
  // a call to that function here.

  if (!Subtarget->isMisaligned128StoreSlow())
    return SDValue();

  // Don't split at -Oz.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
  // those up regresses performance on micro-benchmarks and olden/bh.
  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
    return SDValue();

  // Split unaligned 16B stores. They are terrible for performance.
  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
  // extensions can use this to mark that it does not want splitting to happen
  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
  if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
      S->getAlign() <= Align(2))
    return SDValue();

  // If we get a splat of a scalar convert this vector store to a store of
  // scalars. They will be merged into store pairs thereby removing two
  // instructions.
  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
    return ReplacedSplat;

  SDLoc DL(S);

  // Split VT into two.
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  unsigned NumElts = HalfVT.getVectorNumElements();
  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(0, DL, MVT::i64));
  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                   DAG.getConstant(NumElts, DL, MVT::i64));
  SDValue BasePtr = S->getBasePtr();
  SDValue NewST1 =
      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                   S->getAlign(), S->getMemOperand()->getFlags());
  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                  DAG.getConstant(8, DL, MVT::i64));
  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                      S->getPointerInfo(), S->getAlign(),
                      S->getMemOperand()->getFlags());
}

static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");

  // splice(pg, op1, undef) -> op1
  if (N->getOperand(2).isUndef())
    return N->getOperand(1);

  return SDValue();
}

static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
          N->getOpcode() == AArch64ISD::UUNPKLO) &&
         "Unexpected Opcode!");

  // uunpklo/hi undef -> undef
  if (N->getOperand(0).isUndef())
    return DAG.getUNDEF(N->getValueType(0));

  // If this is a masked load followed by an UUNPKLO, fold this into a masked
  // extending load. We can do this even if this is already a masked
  // {z,}extload.
  if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
      N->getOpcode() == AArch64ISD::UUNPKLO) {
    MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
    SDValue Mask = MLD->getMask();
    SDLoc DL(N);

    if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
        SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
        (MLD->getPassThru()->isUndef() ||
         isZerosVector(MLD->getPassThru().getNode()))) {
      unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
      unsigned PgPattern = Mask->getConstantOperandVal(0);
      EVT VT = N->getValueType(0);

      // Ensure we can double the size of the predicate pattern
      unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
      if (NumElts &&
          NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
        Mask =
            getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
        SDValue PassThru = DAG.getConstant(0, DL, VT);
        SDValue NewLoad = DAG.getMaskedLoad(
            VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
            PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
            MLD->getAddressingMode(), ISD::ZEXTLOAD);

        DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));

        return NewLoad;
      }
    }
  }

  return SDValue();
}

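// Combines for AArch64ISD::UZP1 nodes, including folding away redundant
// unpack/uzp pairs and narrowing uzp1 of truncated operands.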
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT ResVT = N->getValueType(0);

  // uzp1(x, undef) -> concat(truncate(x), undef)
  if (Op1.getOpcode() == ISD::UNDEF) {
    EVT BCVT = MVT::Other, HalfVT = MVT::Other;
    switch (ResVT.getSimpleVT().SimpleTy) {
    default:
      break;
    case MVT::v16i8:
      BCVT = MVT::v8i16;
      HalfVT = MVT::v8i8;
      break;
    case MVT::v8i16:
      BCVT = MVT::v4i32;
      HalfVT = MVT::v4i16;
      break;
    case MVT::v4i32:
      BCVT = MVT::v2i64;
      HalfVT = MVT::v2i32;
      break;
    }
    if (BCVT != MVT::Other) {
      SDValue BC = DAG.getBitcast(BCVT, Op0);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
                         DAG.getUNDEF(HalfVT));
    }
  }

  // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
  if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
    if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue X = Op0.getOperand(0).getOperand(0);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
    }
  }

  // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
  if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
    if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
      SDValue Z = Op1.getOperand(0).getOperand(1);
      return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
    }
  }

  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
  // Only implemented on little-endian subtargets.
  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();

  // This optimization only works on little endian.
  if (!IsLittleEndian)
    return SDValue();

  if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
    return SDValue();

  auto getSourceOp = [](SDValue Operand) -> SDValue {
    const unsigned Opcode = Operand.getOpcode();
    if (Opcode == ISD::TRUNCATE)
      return Operand->getOperand(0);
    if (Opcode == ISD::BITCAST &&
        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
      return Operand->getOperand(0)->getOperand(0);
    return SDValue();
  };

  SDValue SourceOp0 = getSourceOp(Op0);
  SDValue SourceOp1 = getSourceOp(Op1);

  if (!SourceOp0 || !SourceOp1)
    return SDValue();

  if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
      !SourceOp0.getValueType().isSimple())
    return SDValue();

  EVT ResultTy;

  switch (SourceOp0.getSimpleValueType().SimpleTy) {
  case MVT::v2i64:
    ResultTy = MVT::v4i32;
    break;
  case MVT::v4i32:
    ResultTy = MVT::v8i16;
    break;
  case MVT::v8i16:
    ResultTy = MVT::v16i8;
    break;
  default:
    return SDValue();
  }

  SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
  SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
  SDValue UzpResult =
      DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);

  EVT BitcastResultTy;

  switch (ResVT.getSimpleVT().SimpleTy) {
  case MVT::v2i32:
    BitcastResultTy = MVT::v2i64;
    break;
  case MVT::v4i16:
    BitcastResultTy = MVT::v4i32;
    break;
  case MVT::v8i8:
    BitcastResultTy = MVT::v8i16;
    break;
  default:
    llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
  }

  return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
                     DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
}

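// Fold sign/zero extensions of the vector offset operand into SVE gather-load
// (GLD1*) nodes where the addressing mode supports it.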
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
  unsigned Opc = N->getOpcode();

  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
         "Invalid opcode.");

  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;

  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Pg = N->getOperand(1);
  SDValue Base = N->getOperand(2);
  SDValue Offset = N->getOperand(3);
  SDValue Ty = N->getOperand(4);

  EVT ResVT = N->getValueType(0);

  const auto OffsetOpc = Offset.getOpcode();
  const bool OffsetIsZExt =
      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
  const bool OffsetIsSExt =
      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;

  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
    SDValue ExtPg = Offset.getOperand(0);
    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();

    // If the predicate for the sign- or zero-extended offset is the
    // same as the predicate used for this load and the sign-/zero-extension
    // was from a 32-bits...
    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
      SDValue UnextendedOffset = Offset.getOperand(1);

      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
      if (Signed)
        NewOpc = getSignExtendedGatherOpcode(NewOpc);

      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
                         {Chain, Pg, Base, UnextendedOffset, Ty});
    }
  }

  return SDValue();
}

/// Optimize a vector shift instruction and its operand if shifted out
/// bits are not used.
static SDValue performVectorShiftCombine(SDNode *N,
                                         const AArch64TargetLowering &TLI,
                                         TargetLowering::DAGCombinerInfo &DCI) {
  assert(N->getOpcode() == AArch64ISD::VASHR ||
         N->getOpcode() == AArch64ISD::VLSHR);

  SDValue Op = N->getOperand(0);
  unsigned OpScalarSize = Op.getScalarValueSizeInBits();

  unsigned ShiftImm = N->getConstantOperandVal(1);
  assert(OpScalarSize > ShiftImm && "Invalid shift imm");

  APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
  APInt DemandedMask = ~ShiftedOutBits;

  if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
    return SDValue(N, 0);

  return SDValue();
}

performSunpkloCombine(SDNode
*N
, SelectionDAG
&DAG
) {
18336 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
18337 // This transform works in partnership with performSetCCPunpkCombine to
18338 // remove unnecessary transfer of predicates into standard registers and back
18339 if (N
->getOperand(0).getOpcode() == ISD::SIGN_EXTEND
&&
18340 N
->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
18342 SDValue CC
= N
->getOperand(0)->getOperand(0);
18343 auto VT
= CC
->getValueType(0).getHalfNumVectorElementsVT(*DAG
.getContext());
18344 SDValue Unpk
= DAG
.getNode(ISD::EXTRACT_SUBVECTOR
, SDLoc(N
), VT
, CC
,
18345 DAG
.getVectorIdxConstant(0, SDLoc(N
)));
18346 return DAG
.getNode(ISD::SIGN_EXTEND
, SDLoc(N
), N
->getValueType(0), Unpk
);
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     bool IsLaneOp) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!VT.is128BitVector() && !VT.is64BitVector())
    return SDValue();

  unsigned LoadIdx = IsLaneOp ? 1 : 0;
  SDNode *LD = N->getOperand(LoadIdx).getNode();
  // If it is not LOAD, can not do such combine.
  if (LD->getOpcode() != ISD::LOAD)
    return SDValue();

  // The vector lane must be a constant in the LD1LANE opcode.
  SDValue Lane;
  if (IsLaneOp) {
    Lane = N->getOperand(2);
    auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
    if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
      return SDValue();
  }

  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
  EVT MemVT = LoadSDN->getMemoryVT();
  // Check if memory operand is the same type as the vector element.
  if (MemVT != VT.getVectorElementType())
    return SDValue();

  // Check if there are other uses. If so, do not combine as it will introduce
  // an extra load.
  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
       ++UI) {
    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
      continue;
    if (*UI != N)
      return SDValue();
  }

  SDValue Addr = LD->getOperand(1);
  SDValue Vector = N->getOperand(0);
  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
       Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD
        || UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint32_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }

    // To avoid cycle construction make sure that neither the load nor the add
    // are predecessors to each other or the Vector.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(User);
    Worklist.push_back(LD);
    Worklist.push_back(Vector.getNode());
    if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(LD->getOperand(0));  // Chain
    if (IsLaneOp) {
      Ops.push_back(Vector);           // The vector to be inserted
      Ops.push_back(Lane);             // The lane to be inserted in the vector
    }
    Ops.push_back(Addr);
    Ops.push_back(Inc);

    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
    SDVTList SDTys = DAG.getVTList(Tys);
    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                           MemVT,
                                           LoadSDN->getMemOperand());

    // Update the uses.
    SDValue NewResults[] = {
        SDValue(LD, 0),            // The result of load
        SDValue(UpdN.getNode(), 2) // Chain
    };
    DCI.CombineTo(LD, NewResults);
    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register

    break;
  }
  return SDValue();
}

/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
  KnownBits Known;
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
    return true;
  }
  return false;
}

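// Fold store(trunc_to_mem_ty(ext(x))) -> store(x) when the store's memory type
// matches the type of x, making the extend redundant.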
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
  assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
         "Expected STORE dag node in input!");

  if (auto Store = dyn_cast<StoreSDNode>(N)) {
    if (!Store->isTruncatingStore() || Store->isIndexed())
      return SDValue();
    SDValue Ext = Store->getValue();
    auto ExtOpCode = Ext.getOpcode();
    if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
        ExtOpCode != ISD::ANY_EXTEND)
      return SDValue();
    SDValue Orig = Ext->getOperand(0);
    if (Store->getMemoryVT() != Orig.getValueType())
      return SDValue();
    return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
                        Store->getBasePtr(), Store->getMemOperand());
  }

  return SDValue();
}

// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit load
// instructions can be selected.
static SDValue performLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  if (Subtarget->supportsAddressTopByteIgnored())
    performTBISimplification(N->getOperand(1), DCI, DAG);

  LoadSDNode *LD = cast<LoadSDNode>(N);
  EVT MemVT = LD->getMemoryVT();
  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
    return SDValue(N, 0);

  if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
      MemVT.getSizeInBits() % 256 == 0 ||
      256 % MemVT.getScalarSizeInBits() != 0)
    return SDValue(N, 0);

  SDLoc DL(LD);
  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  SDNodeFlags Flags = LD->getFlags();
  SmallVector<SDValue, 4> LoadOps;
  SmallVector<SDValue, 4> LoadOpsChain;
  // Replace any non temporal load over 256-bit with a series of 256 bit loads
  // and a scalar/vector load less than 256. This way we can utilize 256-bit
  // loads and reduce the amount of load instructions generated.
  MVT NewVT =
      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
                       256 / MemVT.getVectorElementType().getSizeInBits());
  unsigned Num256Loads = MemVT.getSizeInBits() / 256;
  // Create all 256-bit loads starting from offset 0 and up to Num256Loads-1*32.
  for (unsigned I = 0; I < Num256Loads; I++) {
    unsigned PtrOffset = I * 32;
    SDValue NewPtr = DAG.getMemBasePlusOffset(
        BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
    Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
    SDValue NewLoad = DAG.getLoad(
        NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
        NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
    LoadOps.push_back(NewLoad);
    LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
  }

  // Process remaining bits of the load operation.
  // This is done by creating an UNDEF vector to match the size of the
  // 256-bit loads and inserting the remaining load to it. We extract the
  // original load type at the end using EXTRACT_SUBVECTOR instruction.
  unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
  unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
  MVT RemainingVT = MVT::getVectorVT(
      MemVT.getVectorElementType().getSimpleVT(),
      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
  SDValue NewPtr =
      DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
  Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
  SDValue RemainingLoad =
      DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
                  LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
  SDValue UndefVector = DAG.getUNDEF(NewVT);
  SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
  SDValue ExtendedRemainingLoad =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
                  {UndefVector, RemainingLoad, InsertIdx});
  LoadOps.push_back(ExtendedRemainingLoad);
  LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
  EVT ConcatVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       LoadOps.size() * NewVT.getVectorNumElements());
  SDValue ConcatVectors =
      DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
  // Extract the original vector type size.
  SDValue ExtractSubVector =
      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
                  {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
  SDValue TokenFactor =
      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
  return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
}
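
// Try a number of store combines: folding an FP_ROUND into a truncating store,
// splitting wide stores, TBI address simplification, and dropping extends that
// feed truncating stores (see the helpers called below).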
static SDValue performSTORECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();

  // If this is an FP_ROUND followed by a store, fold this into a truncating
  // store. We can do this even if this is already a truncstore.
  // We purposefully don't care about legality of the nodes here as we know
  // they can be split down into something legal.
  if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
      Value.getNode()->hasOneUse() && ST->isUnindexed() &&
      Subtarget->useSVEForFixedLengthVectors() &&
      Value.getValueType().isFixedLengthVector() &&
      Value.getValueType().getFixedSizeInBits() >=
          Subtarget->getMinSVEVectorSizeInBits())
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                             ST->getMemoryVT(), ST->getMemOperand());

  if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
    return Split;

  if (Subtarget->supportsAddressTopByteIgnored() &&
      performTBISimplification(N->getOperand(2), DCI, DAG))
    return SDValue(N, 0);

  if (SDValue Store = foldTruncStoreOfExt(DAG, N))
    return Store;

  return SDValue();
}
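
// Combine a masked store of a UZP1 (a truncation) into a masked truncating
// store, widening the predicate pattern where the target vector length allows.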
static SDValue performMSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
  SDValue Value = MST->getValue();
  SDValue Mask = MST->getMask();
  SDLoc DL(N);

  // If this is a UZP1 followed by a masked store, fold this into a masked
  // truncating store. We can do this even if this is already a masked
  // truncating store.
  if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
      MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
      Value.getValueType().isInteger()) {
    Value = Value.getOperand(0);
    if (Value.getOpcode() == ISD::BITCAST) {
      EVT HalfVT =
          Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
      EVT InVT = Value.getOperand(0).getValueType();

      if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
        unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
        unsigned PgPattern = Mask->getConstantOperandVal(0);

        // Ensure we can double the size of the predicate pattern.
        unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
        if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
                           MinSVESize) {
          Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
                          PgPattern);
          return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
                                    MST->getBasePtr(), MST->getOffset(), Mask,
                                    MST->getMemoryVT(), MST->getMemOperand(),
                                    MST->getAddressingMode(),
                                    /*IsTruncating=*/true);
        }
      }
    }
  }

  return SDValue();
}
/// \return true if part of the index was folded into the Base.
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
                              SDLoc DL, SelectionDAG &DAG) {
  // This function assumes a vector of i64 indices.
  EVT IndexVT = Index.getValueType();
  if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
    return false;

  // Simplify:
  //   Index = X + splat(Offset)
  // To:
  //   BasePtr = Ptr + Offset * scale
  //   Index = X
  if (Index.getOpcode() == ISD::ADD) {
    if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
      Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
      BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
      Index = Index.getOperand(0);
      return true;
    }
  }

  // Simplify:
  //   Index = (X + splat(Offset)) << splat(Shift)
  // To:
  //   BasePtr = Ptr + (Offset << Shift) * scale
  //   Index = X << splat(Shift)
  if (Index.getOpcode() == ISD::SHL &&
      Index.getOperand(0).getOpcode() == ISD::ADD) {
    SDValue Add = Index.getOperand(0);
    SDValue ShiftOp = Index.getOperand(1);
    SDValue OffsetOp = Add.getOperand(1);
    if (auto Shift = DAG.getSplatValue(ShiftOp))
      if (auto Offset = DAG.getSplatValue(OffsetOp)) {
        Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
        Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
        BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
        Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
                            Add.getOperand(0), ShiftOp);
        return true;
      }
  }

  return false;
}
// Analyse the specified address returning true if a more optimal addressing
// mode is available. When returning true all parameters are updated to reflect
// their recommended values.
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
                                     SDValue &BasePtr, SDValue &Index,
                                     SelectionDAG &DAG) {
  // Try to iteratively fold parts of the index into the base pointer to
  // simplify the index as much as possible.
  bool Changed = false;
  while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
    Changed = true;

  // Only consider element types that are pointer sized as smaller types can
  // be easily promoted.
  EVT IndexVT = Index.getValueType();
  if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
    return Changed;

  // Can indices be trivially shrunk?
  EVT DataVT = N->getOperand(1).getValueType();
  // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
  // will later be re-extended to 64 bits in legalization.
  if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
    return Changed;
  if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
    EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
    Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
    return true;
  }

  // Match:
  //   Index = step(const)
  int64_t Stride = 0;
  if (Index.getOpcode() == ISD::STEP_VECTOR) {
    Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
  }
  // Match:
  //   Index = step(const) << shift(const)
  else if (Index.getOpcode() == ISD::SHL &&
           Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
    SDValue RHS = Index.getOperand(1);
    if (auto *Shift =
            dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
      int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
      Stride = Step << Shift->getZExtValue();
    }
  }

  // Return early because no supported pattern is found.
  if (Stride == 0)
    return Changed;

  if (Stride < std::numeric_limits<int32_t>::min() ||
      Stride > std::numeric_limits<int32_t>::max())
    return Changed;

  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  unsigned MaxVScale =
      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
  int64_t LastElementOffset =
      IndexVT.getVectorMinNumElements() * Stride * MaxVScale;

  if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
      LastElementOffset > std::numeric_limits<int32_t>::max())
    return Changed;

  EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
  // Stride does not scale explicitly by 'Scale', because it happens in
  // the gather/scatter addressing mode.
  Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
  return true;
}
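
// Rewrite masked gathers and scatters to use the more optimal base pointer and
// index computed by findMoreOptimalIndexType above.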
static SDValue performMaskedGatherScatterCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
  MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
  assert(MGS && "Can only combine gather load or scatter store nodes");

  if (!DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(MGS);
  SDValue Chain = MGS->getChain();
  SDValue Scale = MGS->getScale();
  SDValue Index = MGS->getIndex();
  SDValue Mask = MGS->getMask();
  SDValue BasePtr = MGS->getBasePtr();
  ISD::MemIndexType IndexType = MGS->getIndexType();

  if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
    return SDValue();

  // Here we catch such cases early and change MGATHER's IndexType to allow
  // the use of an Index that's more legalisation friendly.
  if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
    SDValue PassThru = MGT->getPassThru();
    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedGather(
        DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
        Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
  }
  auto *MSC = cast<MaskedScatterSDNode>(MGS);
  SDValue Data = MSC->getValue();
  SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
  return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
                              Ops, MSC->getMemOperand(), IndexType,
                              MSC->isTruncatingStore());
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          SelectionDAG &DAG) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  unsigned AddrOpIdx = N->getNumOperands() - 1;
  SDValue Addr = N->getOperand(AddrOpIdx);

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store. Otherwise, folding
    // it would create a cycle.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool IsStore = false;
    bool IsLaneOp = false;
    bool IsDupOp = false;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default: llvm_unreachable("unexpected intrinsic for Neon base update");
    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
      NumVecs = 2; break;
    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
      NumVecs = 3; break;
    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
      NumVecs = 4; break;
    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
      NumVecs = 2; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
      NumVecs = 3; IsStore = true; break;
    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
      NumVecs = 4; IsStore = true; break;
    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
      NumVecs = 2; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
      NumVecs = 3; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
      NumVecs = 4; IsDupOp = true; break;
    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
      NumVecs = 2; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
      NumVecs = 3; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
      NumVecs = 4; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
    }

    // Find the size of memory referenced by the load/store.
    EVT VecTy;
    if (IsStore)
      VecTy = N->getOperand(2).getValueType();
    else
      VecTy = N->getValueType(0);

    // If the increment is a constant, it must match the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
      uint32_t IncVal = CInc->getZExtValue();
      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
      if (IsLaneOp || IsDupOp)
        NumBytes /= VecTy.getVectorNumElements();
      if (IncVal != NumBytes)
        continue;
      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
    }
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // Incoming chain
    // Load lane and store have vector list as input.
    if (IsLaneOp || IsStore)
      for (unsigned i = 2; i < AddrOpIdx; ++i)
        Ops.push_back(N->getOperand(i));
    Ops.push_back(Addr); // Base register
    Ops.push_back(Inc);

    // Return Types.
    EVT Tys[6];
    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i64;  // Type of write back register
    Tys[n] = MVT::Other;  // Type of the chain
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
                                           MemInt->getMemoryVT(),
                                           MemInt->getMemOperand());

    // Update the uses.
    std::vector<SDValue> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i) {
      NewResults.push_back(SDValue(UpdN.getNode(), i));
    }
    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
  ExtType = ISD::NON_EXTLOAD;
  switch(V.getNode()->getOpcode()) {
  default:
    return false;
  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
      ExtType = LoadNode->getExtensionType();
      return true;
    }
    return false;
  }
  case ISD::AssertSext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::SEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::AssertZext: {
    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
    if ((TypeNode->getVT() == MVT::i8 && width == 8)
       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
      ExtType = ISD::ZEXTLOAD;
      return true;
    }
    return false;
  }
  case ISD::Constant:
  case ISD::TargetConstant: {
    return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
           1LL << (width - 1);
  }
  }
}
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially, given a DAG of the form
//
//   (CC-test (CMP (AND (ADD Input, AddConstant), 0xff), CompConstant))
//
// i.e. the masked sum of Input and AddConstant compared against CompConstant
// under condition CC, the AND node may be safely removed for some combinations
// of inputs. In particular we need to take into account the extension type of
// the Input, the exact values of AddConstant, CompConstant, and CC, along with
// the nominal width of the input (this can work for any width inputs; the mask
// above is specific to 8 bits).
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns of the current extension type of Input
// (w0), e.g. by comparing the CC results produced with the mask applied
// ("and w10, w8, #0x0f" then "cset w9, AArch64CC") against those produced
// without it ("cset w11, AArch64CC").
//
// Since the above shows when the outputs are equivalent it defines when it is
// safe to remove the AND. Unfortunately it only runs on AArch64 and would be
// expensive to run during compiles. The equations below were written in a test
// harness that confirmed they gave outputs equivalent to the above for all
// inputs, so they can be used to determine if the removal is legal.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed,
// factored out of the DAG recognition as the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                 ISD::LoadExtType ExtType, int AddConstant,
                                 int CompConstant) {
  // By being careful about our equations and only writing them in terms of
  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
  // make them generally applicable to all bit widths.
  int MaxUInt = (1 << width);

  // For the purposes of these comparisons sign extending the type is
  // equivalent to zero extending the add and displacing it by half the integer
  // width. Provided we are careful and make sure our equations are valid over
  // the whole range we can just adjust the input and avoid writing equations
  // for sign extended inputs.
  if (ExtType == ISD::SEXTLOAD)
    AddConstant -= (1 << (width-1));

  switch(CC) {
  case AArch64CC::LE:
  case AArch64CC::GT:
    if ((AddConstant == 0) ||
        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
        (AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::LT:
  case AArch64CC::GE:
    if ((AddConstant == 0) ||
        (AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::HI:
  case AArch64CC::LS:
    if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::PL:
  case AArch64CC::MI:
    if ((AddConstant == 0) ||
        (AddConstant > 0 && CompConstant <= 0) ||
        (AddConstant < 0 && CompConstant <= AddConstant))
      return true;
    break;
  case AArch64CC::LO:
  case AArch64CC::HS:
    if ((AddConstant >= 0 && CompConstant <= 0) ||
        (AddConstant <= 0 && CompConstant >= 0 &&
         CompConstant <= AddConstant + MaxUInt))
      return true;
    break;
  case AArch64CC::EQ:
  case AArch64CC::NE:
    if ((AddConstant > 0 && CompConstant < 0) ||
        (AddConstant < 0 && CompConstant >= 0 &&
         CompConstant < AddConstant + MaxUInt) ||
        (AddConstant >= 0 && CompConstant >= 0 &&
         CompConstant >= AddConstant) ||
        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
      return true;
    break;
  case AArch64CC::VS:
  case AArch64CC::VC:
  case AArch64CC::AL:
  case AArch64CC::NV:
    return true;
  case AArch64CC::Invalid:
    break;
  }

  return false;
}
performCONDCombine(SDNode
*N
,
19138 TargetLowering::DAGCombinerInfo
&DCI
,
19139 SelectionDAG
&DAG
, unsigned CCIndex
,
19140 unsigned CmpIndex
) {
19141 unsigned CC
= cast
<ConstantSDNode
>(N
->getOperand(CCIndex
))->getSExtValue();
19142 SDNode
*SubsNode
= N
->getOperand(CmpIndex
).getNode();
19143 unsigned CondOpcode
= SubsNode
->getOpcode();
19145 if (CondOpcode
!= AArch64ISD::SUBS
|| SubsNode
->hasAnyUseOfValue(0))
19148 // There is a SUBS feeding this condition. Is it fed by a mask we can
19151 SDNode
*AndNode
= SubsNode
->getOperand(0).getNode();
19152 unsigned MaskBits
= 0;
19154 if (AndNode
->getOpcode() != ISD::AND
)
19157 if (ConstantSDNode
*CN
= dyn_cast
<ConstantSDNode
>(AndNode
->getOperand(1))) {
19158 uint32_t CNV
= CN
->getZExtValue();
19161 else if (CNV
== 65535)
19168 SDValue AddValue
= AndNode
->getOperand(0);
19170 if (AddValue
.getOpcode() != ISD::ADD
)
19173 // The basic dag structure is correct, grab the inputs and validate them.
19175 SDValue AddInputValue1
= AddValue
.getNode()->getOperand(0);
19176 SDValue AddInputValue2
= AddValue
.getNode()->getOperand(1);
19177 SDValue SubsInputValue
= SubsNode
->getOperand(1);
19179 // The mask is present and the provenance of all the values is a smaller type,
19180 // lets see if the mask is superfluous.
19182 if (!isa
<ConstantSDNode
>(AddInputValue2
.getNode()) ||
19183 !isa
<ConstantSDNode
>(SubsInputValue
.getNode()))
19186 ISD::LoadExtType ExtType
;
19188 if (!checkValueWidth(SubsInputValue
, MaskBits
, ExtType
) ||
19189 !checkValueWidth(AddInputValue2
, MaskBits
, ExtType
) ||
19190 !checkValueWidth(AddInputValue1
, MaskBits
, ExtType
) )
19193 if(!isEquivalentMaskless(CC
, MaskBits
, ExtType
,
19194 cast
<ConstantSDNode
>(AddInputValue2
.getNode())->getSExtValue(),
19195 cast
<ConstantSDNode
>(SubsInputValue
.getNode())->getSExtValue()))
19198 // The AND is not necessary, remove it.
19200 SDVTList VTs
= DAG
.getVTList(SubsNode
->getValueType(0),
19201 SubsNode
->getValueType(1));
19202 SDValue Ops
[] = { AddValue
, SubsNode
->getOperand(1) };
19204 SDValue NewValue
= DAG
.getNode(CondOpcode
, SDLoc(SubsNode
), VTs
, Ops
);
19205 DAG
.ReplaceAllUsesWith(SubsNode
, NewValue
.getNode());
19207 return SDValue(N
, 0);
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
  // will not be produced, as they are conditional branch instructions that do
  // not set flags.
  if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return SDValue();

  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
    N = NV.getNode();
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of the
  // value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  if (isNullConstant(LHS))
    std::swap(LHS, RHS);

  if (!isNullConstant(RHS))
    return SDValue();

  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction.
  SDValue BR;
  if (CC == AArch64CC::EQ)
    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  else
    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
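
// Fold a CSEL that guards a CTTZ against a zero input, i.e.
//   (csel 0, cttz(X), eq(X, 0)) and the equivalent NE form,
// into an AND of the CTTZ result with (bitwidth - 1), as checked for below.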
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
  unsigned CC = N->getConstantOperandVal(2);
  SDValue SUBS = N->getOperand(3);
  SDValue Zero, CTTZ;

  if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(0);
    CTTZ = N->getOperand(1);
  } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
    Zero = N->getOperand(1);
    CTTZ = N->getOperand(0);
  } else
    return SDValue();

  if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
      (CTTZ.getOpcode() == ISD::TRUNCATE &&
       CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
    return SDValue();

  assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
         "Illegal type in CTTZ folding");

  if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
    return SDValue();

  SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
                  ? CTTZ.getOperand(0).getOperand(0)
                  : CTTZ.getOperand(0);
  if (X != SUBS.getOperand(0))
    return SDValue();

  unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
                          ? CTTZ.getOperand(0).getValueSizeInBits()
                          : CTTZ.getValueSizeInBits();
  SDValue BitWidthMinusOne =
      DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
  return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
                     BitWidthMinusOne);
}
// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
// Where x and y are constants
//
// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
// Where x and y are constants
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
  SDValue L = Op->getOperand(0);
  SDValue R = Op->getOperand(1);
  AArch64CC::CondCode OpCC =
      static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));

  SDValue OpCmp = Op->getOperand(3);
  if (!isCMP(OpCmp))
    return SDValue();

  SDValue CmpLHS = OpCmp.getOperand(0);
  SDValue CmpRHS = OpCmp.getOperand(1);

  if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
    std::swap(CmpLHS, CmpRHS);
  else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
    return SDValue();

  SDValue X = CmpLHS->getOperand(0);
  SDValue Y = CmpLHS->getOperand(1);
  if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y)) {
    return SDValue();
  }

  AArch64CC::CondCode CC =
      static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
  SDValue Cond = CmpLHS->getOperand(3);

  if (CmpRHS == Y)
    CC = AArch64CC::getInvertedCondCode(CC);
  else if (CmpRHS != X)
    return SDValue();

  if (OpCC == AArch64CC::NE)
    CC = AArch64CC::getInvertedCondCode(CC);
  else if (OpCC != AArch64CC::EQ)
    return SDValue();

  SDLoc DL(Op);
  EVT VT = Op->getValueType(0);

  SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
  return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
}
// Optimize CSEL instructions
static SDValue performCSELCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG) {
  // CSEL x, x, cc -> x
  if (N->getOperand(0) == N->getOperand(1))
    return N->getOperand(0);

  if (SDValue R = foldCSELOfCSEL(N, DAG))
    return R;

  // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
  // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
  if (SDValue Folded = foldCSELofCTTZ(N, DAG))
    return Folded;

  return performCONDCombine(N, DCI, DAG, 2, 3);
}
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
// SET_CC result when lowering the select.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
  EVT Op0MVT = Op->getOperand(0).getValueType();
  if (!Op0MVT.isVector() || Op->use_empty())
    return SDValue();

  // Make sure that all uses of Op are VSELECTs with result matching types where
  // the result type has a larger element type than the SetCC operand.
  SDNode *FirstUse = *Op->use_begin();
  if (FirstUse->getOpcode() != ISD::VSELECT)
    return SDValue();
  EVT UseMVT = FirstUse->getValueType(0);
  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
    return SDValue();
  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
      }))
    return SDValue();

  APInt V;
  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
    return SDValue();

  SDLoc DL(Op);
  SDValue Op0ExtV;
  SDValue Op1ExtV;
  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
  // Check if the first operand of the SET_CC is already extended. If it is,
  // split the SET_CC and re-use the extended version of the operand.
  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
                                        Op->getOperand(0));
  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0SExt, 0);
    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
    Op0ExtV = SDValue(Op0ZExt, 0);
    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
  } else
    return SDValue();

  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
                     Op0ExtV, Op1ExtV, Op->getOperand(2));
}
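
// Combine SETCC nodes: re-use extended setcc operands, invert a CSEL that is
// compared against one, turn (srl x, imm) != 0 into a masked test, and reduce
// a setcc of a bitcast from a vector of i1 to a VECREDUCE_OR.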
static SDValue performSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
    return V;

  // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
  if (Cond == ISD::SETNE && isOneConstant(RHS) &&
      LHS->getOpcode() == AArch64ISD::CSEL &&
      isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
      LHS->hasOneUse()) {
    // Invert CSEL's condition.
    auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
    auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
    auto NewCond = getInvertedCondCode(OldCond);

    // csel 0, 1, !cond, X
    SDValue CSEL =
        DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
                    LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
                    LHS.getOperand(3));
    return DAG.getZExtOrTrunc(CSEL, DL, VT);
  }

  // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
  if (Cond == ISD::SETNE && isNullConstant(RHS) &&
      LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
      LHS->hasOneUse()) {
    EVT TstVT = LHS->getValueType(0);
    if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
      // This pattern will get better optimisation in emitComparison.
      uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
      SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
                                DAG.getConstant(TstImm, DL, TstVT));
      return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
    }
  }

  // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
  //   ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
      LHS->getOpcode() == ISD::BITCAST) {
    EVT ToVT = LHS->getValueType(0);
    EVT FromVT = LHS->getOperand(0).getValueType();
    if (FromVT.isFixedLengthVector() &&
        FromVT.getVectorElementType() == MVT::i1) {
      LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
      LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
      return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
    }
  }

  return SDValue();
}
// Replace a flag-setting operator (e.g. ANDS) with the generic version
// (e.g. AND) if the flag is unused.
static SDValue performFlagSettingCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         unsigned GenericOpcode) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // If the flag result isn't used, convert back to a generic opcode.
  if (!N->hasAnyUseOfValue(1)) {
    SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
    return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
                                  DL);
  }

  // Combine identical generic nodes into this node, re-using the result.
  if (SDNode *Generic = DCI.DAG.getNodeIfExists(
          GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
    DCI.CombineTo(Generic, SDValue(N, 0));

  return SDValue();
}
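
// Fold a SETCC_MERGE_ZERO of a sign-extended EXTRACT_SUBVECTOR of another
// SETCC_MERGE_ZERO back to the extract itself when the predicates show the
// extend/compare round trip is redundant (see the pattern described below).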
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
  // setcc_merge_zero pred
  //   (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
  //   => extract_subvector (inner setcc_merge_zero)
  SDValue Pred = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();

  if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
      LHS->getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  SDValue Extract = LHS->getOperand(0);
  if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      Extract->getValueType(0) != N->getValueType(0) ||
      Extract->getConstantOperandVal(1) != 0)
    return SDValue();

  SDValue InnerSetCC = Extract->getOperand(0);
  if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
    return SDValue();

  // By this point we've effectively got
  // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
  // lanes are already zero then the trunc(sext()) sequence is redundant and we
  // can operate on A directly.
  SDValue InnerPred = InnerSetCC.getOperand(0);
  if (Pred.getOpcode() == AArch64ISD::PTRUE &&
      InnerPred.getOpcode() == AArch64ISD::PTRUE &&
      Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
      Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
      Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
    return Extract;

  return SDValue();
}
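
// Simplify SETCC_MERGE_ZERO nodes, e.g. by removing redundant sign-extend /
// compare-with-zero round trips on predicates (patterns documented inline).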
static SDValue
performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
         "Unexpected opcode!");

  SelectionDAG &DAG = DCI.DAG;
  SDValue Pred = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();

  if (SDValue V = performSetCCPunpkCombine(N, DAG))
    return V;

  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
      LHS->getOpcode() == ISD::SIGN_EXTEND &&
      LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
    //    setcc_merge_zero(
    //       pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
    // => setcc_merge_zero(pred, ...)
    if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
        LHS->getOperand(0)->getOperand(0) == Pred)
      return LHS->getOperand(0);

    //    setcc_merge_zero(
    //        all_active, extend(nxvNi1 ...), != splat(0))
    // -> nxvNi1 ...
    if (isAllActivePredicate(DAG, Pred))
      return LHS->getOperand(0);

    //    setcc_merge_zero(
    //        pred, extend(nxvNi1 ...), != splat(0))
    // -> nxvNi1 and(pred, ...)
    if (DCI.isAfterLegalizeDAG())
      // Do this after legalization to allow more folds on setcc_merge_zero
      // to be recognized.
      return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                         LHS->getOperand(0), Pred);
  }

  return SDValue();
}
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
                                 SelectionDAG &DAG) {
  if (!Op->hasOneUse())
    return Op;

  // We don't handle undef/constant-fold cases below, as they should have
  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
  // etc.)

  // (tbz (trunc x), b) -> (tbz x, b)
  // This case is just here to enable more of the below cases to be caught.
  if (Op->getOpcode() == ISD::TRUNCATE &&
      Bit < Op->getValueType(0).getSizeInBits()) {
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }

  // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
  if (Op->getOpcode() == ISD::ANY_EXTEND &&
      Bit < Op->getOperand(0).getValueSizeInBits()) {
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }

  if (Op->getNumOperands() != 2)
    return Op;

  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
  if (!C)
    return Op;

  switch (Op->getOpcode()) {
  default:
    return Op;

  // (tbz (and x, m), b) -> (tbz x, b)
  case ISD::AND:
    if ((C->getZExtValue() >> Bit) & 1)
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    return Op;

  // (tbz (shl x, c), b) -> (tbz x, b-c)
  case ISD::SHL:
    if (C->getZExtValue() <= Bit &&
        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit - C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
  case ISD::SRA:
    Bit = Bit + C->getZExtValue();
    if (Bit >= Op->getValueType(0).getSizeInBits())
      Bit = Op->getValueType(0).getSizeInBits() - 1;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);

  // (tbz (srl x, c), b) -> (tbz x, b+c)
  case ISD::SRL:
    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
      Bit = Bit + C->getZExtValue();
      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
    }
    return Op;

  // (tbz (xor x, -1), b) -> (tbnz x, b)
  case ISD::XOR:
    if ((C->getZExtValue() >> Bit) & 1)
      Invert = !Invert;
    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
  }
}
// Optimize test single bit zero/non-zero and branch.
static SDValue performTBZCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 SelectionDAG &DAG) {
  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
  bool Invert = false;
  SDValue TestSrc = N->getOperand(1);
  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);

  if (TestSrc == NewTestSrc)
    return SDValue();

  unsigned NewOpc = N->getOpcode();
  if (Invert) {
    if (NewOpc == AArch64ISD::TBZ)
      NewOpc = AArch64ISD::TBNZ;
    else {
      assert(NewOpc == AArch64ISD::TBNZ);
      NewOpc = AArch64ISD::TBZ;
    }
  }

  SDLoc DL(N);
  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
// Swap vselect operands where it may allow a predicated operation to achieve
// the `sel`.
//
//     (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
//  => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
  auto SelectA = N->getOperand(1);
  auto SelectB = N->getOperand(2);
  auto NTy = N->getValueType(0);

  if (!NTy.isScalableVector())
    return SDValue();
  SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  switch (SelectB.getOpcode()) {
  default:
    return SDValue();
  case ISD::FMUL:
  case ISD::FSUB:
  case ISD::FADD:
    break;
  }
  if (SelectA != SelectB.getOperand(0))
    return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
  auto InverseSetCC =
      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
                   SetCC.getOperand(1), InverseCC);

  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
                     {InverseSetCC, SelectB, SelectA});
}
// vselect (v1i1 setcc) ->
//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
  if (auto SwapResult = trySwapVSelectOperands(N, DAG))
    return SwapResult;

  SDValue N0 = N->getOperand(0);
  EVT CCVT = N0.getValueType();

  if (isAllActivePredicate(DAG, N0))
    return N->getOperand(1);

  if (isAllInactivePredicate(N0))
    return N->getOperand(2);

  // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
  // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
  // supported types.
  SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() == ISD::SETCC &&
      SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
    SDValue CmpLHS = SetCC.getOperand(0);
    EVT VT = CmpLHS.getValueType();
    SDNode *CmpRHS = SetCC.getOperand(1).getNode();
    SDNode *SplatLHS = N->getOperand(1).getNode();
    SDNode *SplatRHS = N->getOperand(2).getNode();
    APInt SplatLHSVal;
    if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
        VT.isSimple() &&
        is_contained(
            makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                          MVT::v2i32, MVT::v4i32, MVT::v2i64}),
            VT.getSimpleVT().SimpleTy) &&
        ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
        SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
        ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<SDValue, 8> Ops(
          NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
                                   VT.getScalarType()));
      SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);

      auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
      auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
      return Or;
    }
  }

  if (N0.getOpcode() != ISD::SETCC ||
      CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
      CCVT.getVectorElementType() != MVT::i1)
    return SDValue();

  EVT ResVT = N->getValueType(0);
  EVT CmpVT = N0.getOperand(0).getValueType();
  // Only combine when the result type is of the same size as the compared
  // operands.
  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
    return SDValue();

  SDValue IfTrue = N->getOperand(1);
  SDValue IfFalse = N->getOperand(2);
  SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
                       N0.getOperand(0), N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
                     IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
static SDValue performSelectCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT ResVT = N->getValueType(0);

  if (N0.getOpcode() != ISD::SETCC)
    return SDValue();

  if (ResVT.isScalableVector())
    return SDValue();

  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
  // scalar SetCCResultType. We also don't expect vectors, because we assume
  // that selects fed by vector SETCCs are canonicalized to VSELECT.
  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
         "Scalar-SETCC feeding SELECT has unexpected result type!");

  // If NumMaskElts == 0, the comparison is larger than select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result is
  // at most 32-bits and an illegal vector. Just bail out for now.
  EVT SrcVT = N0.getOperand(0).getValueType();

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless.
  if (SrcVT == MVT::i1)
    return SDValue();

  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

  // Also bail out if the vector CCVT isn't the same size as ResVT.
  // This can happen if the SETCC operand size doesn't divide the ResVT size
  // (e.g., f64 vs v3f32).
  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
    return SDValue();

  // Make sure we didn't create illegal types, if we're not supposed to.
  assert(DCI.isBeforeLegalize() ||
         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

  // First perform a vector comparison, where lane 0 is the one we're interested
  // in.
  SDLoc DL(N0);
  SDValue LHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue RHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

  // Now duplicate the comparison mask we want across all other lanes.
  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
  Mask = DAG.getNode(ISD::BITCAST, DL,
                     ResVT.changeVectorElementTypeToInteger(), Mask);

  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
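
// Combine DUP nodes: prefer extracting from an existing wider DUP of the same
// operand, then try the post-increment LD1 combine.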
static SDValue performDUPCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128bit vector version.
  if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
    EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
    if (SDNode *LN = DCI.DAG.getNodeIfExists(
            N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
      SDLoc DL(N);
      return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
                             DCI.DAG.getConstant(0, DL, MVT::i64));
    }
  }

  return performPostLD1Combine(N, DCI, false);
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
  if (N->getValueType(0) == N->getOperand(0).getValueType())
    return N->getOperand(0);

  return SDValue();
}
// If all users of the globaladdr are of the form (globaladdr + constant), find
// the smallest constant, fold it into the globaladdr's offset and rewrite the
// globaladdr as (globaladdr + constant) - constant.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                                           const AArch64Subtarget *Subtarget,
                                           const TargetMachine &TM) {
  auto *GN = cast<GlobalAddressSDNode>(N);
  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
      AArch64II::MO_NO_FLAG)
    return SDValue();

  uint64_t MinOffset = -1ull;
  for (SDNode *N : GN->uses()) {
    if (N->getOpcode() != ISD::ADD)
      return SDValue();
    auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
    if (!C)
      C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!C)
      return SDValue();
    MinOffset = std::min(MinOffset, C->getZExtValue());
  }
  uint64_t Offset = MinOffset + GN->getOffset();

  // Require that the new offset is larger than the existing one. Otherwise, we
  // can end up oscillating between two possible DAGs, for example,
  // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
  if (Offset <= uint64_t(GN->getOffset()))
    return SDValue();

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (Offset >= (1 << 20))
    return SDValue();

  const GlobalValue *GV = GN->getGlobal();
  Type *T = GV->getValueType();
  if (!T->isSized() ||
      Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return SDValue();

  SDLoc DL(GN);
  SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
  return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
                     DAG.getConstant(MinOffset, DL, MVT::i64));
}
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
                                          SDLoc DL, unsigned BitWidth) {
  assert(Offset.getValueType().isScalableVector() &&
         "This method is only for scalable vectors of offsets");

  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);

  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
/// Check if the value of \p OffsetInBytes can be used as an immediate for
/// the gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
                                                  unsigned ScalarSizeInBytes) {
  // The immediate is not a multiple of the scalar size.
  if (OffsetInBytes % ScalarSizeInBytes)
    return false;

  // The immediate is out of range.
  if (OffsetInBytes / ScalarSizeInBytes > 31)
    return false;

  return true;
}

/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
/// immediate offset addressing mode:
///
///      [<Zn>.[S|D]{, #<imm>}]
///
/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
                                           unsigned ScalarSizeInBytes) {
  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
  return OffsetConst && isValidImmForSVEVecImmAddrMode(
                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
}
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                          unsigned Opcode,
                                          bool OnlyPackedOffsets = true) {
  const SDValue Src = N->getOperand(2);
  const EVT SrcVT = Src->getValueType(0);
  assert(SrcVT.isScalableVector() &&
         "Scatter stores are only possible for SVE vectors");

  SDLoc DL(N);
  MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();

  // Make sure that source data will fit into an SVE register
  if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // For FPs, ACLE only supports _packed_ single and double precision types.
  if (SrcElVT.isFloatingPoint())
    if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
      return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(4);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets (that fits into one register)
  SDValue Offset = N->getOperand(5);

  // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal scatters because there's no instruction that takes
  // indices.
  if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
    Offset =
        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
    Opcode = AArch64ISD::SSTNT1_PRED;
  }

  // In the case of non-temporal gather loads there's only one SVE instruction
  // per data-size: "scalar + vector", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // SST1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the stored items. For
  // immediates outside that range and non-immediate scalar offsets use SST1 or
  // SST1_UXTW instead.
  if (Opcode == AArch64ISD::SST1_IMM_PRED) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        SrcVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = AArch64ISD::SST1_UXTW_PRED;
      else
        Opcode = AArch64ISD::SST1_PRED;

      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  if (!TLI.isTypeLegal(Offset.getValueType()))
    return SDValue();

  // Source value type that is representable in hardware
  EVT HwSrcVt = getSVEContainerType(SrcVT);

  // Keep the original type of the input data to store - this is needed to be
  // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
  // FP values we want the integer equivalent, so just use HwSrcVt.
  SDValue InputVT = DAG.getValueType(SrcVT);
  if (SrcVT.isFloatingPoint())
    InputVT = DAG.getValueType(HwSrcVt);

  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue SrcNew;

  if (Src.getValueType().isFloatingPoint())
    SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
  else
    SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);

  SDValue Ops[] = {N->getOperand(0), // Chain
                   SrcNew,
                   N->getOperand(3), // Pg
                   Base,
                   Offset,
                   InputVT};

  return DAG.getNode(Opcode, DL, VTs, Ops);
}
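// Rough illustration of the non-temporal path above: a scalar-base + vector
// index scatter arrives as SSTNT1_INDEX_PRED, its element indices are rescaled
// to byte offsets, it is retagged as SSTNT1_PRED, and Base/Offset are swapped
// so the operands match the "[z0.d, x0]" form that stnt1{b|h|w|d} expects.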
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
                                        unsigned Opcode,
                                        bool OnlyPackedOffsets = true) {
  const EVT RetVT = N->getValueType(0);
  assert(RetVT.isScalableVector() &&
         "Gather loads are only possible for SVE vectors");

  SDLoc DL(N);

  // Make sure that the loaded data will fit into an SVE register
  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
  // pointers (that fits into one register)
  SDValue Base = N->getOperand(3);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets (that fits into one register)
  SDValue Offset = N->getOperand(4);

  // For "scalar + vector of indices", just scale the indices. This only
  // applies to non-temporal gathers because there's no instruction that takes
  // indices.
  if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
                                        RetVT.getScalarSizeInBits());
    Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
  }

  // In the case of non-temporal gather loads there's only one SVE instruction
  // per data-size: "scalar + vector", i.e.
  //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
  // Since we do have intrinsics that allow the arguments to be in a different
  // order, we may need to swap them to match the spec.
  if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
      Offset.getValueType().isVector())
    std::swap(Base, Offset);

  // GLD{FF}1_IMM requires that the offset is an immediate that is:
  //    * a multiple of #SizeInBytes,
  //    * in the range [0, 31 x #SizeInBytes],
  // where #SizeInBytes is the size in bytes of the loaded items. For
  // immediates outside that range and non-immediate scalar offsets use
  // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
  if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
      Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
    if (!isValidImmForSVEVecImmAddrMode(Offset,
                                        RetVT.getScalarSizeInBits() / 8)) {
      if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
                     : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
      else
        Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
                     ? AArch64ISD::GLD1_MERGE_ZERO
                     : AArch64ISD::GLDFF1_MERGE_ZERO;

      std::swap(Base, Offset);
    }
  }

  auto &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
  // nxv2i64. Legalize accordingly.
  if (!OnlyPackedOffsets &&
      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);

  // Return value type that is representable in hardware
  EVT HwRetVt = getSVEContainerType(RetVT);

  // Keep the original output value type around - this is needed to be able to
  // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
  // values we want the integer equivalent, so just use HwRetVt.
  SDValue OutVT = DAG.getValueType(RetVT);
  if (RetVT.isFloatingPoint())
    OutVT = DAG.getValueType(HwRetVt);

  SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
  SDValue Ops[] = {N->getOperand(0), // Chain
                   N->getOperand(2), // Pg
                   Base, Offset, OutVT};

  SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
  SDValue LoadChain = SDValue(Load.getNode(), 1);

  if (RetVT.isInteger() && (RetVT != HwRetVt))
    Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));

  // If the original return value was FP, bitcast accordingly. Doing it here
  // means that we can avoid adding TableGen patterns for FPs.
  if (RetVT.isFloatingPoint())
    Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));

  return DAG.getMergeValues({Load, LoadChain}, DL);
}
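// Worked example: an aarch64_sve_ld1_gather_scalar_offset of nxv2i64 data with
// an immediate offset of 48 (6 * 8 bytes, so k == 6) keeps
// GLD1_IMM_MERGE_ZERO, whereas an offset of 264 exceeds 31 * 8 and the code
// above falls back to GLD1_MERGE_ZERO with Base and Offset swapped.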
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                              SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);
  unsigned Opc = Src->getOpcode();

  // Sign extend of an unsigned unpack -> signed unpack
  if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {

    unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
                                               : AArch64ISD::SUNPKLO;

    // Push the sign extend to the operand of the unpack
    // This is necessary where, for example, the operand of the unpack
    // is another unpack:
    // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
    // ->
    // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
    // ->
    // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
    SDValue ExtOp = Src->getOperand(0);
    auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
    EVT EltTy = VT.getVectorElementType();
    (void)EltTy;

    assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
           "Sign extending from an invalid type");

    EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());

    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
                              ExtOp, DAG.getValueType(ExtVT));

    return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (!EnableCombineMGatherIntrinsics)
    return SDValue();

  // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
  // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
  unsigned NewOpc;
  unsigned MemVTOpNum = 4;
  switch (Opc) {
  case AArch64ISD::LD1_MERGE_ZERO:
    NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDNF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::LDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
    MemVTOpNum = 3;
    break;
  case AArch64ISD::GLD1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
    break;
  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
    break;
  case AArch64ISD::GLDNT1_MERGE_ZERO:
    NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
    break;
  default:
    return SDValue();
  }

  EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
  EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();

  if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
    return SDValue();

  EVT DstVT = N->getValueType(0);
  SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);

  SmallVector<SDValue, 5> Ops;
  for (unsigned I = 0; I < Src->getNumOperands(); ++I)
    Ops.push_back(Src->getOperand(I));

  SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
  DCI.CombineTo(N, ExtLoad);
  DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));

  // Return N so it doesn't get rechecked
  return SDValue(N, 0);
}
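// Illustrative shapes handled above, assuming the usual SVE container types:
// sign_extend_inreg (uunpklo X) becomes sunpklo (sign_extend_inreg X), and
// sign_extend_inreg (GLD1_MERGE_ZERO ..., nxv2i16) becomes GLD1S_MERGE_ZERO,
// provided the extension type matches the memory VT and the load has one use.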
/// Legalize the gather prefetch (scalar + vector addressing mode) when the
/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
/// != nxv2i32) do not need legalization.
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
  const unsigned OffsetPos = 4;
  SDValue Offset = N->getOperand(OffsetPos);

  // Not an unpacked vector, bail out.
  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
    return SDValue();

  // Extend the unpacked offset vector to 64-bit lanes.
  SDLoc DL(N);
  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  // Replace the offset operand with the 64-bit one.
  Ops[OffsetPos] = Offset;

  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}
/// Combines a node carrying the intrinsic
/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// sve gather prefetch instruction with vector plus immediate addressing mode.
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
                                               unsigned ScalarSizeInBytes) {
  const unsigned ImmPos = 4, OffsetPos = 3;
  // No need to combine the node if the immediate is valid...
  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
    return SDValue();

  // ...otherwise swap the offset base with the offset...
  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
  std::swap(Ops[ImmPos], Ops[OffsetPos]);
  // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
  // `aarch64_sve_prfb_gather_uxtw_index`.
  SDLoc DL(N);
  Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
                           MVT::i64);

  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
}
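// For example, an aarch64_sve_prfh_gather_scalar_offset whose scalar offset is
// 3 (not a multiple of the 2-byte element size) has its offset and vector base
// swapped and is retagged as aarch64_sve_prfb_gather_uxtw_index, which accepts
// the scalar in the index position.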
// Return true if the vector operation can guarantee only the first lane of its
// result contains data, with all bits in other lanes set to zero.
static bool isLanes1toNKnownZero(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    return false;
  case AArch64ISD::ANDV_PRED:
  case AArch64ISD::EORV_PRED:
  case AArch64ISD::FADDA_PRED:
  case AArch64ISD::FADDV_PRED:
  case AArch64ISD::FMAXNMV_PRED:
  case AArch64ISD::FMAXV_PRED:
  case AArch64ISD::FMINNMV_PRED:
  case AArch64ISD::FMINV_PRED:
  case AArch64ISD::ORV_PRED:
  case AArch64ISD::SADDV_PRED:
  case AArch64ISD::SMAXV_PRED:
  case AArch64ISD::SMINV_PRED:
  case AArch64ISD::UADDV_PRED:
  case AArch64ISD::UMAXV_PRED:
  case AArch64ISD::UMINV_PRED:
    return true;
  }
}
static SDValue removeRedundantInsertVectorElt(SDNode *N) {
  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
  SDValue InsertVec = N->getOperand(0);
  SDValue InsertElt = N->getOperand(1);
  SDValue InsertIdx = N->getOperand(2);

  // We only care about inserts into the first element...
  if (!isNullConstant(InsertIdx))
    return SDValue();
  // ...of a zero'd vector...
  if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
    return SDValue();
  // ...where the inserted data was previously extracted...
  if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return SDValue();

  SDValue ExtractVec = InsertElt.getOperand(0);
  SDValue ExtractIdx = InsertElt.getOperand(1);

  // ...from the first element of a vector.
  if (!isNullConstant(ExtractIdx))
    return SDValue();

  // If we get here we are effectively trying to zero lanes 1-N of a vector.

  // Ensure there's no type conversion going on.
  if (N->getValueType(0) != ExtractVec.getValueType())
    return SDValue();

  if (!isLanes1toNKnownZero(ExtractVec))
    return SDValue();

  // The explicit zeroing is redundant.
  return ExtractVec;
}

static SDValue
performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  if (SDValue Res = removeRedundantInsertVectorElt(N))
    return Res;

  return performPostLD1Combine(N, DCI, true);
}
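// Example of the pattern removed above: t1 = UADDV_PRED pg, z0;
// t2 = extract_vector_elt t1, 0; t3 = insert_vector_elt zeroinitializer, t2, 0.
// Since UADDV_PRED already guarantees lanes 1-N are zero, t3 is replaced by t1.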
static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
  EVT Ty = N->getValueType(0);
  if (Ty.isInteger())
    return SDValue();

  EVT IntTy = Ty.changeVectorElementTypeToInteger();
  EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
  if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
      IntTy.getVectorElementType().getScalarSizeInBits())
    return SDValue();

  SDLoc DL(N);
  SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
                                     DL, ExtIntTy);
  SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
                                     DL, ExtIntTy);
  SDValue Idx = N->getOperand(2);
  SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
  SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
  return DAG.getBitcast(Ty, Trunc);
}
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  // We purposefully don't care about legality of the nodes here as we know
  // they can be split down into something legal.
  if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
      VT.isFixedLengthVector() &&
      VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(), LN0->getBasePtr(),
                                     N0.getValueType(), LN0->getMemOperand());
    DCI.CombineTo(N, ExtLoad);
    DCI.CombineTo(
        N0.getNode(),
        DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
                    DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
        ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  return SDValue();
}
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);

  // Don't expand for NEON, SVE2 or SME
  if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
    return SDValue();

  SDLoc DL(N);
  SDValue Mask = N->getOperand(0);
  SDValue In1 = N->getOperand(1);
  SDValue In2 = N->getOperand(2);

  SDValue InvMask = DAG.getNOT(DL, Mask, VT);
  SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
  SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
  return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
}
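// The expansion above computes (Mask & In1) | (~Mask & In2), i.e. a bitwise
// select; on SVE2 and SME subtargets the BSP node is instead left for
// instruction selection, which can use a native bitwise-select instruction.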
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue Insert = N->getOperand(0);
  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
    return SDValue();

  if (!Insert.getOperand(0).isUndef())
    return SDValue();

  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
  uint64_t IdxDupLane = N->getConstantOperandVal(1);
  if (IdxInsert != 0 || IdxDupLane != 0)
    return SDValue();

  SDValue Bitcast = Insert.getOperand(1);
  if (Bitcast.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue Subvec = Bitcast.getOperand(0);
  EVT SubvecVT = Subvec.getValueType();
  if (!SubvecVT.is128BitVector())
    return SDValue();
  EVT NewSubvecVT =
      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());

  SDLoc DL(N);
  SDValue NewInsert =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
                                      NewInsert, N->getOperand(1));
  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default:
    LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
    break;
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubCombine(N, DCI, DAG);
  case ISD::BUILD_VECTOR:
    return performBuildVectorCombine(N, DCI, DAG);
  case AArch64ISD::ANDS:
    return performFlagSettingCombine(N, DCI, ISD::AND);
  case AArch64ISD::ADC:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
    return foldADCToCINC(N, DAG);
  case AArch64ISD::SBC:
    return foldOverflowCheck(N, DAG, /* IsAdd */ false);
  case AArch64ISD::ADCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
      return R;
    return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
  case AArch64ISD::SBCS:
    if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
      return R;
    return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return performFpToIntCombine(N, DAG, DCI, Subtarget);
  case ISD::FDIV:
    return performFDivCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:
    return performORCombine(N, DCI, Subtarget);
  case ISD::AND:
    return performANDCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicCombine(N, DCI, Subtarget);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
    return performExtendCombine(N, DCI, DAG);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI, DAG);
  case ISD::CONCAT_VECTORS:
    return performConcatVectorsCombine(N, DCI, DAG);
  case ISD::EXTRACT_SUBVECTOR:
    return performExtractSubvectorCombine(N, DCI, DAG);
  case ISD::INSERT_SUBVECTOR:
    return performInsertSubvectorCombine(N, DCI, DAG);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::VSELECT:
    return performVSelectCombine(N, DCI.DAG);
  case ISD::SETCC:
    return performSETCCCombine(N, DCI, DAG);
  case ISD::LOAD:
    return performLOADCombine(N, DCI, DAG, Subtarget);
  case ISD::STORE:
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case ISD::MSTORE:
    return performMSTORECombine(N, DCI, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:
    return performMaskedGatherScatterCombine(N, DCI, DAG);
  case ISD::VECTOR_SPLICE:
    return performSVESpliceCombine(N, DAG);
  case ISD::FP_EXTEND:
    return performFPExtendCombine(N, DAG, DCI, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::TBNZ:
  case AArch64ISD::TBZ:
    return performTBZCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCSELCombine(N, DCI, DAG);
  case AArch64ISD::DUP:
    return performDUPCombine(N, DCI);
  case AArch64ISD::DUPLANE128:
    return performDupLane128Combine(N, DAG);
  case AArch64ISD::NVCAST:
    return performNVCASTCombine(N);
  case AArch64ISD::SPLICE:
    return performSpliceCombine(N, DAG);
  case AArch64ISD::UUNPKLO:
  case AArch64ISD::UUNPKHI:
    return performUnpackCombine(N, DAG, Subtarget);
  case AArch64ISD::UZP1:
    return performUzpCombine(N, DAG);
  case AArch64ISD::SETCC_MERGE_ZERO:
    return performSetccMergeZeroCombine(N, DCI);
  case AArch64ISD::GLD1_MERGE_ZERO:
  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
  case AArch64ISD::GLD1S_MERGE_ZERO:
  case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
  case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
  case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
  case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
    return performGLD1Combine(N, DAG);
  case AArch64ISD::VASHR:
  case AArch64ISD::VLSHR:
    return performVectorShiftCombine(N, *this, DCI);
  case AArch64ISD::SUNPKLO:
    return performSunpkloCombine(N, DAG);
  case AArch64ISD::BSP:
    return performBSPExpandForSVE(N, DAG, Subtarget);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI, Subtarget);
  case ISD::VECREDUCE_ADD:
    return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
  case AArch64ISD::UADDV:
    return performUADDVCombine(N, DAG);
  case AArch64ISD::SMULL:
  case AArch64ISD::UMULL:
  case AArch64ISD::PMULL:
    return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
      return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
    case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
    case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
    case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
    case Intrinsic::aarch64_neon_ld1x2:
    case Intrinsic::aarch64_neon_ld1x3:
    case Intrinsic::aarch64_neon_ld1x4:
    case Intrinsic::aarch64_neon_ld2lane:
    case Intrinsic::aarch64_neon_ld3lane:
    case Intrinsic::aarch64_neon_ld4lane:
    case Intrinsic::aarch64_neon_ld2r:
    case Intrinsic::aarch64_neon_ld3r:
    case Intrinsic::aarch64_neon_ld4r:
    case Intrinsic::aarch64_neon_st2:
    case Intrinsic::aarch64_neon_st3:
    case Intrinsic::aarch64_neon_st4:
    case Intrinsic::aarch64_neon_st1x2:
    case Intrinsic::aarch64_neon_st1x3:
    case Intrinsic::aarch64_neon_st1x4:
    case Intrinsic::aarch64_neon_st2lane:
    case Intrinsic::aarch64_neon_st3lane:
    case Intrinsic::aarch64_neon_st4lane:
      return performNEONPostLDSTCombine(N, DCI, DAG);
    case Intrinsic::aarch64_sve_ldnt1:
      return performLDNT1Combine(N, DAG);
    case Intrinsic::aarch64_sve_ld1rq:
      return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
    case Intrinsic::aarch64_sve_ld1ro:
      return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
    case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldnt1_gather:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldnt1_gather_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ld1:
      return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldnf1:
      return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldff1:
      return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_st1:
      return performST1Combine(N, DAG);
    case Intrinsic::aarch64_sve_stnt1:
      return performSTNT1Combine(N, DAG);
    case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
    case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
    case Intrinsic::aarch64_sve_stnt1_scatter:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
    case Intrinsic::aarch64_sve_stnt1_scatter_index:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
    case Intrinsic::aarch64_sve_ld1_gather:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ld1_gather_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLD1_SCALED_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldff1_gather:
      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldff1_gather_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
    case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
      return performGatherLoadCombine(N, DAG,
                                      AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
    case Intrinsic::aarch64_sve_st1_scatter:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
    case Intrinsic::aarch64_sve_st1_scatter_index:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
    case Intrinsic::aarch64_sve_st1_scatter_sxtw:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
      return performScatterStoreCombine(N, DAG,
                                        AArch64ISD::SST1_SXTW_SCALED_PRED,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
      return performScatterStoreCombine(N, DAG,
                                        AArch64ISD::SST1_UXTW_SCALED_PRED,
                                        /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
      return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
    case Intrinsic::aarch64_rndr:
    case Intrinsic::aarch64_rndrrs: {
      unsigned IntrinsicID =
          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      auto Register =
          (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
                                                  : AArch64SysReg::RNDRRS);
      SDLoc DL(N);
      SDValue A = DAG.getNode(
          AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
          N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
      SDValue B = DAG.getNode(
          AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
          DAG.getConstant(0, DL, MVT::i32),
          DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
      return DAG.getMergeValues(
          {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
    }
    default:
      break;
    }
    break;
  case ISD::GlobalAddress:
    return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
  }
  return SDValue();
}
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                               SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
        MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode *Node : Copy->uses()) {
    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   bool &IsInc,
                                                   SelectionDAG &DAG) const {
  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
    return false;

  Base = Op->getOperand(0);
  // All of the indexed addressing mode instructions take a signed
  // 9 bit immediate offset.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
    int64_t RHSC = RHS->getSExtValue();
    if (Op->getOpcode() == ISD::SUB)
      RHSC = -(uint64_t)RHSC;
    if (!isInt<9>(RHSC))
      return false;
    IsInc = (Op->getOpcode() == ISD::ADD);
    Offset = Op->getOperand(1);
    return true;
  }
  return false;
}

bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
    return false;
  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
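// For example, "str x0, [x1, #16]!" style pre-indexing and "ldr x0, [x1], #-8"
// style post-indexing are both formed from these hooks; the written-back
// offset must fit the signed 9-bit range [-256, 255] enforced by isInt<9>.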
void AArch64TargetLowering::ReplaceBITCASTResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = Op.getValueType();

  if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
    assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
           "Expected fp->int bitcast!");

    // Bitcasting between unpacked vector types of different element counts is
    // not a NOP because the live elements are laid out differently.
    //                01234567
    // e.g. nxv2i32 = XX??XX??
    //      nxv4f16 = X?X?X?X?
    if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
      return;

    SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
    return;
  }

  if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
    return;

  Op = SDValue(
      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                         DAG.getUNDEF(MVT::i32), Op,
                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
      0);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
                               SelectionDAG &DAG,
                               const AArch64Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.is256BitVector() ||
      (VT.getScalarType().isFloatingPoint() &&
       !N->getFlags().hasAllowReassociation()) ||
      (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
    return;

  SDValue X = N->getOperand(0);
  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
  if (!Shuf) {
    Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
    X = N->getOperand(1);
    if (!Shuf)
      return;
  }

  if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
    return;

  // Check the mask is 1,0,3,2,5,4,...
  ArrayRef<int> Mask = Shuf->getMask();
  for (int I = 0, E = Mask.size(); I < E; I++)
    if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
      return;

  SDLoc DL(N);
  auto LoHi = DAG.SplitVector(X, DL);
  assert(LoHi.first.getValueType() == LoHi.second.getValueType());
  SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
                             LoHi.first, LoHi.second);

  // Shuffle the elements back into order.
  SmallVector<int> NMask;
  for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
    NMask.push_back(I);
    NMask.push_back(I);
  }
  Results.push_back(
      DAG.getVectorShuffle(VT, DL,
                           DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
                                       DAG.getUNDEF(LoHi.first.getValueType())),
                           DAG.getUNDEF(VT), NMask));
}
static void ReplaceReductionResults(SDNode *N,
                                    SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG, unsigned InterOp,
                                    unsigned AcrossOp) {
  EVT LoVT, HiVT;
  SDValue Lo, Hi;
  SDLoc dl(N);
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
  Results.push_back(SplitVal);
}

static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
                           DAG.getNode(ISD::SRL, DL, MVT::i128, N,
                                       DAG.getConstant(64, DL, MVT::i64)));
  return std::make_pair(Lo, Hi);
}
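// For example, splitting the i128 value 0x00000000000000020000000000000001
// yields Lo == 1 (a plain truncate) and Hi == 2 (the truncate of the value
// shifted right by 64 bits).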
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();

  // Common code will handle these just fine.
  if (!InVT.isScalableVector() || !InVT.isInteger())
    return;

  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // The following checks bail if this is not a halving operation.

  ElementCount ResEC = VT.getVectorElementCount();

  if (InVT.getVectorElementCount() != (ResEC * 2))
    return;

  auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!CIndex)
    return;

  unsigned Index = CIndex->getZExtValue();
  if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
    return;

  unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
  EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());

  SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
}
// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
  SDLoc dl(V.getNode());
  SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
  SDValue VHi = DAG.getAnyExtOrTrunc(
      DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
      dl, MVT::i64);
  if (DAG.getDataLayout().isBigEndian())
    std::swap (VLo, VHi);
  SDValue RegClass =
      DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
  SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
  SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
  const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
  return SDValue(
      DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
static void ReplaceCMP_SWAP_128Results(SDNode *N,
                                       SmallVectorImpl<SDValue> &Results,
                                       SelectionDAG &DAG,
                                       const AArch64Subtarget *Subtarget) {
  assert(N->getValueType(0) == MVT::i128 &&
         "AtomicCmpSwap on types less than 128 should be legal");

  MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
  if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
    // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
    // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
    SDValue Ops[] = {
        createGPRPairNode(DAG, N->getOperand(2)), // Compare value
        createGPRPairNode(DAG, N->getOperand(3)), // Store value
        N->getOperand(1), // Ptr
        N->getOperand(0), // Chain in
    };

    unsigned Opcode;
    switch (MemOp->getMergedOrdering()) {
    case AtomicOrdering::Monotonic:
      Opcode = AArch64::CASPX;
      break;
    case AtomicOrdering::Acquire:
      Opcode = AArch64::CASPAX;
      break;
    case AtomicOrdering::Release:
      Opcode = AArch64::CASPLX;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      Opcode = AArch64::CASPALX;
      break;
    default:
      llvm_unreachable("Unexpected ordering!");
    }

    MachineSDNode *CmpSwap = DAG.getMachineNode(
        Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
    DAG.setNodeMemRefs(CmpSwap, {MemOp});

    unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
    if (DAG.getDataLayout().isBigEndian())
      std::swap(SubReg1, SubReg2);
    SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
                                            SDValue(CmpSwap, 0));
    Results.push_back(
        DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
    Results.push_back(SDValue(CmpSwap, 1)); // Chain out
    return;
  }

  unsigned Opcode;
  switch (MemOp->getMergedOrdering()) {
  case AtomicOrdering::Monotonic:
    Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
    break;
  case AtomicOrdering::Acquire:
    Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
    break;
  case AtomicOrdering::Release:
    Opcode = AArch64::CMP_SWAP_128_RELEASE;
    break;
  case AtomicOrdering::AcquireRelease:
  case AtomicOrdering::SequentiallyConsistent:
    Opcode = AArch64::CMP_SWAP_128;
    break;
  default:
    llvm_unreachable("Unexpected ordering!");
  }

  auto Desired = splitInt128(N->getOperand(2), DAG);
  auto New = splitInt128(N->getOperand(3), DAG);
  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                   New.first, New.second, N->getOperand(0)};
  SDNode *CmpSwap = DAG.getMachineNode(
      Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
      Ops);
  DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});

  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
                                SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
  Results.push_back(SDValue(CmpSwap, 3));
}
void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this");
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    return;
  case ISD::VECREDUCE_ADD:
  case ISD::VECREDUCE_SMAX:
  case ISD::VECREDUCE_SMIN:
  case ISD::VECREDUCE_UMAX:
  case ISD::VECREDUCE_UMIN:
    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
    return;
  case ISD::ADD:
  case ISD::FADD:
    ReplaceAddWithADDP(N, Results, DAG, Subtarget);
    return;

  case ISD::CTPOP:
  case ISD::PARITY:
    if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
      Results.push_back(Result);
    return;
  case AArch64ISD::SADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
    return;
  case AArch64ISD::UADDV:
    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
    return;
  case AArch64ISD::SMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
    return;
  case AArch64ISD::UMINV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
    return;
  case AArch64ISD::SMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
    return;
  case AArch64ISD::UMAXV:
    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
    return;
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Let normal code take care of it by not adding anything to Results.
    return;
  case ISD::ATOMIC_CMP_SWAP:
    ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
    return;
  case ISD::ATOMIC_LOAD:
  case ISD::LOAD: {
    MemSDNode *LoadNode = cast<MemSDNode>(N);
    EVT MemVT = LoadNode->getMemoryVT();
    // Handle lowering 256 bit non temporal loads into LDNP for little-endian
    // targets.
    if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
        MemVT.getSizeInBits() == 256u &&
        (MemVT.getScalarSizeInBits() == 8u ||
         MemVT.getScalarSizeInBits() == 16u ||
         MemVT.getScalarSizeInBits() == 32u ||
         MemVT.getScalarSizeInBits() == 64u)) {

      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::LDNP, SDLoc(N),
          DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                         MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
                         MVT::Other}),
          {LoadNode->getChain(), LoadNode->getBasePtr()},
          LoadNode->getMemoryVT(), LoadNode->getMemOperand());

      SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
                                 Result.getValue(0), Result.getValue(1));
      Results.append({Pair, Result.getValue(2) /* Chain */});
      return;
    }

    if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
        LoadNode->getMemoryVT() != MVT::i128) {
      // Non-volatile or atomic loads are optimized later in AArch64's load/store
      // optimizer.
      return;
    }

    if (SDValue(N, 0).getValueType() == MVT::i128) {
      SDValue Result = DAG.getMemIntrinsicNode(
          AArch64ISD::LDP, SDLoc(N),
          DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
          {LoadNode->getChain(), LoadNode->getBasePtr()},
          LoadNode->getMemoryVT(), LoadNode->getMemOperand());

      SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
                                 Result.getValue(0), Result.getValue(1));
      Results.append({Pair, Result.getValue(2) /* Chain */});
    }
    return;
  }
  case ISD::EXTRACT_SUBVECTOR:
    ReplaceExtractSubVectorResults(N, Results, DAG);
    return;
  case ISD::INSERT_SUBVECTOR:
  case ISD::CONCAT_VECTORS:
    // Custom lowering has been requested for INSERT_SUBVECTOR and
    // CONCAT_VECTORS -- but delegate to common code for result type
    // legalisation
    return;
  case ISD::INTRINSIC_WO_CHAIN: {
    EVT VT = N->getValueType(0);
    assert((VT == MVT::i8 || VT == MVT::i16) &&
           "custom lowering for unexpected type");

    ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default:
      return;
    case Intrinsic::aarch64_sve_clasta_n: {
      SDLoc DL(N);
      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
      auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
                           N->getOperand(1), Op2, N->getOperand(3));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_clastb_n: {
      SDLoc DL(N);
      auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
      auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
                           N->getOperand(1), Op2, N->getOperand(3));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_lasta: {
      SDLoc DL(N);
      auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
                           N->getOperand(1), N->getOperand(2));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    case Intrinsic::aarch64_sve_lastb: {
      SDLoc DL(N);
      auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
                           N->getOperand(1), N->getOperand(2));
      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
      return;
    }
    }
  }
  }
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return 3;
}

TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  // v4i16, v2i32 instead of to promote.
  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
      VT == MVT::v1f32)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
// provided the address is 16-byte aligned.
bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
  if (!Subtarget->hasLSE2())
    return false;

  if (auto LI = dyn_cast<LoadInst>(I))
    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
           LI->getAlign() >= Align(16);

  if (auto SI = dyn_cast<StoreInst>(I))
    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
           SI->getAlign() >= Align(16);

  return false;
}

bool AArch64TargetLowering::shouldInsertFencesForAtomic(
    const Instruction *I) const {
  return isOpSuitableForLDPSTP(I);
}

// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLoweringBase::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  if (Size != 128 || isOpSuitableForLDPSTP(SI))
    return AtomicExpansionKind::None;
  return AtomicExpansionKind::Expand;
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();

  if (Size != 128 || isOpSuitableForLDPSTP(LI))
    return AtomicExpansionKind::None;

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::CmpXChg;

  return AtomicExpansionKind::LLSC;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (Size > 128) return AtomicExpansionKind::None;

  // Nand is not supported in LSE.
  // Leave 128 bits to LLSC or CmpXChg.
  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
    if (Subtarget->hasLSE())
      return AtomicExpansionKind::None;
    if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
      // Don't outline them unless
      // (1) high level <atomic> support approved:
      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
      // (2) low level libgcc and compiler-rt support implemented by:
      //   min/max outline atomics helpers
      if (AI->getOperation() != AtomicRMWInst::Min &&
          AI->getOperation() != AtomicRMWInst::Max &&
          AI->getOperation() != AtomicRMWInst::UMin &&
          AI->getOperation() != AtomicRMWInst::UMax) {
        return AtomicExpansionKind::None;
      }
    }
  }

  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement atomicrmw without spilling. If the target address is also on the
  // stack and close enough to the spill slot, this can lead to a situation
  // where the monitor always gets cleared and the atomic operation can never
  // succeed. So at -O0 lower this operation to a CAS loop.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::CmpXChg;

  return AtomicExpansionKind::LLSC;
}
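// A sketch of the resulting policy: floating-point RMW operations and all RMW
// operations at -O0 become cmpxchg loops; sub-128-bit integer operations
// (other than Nand) stay intact when LSE or the outlined-atomics helpers can
// handle them, which typically selects an atomic instruction such as LDADD;
// everything else is expanded to an LL/SC loop.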
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
    AtomicCmpXchgInst *AI) const {
  // If subtarget has LSE, leave cmpxchg intact for codegen.
  if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
    return AtomicExpansionKind::None;
  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
  // implement cmpxchg without spilling. If the address being exchanged is also
  // on the stack and close enough to the spill slot, this can lead to a
  // situation where the monitor always gets cleared and the atomic operation
  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
    return AtomicExpansionKind::None;

  // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
  // it.
  unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
  if (Size > 64)
    return AtomicExpansionKind::None;

  return AtomicExpansionKind::LLSC;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
                                             Type *ValueTy, Value *Addr,
                                             AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsAcquire = isAcquireOrStronger(Ord);

  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
  if (ValueTy->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int =
      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

  const DataLayout &DL = M->getDataLayout();
  IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
  CallInst *CI = Builder.CreateCall(Ldxr, Addr);
  CI->addParamAttr(
      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
  Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);

  return Builder.CreateBitCast(Trunc, ValueTy);
}
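// Editor's illustration (assumed shapes, not from the original source): for a
// 128-bit load-linked the builder above emits IR roughly equivalent to
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo   = zext i64 (extractvalue %lohi, 0) to i128
//   %hi   = zext i64 (extractvalue %lohi, 1) to i128
//   %val  = or i128 %lo, (shl i128 %hi, 64)
// i.e. the two exclusive-loaded halves are recombined into a single i128.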
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
    IRBuilderBase &Builder) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
                                                   Value *Val, Value *Addr,
                                                   AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isReleaseOrStronger(Ord);

  // Since the intrinsics must have legal type, the i128 intrinsics take two
  // parameters: "i64, i64". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
    Function *Stxr = Intrinsic::getDeclaration(M, Int);
    Type *Int64Ty = Type::getInt64Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
  }

  Intrinsic::ID Int =
      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
  Type *Tys[] = { Addr->getType() };
  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

  const DataLayout &DL = M->getDataLayout();
  IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
  Val = Builder.CreateBitCast(Val, IntValTy);

  CallInst *CI = Builder.CreateCall(
      Stxr, {Builder.CreateZExtOrBitCast(
                 Val, Stxr->getFunctionType()->getParamType(0)),
             Addr});
  CI->addParamAttr(1, Attribute::get(Builder.getContext(),
                                     Attribute::ElementType, Val->getType()));
  return CI;
}
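// Editor's illustration (assumed, not from the original source): for a 128-bit
// store-conditional the value is split as lo = trunc(Val) and
// hi = trunc(lshr(Val, 64)) and passed to @llvm.aarch64.stlxp / stxp, whose
// i32 result is the usual exclusive-store status (0 on success).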
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg,
    const DataLayout &DL) const {
  if (!Ty->isArrayTy()) {
    const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
    return TySize.isScalable() && TySize.getKnownMinSize() > 128;
  }

  // All non-aggregate members of the type must have the same type.
  SmallVector<EVT> ValueVTs;
  ComputeValueVTs(*this, DL, Ty, ValueVTs);
  return all_equal(ValueVTs);
}
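// Editor's illustration (assumed, not from the original source): an HFA-like
// argument such as [4 x double] produces four identical f64 ValueVTs above,
// so all_equal() holds and the argument is flagged as needing consecutive
// registers, whereas a mixed aggregate would not be.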
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                            EVT) const {
  return false;
}

static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
  Function *ThreadPointerFunc =
      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
  return IRB.CreatePointerCast(
      IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
                             Offset),
      IRB.getInt8PtrTy()->getPointerTo(0));
}
Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the stack cookie. See the definition
  // of TLS_SLOT_STACK_GUARD in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x28);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x10);

  return TargetLowering::getIRStackGuard(IRB);
}
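// Editor's note (illustrative, not from the original source): on Android the
// cookie access produced above is effectively a load from the thread pointer
// (TPIDR_EL0) plus 0x28, i.e. the fixed TLS_SLOT_STACK_GUARD slot, instead of
// going through a __stack_chk_guard global.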
void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
    // MSVC CRT has a global variable holding the security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate the security cookie.
    FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
        Subtarget->getSecurityCheckCookieName(),
        Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()));
    if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
      F->setCallingConv(CallingConv::Win64);
      F->addParamAttr(0, Attribute::AttrKind::InReg);
    }
    return;
  }
  TargetLowering::insertSSPDeclarations(M);
}
Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding the security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate the security cookie.
  if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
    return M.getFunction(Subtarget->getSecurityCheckCookieName());
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *
AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget->isTargetAndroid())
    return UseTlsOffset(IRB, 0x48);

  // Fuchsia is similar.
  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
  if (Subtarget->isTargetFuchsia())
    return UseTlsOffset(IRB, -0x8);

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp use block if it is masking a single
  // bit, since this is likely to fold the and/cmp/br into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would
  // have to check that the cmp would not get folded into the br to form a cbz
  // for these to be beneficial.
  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
  if (!Mask)
    return false;
  return Mask->getValue().isPowerOf2();
}
bool AArch64TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does the baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // Else, if this is a vector shift, prefer 'shl'.
  return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
}
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
                                              SDNode *N) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
    return false;
  return true;
}

void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void AArch64TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AArch64::GPR64RegClass.contains(*I))
      RC = &AArch64::GPR64RegClass;
    else if (AArch64::FPR64RegClass.contains(*I))
      RC = &AArch64::FPR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on AArch64 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since AArch64 doesn't have
  // vector integer division, leaving the division as-is is a loss even in
  // terms of size, because it will have to be scalarized, while the
  // alternative code sequence can be performed in vector form.
  bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
  return OptSize && !VT.isVector();
}

bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
  // We want inc-of-add for scalars and sub-of-not for vectors.
  return VT.isScalarInteger();
}
bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
                                                 EVT VT) const {
  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
  // to legalize.
  if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
    return false;
  return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
}

bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}

unsigned
AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
    return getPointerTy(DL).getSizeInBits();

  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
}
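// Editor's note (illustrative, not from the original source): with 64-bit
// pointers the non-Darwin/non-Windows case above is 3*64 + 2*32 = 256 bits,
// matching the AAPCS64 va_list struct (__stack, __gr_top, __vr_top, __gr_offs,
// __vr_offs), whereas Darwin and Windows use a simple pointer-sized va_list.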
void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();
  // If we have any vulnerable SVE stack objects then the stack protector
  // needs to be placed at the top of the SVE stack area, as the SVE locals
  // are placed above the other locals, so we allocate it as if it were a
  // scalable vector.
  // FIXME: It may be worthwhile having a specific interface for this rather
  // than doing it here in finalizeLowering.
  if (MFI.hasStackProtectorIndex()) {
    for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
      if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
          MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
        MFI.setStackID(MFI.getStackProtectorIndex(),
                       TargetStackID::ScalableVector);
        MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
        break;
      }
    }
  }
  MFI.computeMaxCallFrameSize(MF);
  TargetLoweringBase::finalizeLowering(MF);
}

// Unlike X86, we let frame lowering assign offsets to all catch objects.
bool AArch64TargetLowering::needsFixedCatchObjects() const {
  return false;
}
bool AArch64TargetLowering::shouldLocalize(
    const MachineInstr &MI, const TargetTransformInfo *TTI) const {
  auto &MF = *MI.getMF();
  auto &MRI = MF.getRegInfo();
  auto maxUses = [](unsigned RematCost) {
    // A cost of 1 means remats are basically free.
    if (RematCost == 1)
      return std::numeric_limits<unsigned>::max();
    if (RematCost == 2)
      return 2U;

    // Remat is too expensive, only sink if there's one user.
    if (RematCost > 2)
      return 1U;
    llvm_unreachable("Unexpected remat cost");
  };

  switch (MI.getOpcode()) {
  case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
    const GlobalValue &GV = *MI.getOperand(1).getGlobal();
    if (GV.isThreadLocal() && Subtarget->isTargetMachO())
      return false;
    break;
  }
  case TargetOpcode::G_CONSTANT: {
    auto *CI = MI.getOperand(1).getCImm();
    APInt Imm = CI->getValue();
    InstructionCost Cost = TTI->getIntImmCost(
        Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
    assert(Cost.isValid() && "Expected a valid imm cost");

    unsigned RematCost = *Cost.getValue();
    Register Reg = MI.getOperand(0).getReg();
    unsigned MaxUses = maxUses(RematCost);
    // Don't pass the UINT_MAX sentinel value to hasAtMostUserInstrs().
    if (MaxUses == std::numeric_limits<unsigned>::max())
      return true;
    return MRI.hasAtMostUserInstrs(Reg, MaxUses);
  }
  // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
  // localizable.
  case AArch64::ADRP:
  case AArch64::G_ADD_LOW:
    return true;
  default:
    break;
  }
  return TargetLoweringBase::shouldLocalize(MI, TTI);
}
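// Editor's note (illustrative, not part of the original source): the maxUses
// lambda above maps a TCK_CodeSize immediate cost of 1 to "unlimited uses"
// (always localize), a cost of 2 to at most two using instructions, and
// anything more expensive to a single user before localization is rejected.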
bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
  if (isa<ScalableVectorType>(Inst.getType()))
    return true;

  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
    if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
      return true;

  if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
    if (isa<ScalableVectorType>(AI->getAllocatedType()))
      return true;
  }

  return false;
}
// Return the largest legal scalable vector type that matches VT's element type.
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
  assert(VT.isFixedLengthVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal fixed length vector!");
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE container");
  case MVT::i8:
    return EVT(MVT::nxv16i8);
  case MVT::i16:
    return EVT(MVT::nxv8i16);
  case MVT::i32:
    return EVT(MVT::nxv4i32);
  case MVT::i64:
    return EVT(MVT::nxv2i64);
  case MVT::f16:
    return EVT(MVT::nxv8f16);
  case MVT::f32:
    return EVT(MVT::nxv4f32);
  case MVT::f64:
    return EVT(MVT::nxv2f64);
  }
}
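// Editor's illustration (not from the original source): a legal fixed-length
// v8i16 maps to the nxv8i16 container and v2f64 maps to nxv2f64, i.e. always
// the packed scalable type with the same element type; the fixed-length data
// then occupies the low portion of the SVE register.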
// Return a PTRUE with active lanes corresponding to the extent of VT.
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
                                                EVT VT) {
  assert(VT.isFixedLengthVector() &&
         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal fixed length vector!");

  Optional<unsigned> PgPattern =
      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
  assert(PgPattern && "Unexpected element count for SVE predicate");

  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
  // variants of instructions when available.
  const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
  if (MaxSVESize && MinSVESize == MaxSVESize &&
      MaxSVESize == VT.getSizeInBits())
    PgPattern = AArch64SVEPredPattern::all;

  MVT MaskVT;
  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for SVE predicate");
  case MVT::i8:
    MaskVT = MVT::nxv16i1;
    break;
  case MVT::i16:
  case MVT::f16:
    MaskVT = MVT::nxv8i1;
    break;
  case MVT::i32:
  case MVT::f32:
    MaskVT = MVT::nxv4i1;
    break;
  case MVT::i64:
  case MVT::f64:
    MaskVT = MVT::nxv2i1;
    break;
  }

  return getPTrue(DAG, DL, MaskVT, *PgPattern);
}
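// Editor's illustration (assumed, not from the original source): for a v4i32
// operand this builds an nxv4i1 PTRUE whose pattern activates exactly four
// lanes; if the SVE register width is pinned to exactly 128 bits (e.g. via
// -msve-vector-bits or the equivalent backend options), the "all" pattern is
// used instead so unpredicated instruction forms can be selected later.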
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
                                             EVT VT) {
  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal scalable vector!");
  auto PredTy = VT.changeVectorElementType(MVT::i1);
  return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
}

static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
  if (VT.isFixedLengthVector())
    return getPredicateForFixedLengthVector(DAG, DL, VT);

  return getPredicateForScalableVector(DAG, DL, VT);
}
// Grow V to consume an entire SVE register.
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isScalableVector() &&
         "Expected to convert into a scalable vector!");
  assert(V.getValueType().isFixedLengthVector() &&
         "Expected a fixed length vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
}

// Shrink V so it's just big enough to maintain a VT's worth of data.
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
  assert(VT.isFixedLengthVector() &&
         "Expected to convert into a fixed length vector!");
  assert(V.getValueType().isScalableVector() &&
         "Expected a scalable vector operand!");
  SDLoc DL(V);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
}
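// Editor's note (illustrative, not from the original source): the two helpers
// above are the fixed<->scalable glue used throughout this file: a fixed
// vector is placed into lane 0 of an undef scalable vector via
// ISD::INSERT_SUBVECTOR, and read back with ISD::EXTRACT_SUBVECTOR at index 0,
// so no extra data movement is implied beyond what subvector legalization
// already produces.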
// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Load = cast<LoadSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT LoadVT = ContainerVT;
  EVT MemVT = Load->getMemoryVT();

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);

  if (VT.isFloatingPoint()) {
    LoadVT = ContainerVT.changeTypeToInteger();
    MemVT = MemVT.changeTypeToInteger();
  }

  SDValue NewLoad = DAG.getMaskedLoad(
      LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
      DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
      Load->getAddressingMode(), Load->getExtensionType());

  SDValue Result = NewLoad;
  if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
    EVT ExtendVT = ContainerVT.changeVectorElementType(
        Load->getMemoryVT().getVectorElementType());

    Result = getSVESafeBitCast(ExtendVT, Result, DAG);
    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                         Pg, Result, DAG.getUNDEF(ContainerVT));
  } else if (VT.isFloatingPoint()) {
    Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
  }

  Result = convertFromScalableVector(DAG, VT, Result);
  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  return DAG.getMergeValues(MergedValues, DL);
}
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
                                                SelectionDAG &DAG) {
  SDLoc DL(Mask);
  EVT InVT = Mask.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  if (ISD::isBuildVectorAllOnes(Mask.getNode()))
    return Pg;

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
  auto Op2 = DAG.getConstant(0, DL, ContainerVT);

  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
}
// Convert all fixed length vector loads larger than NEON to masked_loads.
SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Load = cast<MaskedLoadSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);

  SDValue PassThru;
  bool IsPassThruZeroOrUndef = false;

  if (Load->getPassThru()->isUndef()) {
    PassThru = DAG.getUNDEF(ContainerVT);
    IsPassThruZeroOrUndef = true;
  } else {
    if (ContainerVT.isInteger())
      PassThru = DAG.getConstant(0, DL, ContainerVT);
    else
      PassThru = DAG.getConstantFP(0, DL, ContainerVT);
    if (isZerosVector(Load->getPassThru().getNode()))
      IsPassThruZeroOrUndef = true;
  }

  SDValue NewLoad = DAG.getMaskedLoad(
      ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
      Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
      Load->getAddressingMode(), Load->getExtensionType());

  SDValue Result = NewLoad;
  if (!IsPassThruZeroOrUndef) {
    SDValue OldPassThru =
        convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
    Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
  }

  Result = convertFromScalableVector(DAG, VT, Result);
  SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
  return DAG.getMergeValues(MergedValues, DL);
}
// Convert all fixed length vector stores larger than NEON to masked_stores.
SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto Store = cast<StoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT MemVT = Store->getMemoryVT();

  auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());

  if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
    EVT TruncVT = ContainerVT.changeVectorElementType(
        Store->getMemoryVT().getVectorElementType());
    MemVT = MemVT.changeTypeToInteger();
    NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
                           NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
                           DAG.getUNDEF(TruncVT));
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  } else if (VT.isFloatingPoint()) {
    MemVT = MemVT.changeTypeToInteger();
    NewValue =
        getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
  }

  return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
                            Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
                            Store->getMemOperand(), Store->getAddressingMode(),
                            Store->isTruncatingStore());
}

SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  auto *Store = cast<MaskedStoreSDNode>(Op);

  SDLoc DL(Op);
  EVT VT = Store->getValue().getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);

  return DAG.getMaskedStore(
      Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
      Mask, Store->getMemoryVT(), Store->getMemOperand(),
      Store->getAddressingMode(), Store->isTruncatingStore());
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  bool Signed = Op.getOpcode() == ISD::SDIV;
  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;

  bool Negated;
  uint64_t SplatVal;
  if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
    SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
    SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);

    SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
    SDValue Res =
        DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
    if (Negated)
      Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);

    return convertFromScalableVector(DAG, VT, Res);
  }

  // Scalable vector i32/i64 DIV is supported.
  if (EltVT == MVT::i32 || EltVT == MVT::i64)
    return LowerToPredicatedOp(Op, DAG, PredOpcode);

  // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
  EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
  EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);

  // If this is not a full vector, extend, div, and truncate it.
  EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
  if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
    unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
    SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
    SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
  }

  // Convert the operands to scalable vectors.
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));

  // Extend the scalable operands.
  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);

  // Convert back to fixed vectors so the DIV can be further lowered.
  Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
  Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
  Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
  Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
                                 Op0Lo, Op1Lo);
  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
                                 Op0Hi, Op1Hi);

  // Convert again to scalable vectors to truncate.
  ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
  ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
  SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
                                       ResultLo, ResultHi);

  return convertFromScalableVector(DAG, VT, ScalableResult);
}
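// Editor's illustration (assumed, not from the original source): i8/i16
// element divides have no SVE instruction, so the routine above either widens
// the whole vector to i32 elements when the widened fixed type is legal
// (extend -> div -> truncate), or otherwise unpacks the operands into low/high
// halves with double-width elements (SUNPKLO/SUNPKHI or the unsigned forms),
// lowers each half-width divide recursively, and narrows the results back
// together with UZP1.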
SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
  unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;

  // Repeatedly unpack Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv16i8:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv8i16:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv4i32:
    Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
    assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
  Val = convertToScalableVector(DAG, ContainerVT, Val);

  // Repeatedly truncate Val until the result is of the desired element type.
  switch (ContainerVT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unimplemented container type");
  case MVT::nxv2i64:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
    if (VT.getVectorElementType() == MVT::i32)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv4i32:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
    if (VT.getVectorElementType() == MVT::i16)
      break;
    LLVM_FALLTHROUGH;
  case MVT::nxv8i16:
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
    Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
    assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
    break;
  }

  return convertFromScalableVector(DAG, VT, Val);
}
SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT InVT = Op.getOperand(0).getValueType();
  assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
}

SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));

  auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
                                 Op.getOperand(1), Op.getOperand(2));

  return convertFromScalableVector(DAG, VT, ScalableRes);
}
// Convert vector operation 'Op' to an equivalent predicated operation whereby
// the original operation's type is used to construct a suitable predicate.
// NOTE: The results for inactive lanes are undefined.
SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
                                                   SelectionDAG &DAG,
                                                   unsigned NewOp) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  auto Pg = getPredicateForVector(DAG, DL, VT);

  if (VT.isFixedLengthVector()) {
    assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

    // Create list of operands by converting existing ones to scalable types.
    SmallVector<SDValue, 4> Operands = {Pg};
    for (const SDValue &V : Op->op_values()) {
      if (isa<CondCodeSDNode>(V)) {
        Operands.push_back(V);
        continue;
      }

      if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
        EVT VTArg = VTNode->getVT().getVectorElementType();
        EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
        Operands.push_back(DAG.getValueType(NewVTArg));
        continue;
      }

      assert(isTypeLegal(V.getValueType()) &&
             "Expected only legal fixed-width types");
      Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
    }

    if (isMergePassthruOpcode(NewOp))
      Operands.push_back(DAG.getUNDEF(ContainerVT));

    auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
    return convertFromScalableVector(DAG, VT, ScalableRes);
  }

  assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");

  SmallVector<SDValue, 4> Operands = {Pg};
  for (const SDValue &V : Op->op_values()) {
    assert((!V.getValueType().isVector() ||
            V.getValueType().isScalableVector()) &&
           "Only scalable vectors are supported!");
    Operands.push_back(V);
  }

  if (isMergePassthruOpcode(NewOp))
    Operands.push_back(DAG.getUNDEF(VT));

  return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
}
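// Editor's illustration (assumed, not from the original source): for a
// fixed-length f32 vector FADD being lowered via SVE, LowerToPredicatedOp
// builds an AArch64ISD::FADD_PRED on the nxv4f32 container whose first operand
// is the fixed-width PTRUE from getPredicateForVector, with the original
// operands widened into the container, and the result is extracted back to the
// fixed type; lanes beyond the fixed width are never observed.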
// If a fixed length vector operation has no side effects when applied to
// undefined elements, we can safely use scalable vectors to perform the same
// operation without needing to worry about predication.
SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(useSVEForFixedLengthVectorVT(VT) &&
         "Only expected to lower fixed length vector operation!");
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  // Create list of operands by converting existing ones to scalable types.
  SmallVector<SDValue, 4> Ops;
  for (const SDValue &V : Op->op_values()) {
    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");

    // Pass through non-vector operands.
    if (!V.getValueType().isVector()) {
      Ops.push_back(V);
      continue;
    }

    // "cast" fixed length vector to a scalable vector.
    assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
           "Only fixed length vectors are supported!");
    Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
  }

  auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
  return convertFromScalableVector(DAG, VT, ScalableRes);
}
SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue AccOp = ScalarOp.getOperand(0);
  SDValue VecOp = ScalarOp.getOperand(1);
  EVT SrcVT = VecOp.getValueType();
  EVT ResVT = SrcVT.getVectorElementType();

  EVT ContainerVT = SrcVT;
  if (SrcVT.isFixedLengthVector()) {
    ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);

  // Convert operands to Scalable.
  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
                      DAG.getUNDEF(ContainerVT), AccOp, Zero);

  // Perform reduction.
  SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
                            Pg, AccOp, VecOp);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}
SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(ReduceOp);
  SDValue Op = ReduceOp.getOperand(0);
  EVT OpVT = Op.getValueType();
  EVT VT = ReduceOp.getValueType();

  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
    return SDValue();

  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);

  switch (ReduceOp.getOpcode()) {
  default:
    return SDValue();
  case ISD::VECREDUCE_OR:
    if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
      // The predicate can be 'Op' because
      // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
      return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
    else
      return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
  case ISD::VECREDUCE_AND: {
    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
  }
  case ISD::VECREDUCE_XOR: {
    SDValue ID =
        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
    if (OpVT == MVT::nxv1i1) {
      // Emulate a CNTP on .Q using .D and a different governing predicate.
      Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
      Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
    }
    SDValue Cntp =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
  }
  }

  return SDValue();
}
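// Editor's note (illustrative, not from the original source): the
// VECREDUCE_AND case above relies on De Morgan's law: AND-reducing a predicate
// is equivalent to checking that no active lane of its complement is set, so
// the input is XORed with the governing predicate (a lane-wise NOT over the
// active lanes) and then tested with PTEST/NONE_ACTIVE.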
SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
                                                   SDValue ScalarOp,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(ScalarOp);
  SDValue VecOp = ScalarOp.getOperand(0);
  EVT SrcVT = VecOp.getValueType();

  if (useSVEForFixedLengthVectorVT(
          SrcVT,
          /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
  }

  // UADDV always returns an i64 result.
  EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
              SrcVT.getVectorElementType();
  EVT RdxVT = SrcVT;
  if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
    RdxVT = getPackedSVEVectorVT(ResVT);

  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
  SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                            Rdx, DAG.getConstant(0, DL, MVT::i64));

  // The VEC_REDUCE nodes expect an element size result.
  if (ResVT != ScalarOp.getValueType())
    Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());

  return Res;
}
SDValue
AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
                                                         SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  EVT InVT = Op.getOperand(1).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
  SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
  SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));

  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
  EVT MaskVT = Op.getOperand(0).getValueType();
  EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
  auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
  Mask = DAG.getNode(ISD::TRUNCATE, DL,
                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);

  auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
                                 Mask, Op1, Op2);

  return convertFromScalableVector(DAG, VT, ScalableRes);
}
SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getOperand(0).getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);

  assert(useSVEForFixedLengthVectorVT(InVT) &&
         "Only expected to lower fixed length vector operation!");
  assert(Op.getValueType() == InVT.changeTypeToInteger() &&
         "Expected integer result of the same bit length as the inputs!");

  auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
  auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
  auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);

  EVT CmpVT = Pg.getValueType();
  auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
                         {Pg, Op1, Op2, Op.getOperand(2)});

  EVT PromoteVT = ContainerVT.changeTypeToInteger();
  auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
  return convertFromScalableVector(DAG, Op.getValueType(), Promote);
}
SDValue
AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  auto SrcOp = Op.getOperand(0);
  EVT VT = Op.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT =
      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());

  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
  return convertFromScalableVector(DAG, VT, Op);
}
SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  unsigned NumOperands = Op->getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  auto SrcOp1 = Op.getOperand(0);
  auto SrcOp2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = SrcOp1.getValueType();

  if (NumOperands > 2) {
    SmallVector<SDValue, 4> Ops;
    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
    for (unsigned I = 0; I < NumOperands; I += 2)
      Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
                                Op->getOperand(I), Op->getOperand(I + 1)));

    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
  }

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

  SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
  SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
  SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);

  Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);

  return convertFromScalableVector(DAG, VT, Op);
}
SDValue
AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  SDValue Pg = getPredicateForVector(DAG, DL, VT);
  EVT SrcVT = Val.getValueType();
  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ExtendVT = ContainerVT.changeVectorElementType(
      SrcVT.getVectorElementType());

  Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
  Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);

  Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
  Val = getSVESafeBitCast(ExtendVT, Val, DAG);
  Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
                    Pg, Val, DAG.getUNDEF(ContainerVT));

  return convertFromScalableVector(DAG, VT, Val);
}
SDValue
AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
  EVT RoundVT = ContainerSrcVT.changeVectorElementType(
      VT.getVectorElementType());
  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);

  Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
  Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
                    Op.getOperand(1), DAG.getUNDEF(RoundVT));
  Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
  Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

  Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
  return DAG.getNode(ISD::BITCAST, DL, VT, Val);
}
SDValue
AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
                             : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
      ContainerDstVT.getVectorElementType().getSizeInBits()) {
    SDValue Pg = getPredicateForVector(DAG, DL, VT);

    Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                      VT.changeTypeToInteger(), Val);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
    // Safe to use a larger than specified operand since we just unpacked the
    // data, hence the upper bits are zero.
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeVectorElementType(
        ContainerDstVT.getVectorElementType());
    SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
    Val = convertFromScalableVector(DAG, SrcVT, Val);

    Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
    return DAG.getNode(ISD::BITCAST, DL, VT, Val);
  }
}
SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                    SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
                             : AArch64ISD::FCVTZU_MERGE_PASSTHRU;

  SDLoc DL(Op);
  SDValue Val = Op.getOperand(0);
  EVT SrcVT = Val.getValueType();
  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
  EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);

  if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
      ContainerDstVT.getVectorElementType().getSizeInBits()) {
    EVT CvtVT = ContainerDstVT.changeVectorElementType(
        ContainerSrcVT.getVectorElementType());
    SDValue Pg = getPredicateForVector(DAG, DL, VT);

    Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
    Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);

    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = getSVESafeBitCast(CvtVT, Val, DAG);
    Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
                      DAG.getUNDEF(ContainerDstVT));
    return convertFromScalableVector(DAG, VT, Val);
  } else {
    EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
    SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);

    // Safe to use a larger than specified result since an fp_to_int where the
    // result doesn't fit into the destination is undefined.
    Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
    Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
    Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);

    return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
  }
}
SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
    SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");

  auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  auto ShuffleMask = SVN->getMask();

  SDLoc DL(Op);
  SDValue Op1 = Op.getOperand(0);
  SDValue Op2 = Op.getOperand(1);

  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
  Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
  Op2 = convertToScalableVector(DAG, ContainerVT, Op2);

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
      Imm == VT.getVectorNumElements() - 1) {
    if (ReverseEXT)
      std::swap(Op1, Op2);

    EVT ScalarTy = VT.getVectorElementType();
    if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
      ScalarTy = MVT::i32;
    SDValue Scalar = DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
        DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
    Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
    return convertFromScalableVector(DAG, VT, Op);
  }

  for (unsigned LaneSize : {64U, 32U, 16U}) {
    if (isREVMask(ShuffleMask, VT, LaneSize)) {
      EVT NewVT =
          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
      unsigned RevOp;
      unsigned EltSz = VT.getScalarSizeInBits();
      if (EltSz == 8)
        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
      else if (EltSz == 16)
        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
      else
        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;

      Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
      Op = LowerToPredicatedOp(Op, DAG, RevOp);
      Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
      return convertFromScalableVector(DAG, VT, Op);
    }
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));

  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));

  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return convertFromScalableVector(
        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
  }

  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
  // represents the same logical operation as performed by a ZIP instruction.
  // In isolation these functions do not mean the ISD::VECTOR_SHUFFLE is
  // exactly equivalent to an AArch64 instruction. There's the extra component
  // of ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these
  // functions only operated on 64/128-bit vector types that have a direct
  // mapping to a target register and so an exact mapping is implied.
  // However, when using SVE for fixed length vectors, most legal vector types
  // are actually sub-vectors of a larger SVE register. When mapping
  // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
  // how the mask's indices translate. Specifically, when the mapping requires
  // an exact meaning for a specific vector index (e.g. Index X is the last
  // vector element in the register) then such mappings are often only safe
  // when the exact SVE register size is known. The main exception to this is
  // when indices are logically relative to the first element of either
  // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
  // when converting from fixed-length to scalable vector types (i.e. the start
  // of a fixed length vector is always the start of a scalable vector).
  unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
  unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
  if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
    if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
      Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
      return convertFromScalableVector(DAG, VT, Op);
    }

    if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));

    if (isUZPMask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
    }

    if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));

    if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }
  }

  return SDValue();
}
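// Editor's illustration (assumed, not from the original source): ZIP2, UZP1/2
// and VECTOR_REVERSE above are gated on MinSVESize == MaxSVESize ==
// VT.getSizeInBits() because they reference lanes counted from the top of the
// register; e.g. for a v4i32 shuffle on a 256-bit SVE register, element 3 of
// the fixed vector is not the last element of the nxv4i32 container, so a ZIP2
// of the container would read the wrong (undefined) lanes.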
SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT InVT = Op.getValueType();

  assert(VT.isScalableVector() && isTypeLegal(VT) &&
         InVT.isScalableVector() && isTypeLegal(InVT) &&
         "Only expect to cast between legal scalable vector types!");
  assert(VT.getVectorElementType() != MVT::i1 &&
         InVT.getVectorElementType() != MVT::i1 &&
         "For predicate bitcasts, use getSVEPredicateBitCast");

  if (InVT == VT)
    return Op;

  EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
  EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());

  // Safe bitcasting between unpacked vector types of different element counts
  // is currently unsupported because the following is missing the necessary
  // work to ensure the result's elements live where they're supposed to within
  // an SVE register.
  //
  // e.g. nxv2i32 = XX??XX??
  //      nxv4f16 = X?X?X?X?
  assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
          VT == PackedVT || InVT == PackedInVT) &&
         "Unexpected bitcast!");

  // Pack input if required.
  if (InVT != PackedInVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);

  Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);

  // Unpack result if required.
  if (VT != PackedVT)
    Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);

  return Op;
}
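// Editor's illustration (assumed, not from the original source): casting an
// unpacked nxv2f32 to nxv2i32 goes via the packed types: reinterpret nxv2f32
// as nxv4f32, plain-bitcast that to nxv4i32, then reinterpret back down to
// nxv2i32, keeping each element in the SVE lane it started in.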
bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
                                                 SDValue N) const {
  return ::isAllActivePredicate(DAG, N);
}

EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
  return ::getPromotedVTForPredicate(VT);
}
bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
    SDValue Op, const APInt &OriginalDemandedBits,
    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
    unsigned Depth) const {

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case AArch64ISD::VSHL: {
    // Match (VSHL (VLSHR Val X) X)
    SDValue ShiftL = Op;
    SDValue ShiftR = Op->getOperand(0);
    if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
      return false;

    if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
      return false;

    unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
    unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);

    // Other cases can be handled as well, but this is not
    // implemented.
    if (ShiftRBits != ShiftLBits)
      return false;

    unsigned ScalarSize = Op.getScalarValueSizeInBits();
    assert(ScalarSize > ShiftLBits && "Invalid shift imm");

    APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
    APInt UnusedBits = ~OriginalDemandedBits;

    if ((ZeroBits & UnusedBits) != ZeroBits)
      return false;

    // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
    // used - simplify to just Val.
    return TLO.CombineTo(Op, ShiftR->getOperand(0));
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
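// Editor's illustration (assumed, not from the original source): given
// (VSHL (VLSHR x, 8), 8) on i32 lanes, the pair only clears the low 8 bits of
// each lane; if the caller's demanded-bits mask shows those low 8 bits are
// never used, the combine above replaces the whole pair with x.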
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
    unsigned Opc, LLT Ty1, LLT Ty2) const {
  return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
}