[Arm64EC] Refer to dllimport'ed functions correctly.
[llvm-project.git] / llvm / lib / Target / AArch64 / AArch64ISelLowering.cpp
blob: 943a489f0fa280b4f5a230f935a4e02ccb2056ce
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/LoopInfo.h"
33 #include "llvm/Analysis/MemoryLocation.h"
34 #include "llvm/Analysis/ObjCARCUtil.h"
35 #include "llvm/Analysis/TargetTransformInfo.h"
36 #include "llvm/Analysis/VectorUtils.h"
37 #include "llvm/CodeGen/Analysis.h"
38 #include "llvm/CodeGen/CallingConvLower.h"
39 #include "llvm/CodeGen/ISDOpcodes.h"
40 #include "llvm/CodeGen/MachineBasicBlock.h"
41 #include "llvm/CodeGen/MachineFrameInfo.h"
42 #include "llvm/CodeGen/MachineFunction.h"
43 #include "llvm/CodeGen/MachineInstr.h"
44 #include "llvm/CodeGen/MachineInstrBuilder.h"
45 #include "llvm/CodeGen/MachineMemOperand.h"
46 #include "llvm/CodeGen/MachineRegisterInfo.h"
47 #include "llvm/CodeGen/RuntimeLibcalls.h"
48 #include "llvm/CodeGen/SelectionDAG.h"
49 #include "llvm/CodeGen/SelectionDAGNodes.h"
50 #include "llvm/CodeGen/TargetCallingConv.h"
51 #include "llvm/CodeGen/TargetInstrInfo.h"
52 #include "llvm/CodeGen/ValueTypes.h"
53 #include "llvm/IR/Attributes.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DebugLoc.h"
57 #include "llvm/IR/DerivedTypes.h"
58 #include "llvm/IR/Function.h"
59 #include "llvm/IR/GetElementPtrTypeIterator.h"
60 #include "llvm/IR/GlobalValue.h"
61 #include "llvm/IR/IRBuilder.h"
62 #include "llvm/IR/Instruction.h"
63 #include "llvm/IR/Instructions.h"
64 #include "llvm/IR/IntrinsicInst.h"
65 #include "llvm/IR/Intrinsics.h"
66 #include "llvm/IR/IntrinsicsAArch64.h"
67 #include "llvm/IR/Module.h"
68 #include "llvm/IR/OperandTraits.h"
69 #include "llvm/IR/PatternMatch.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/MC/MCRegisterInfo.h"
74 #include "llvm/Support/Casting.h"
75 #include "llvm/Support/CodeGen.h"
76 #include "llvm/Support/CommandLine.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
79 #include "llvm/Support/ErrorHandling.h"
80 #include "llvm/Support/InstructionCost.h"
81 #include "llvm/Support/KnownBits.h"
82 #include "llvm/Support/MachineValueType.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include "llvm/Target/TargetMachine.h"
86 #include "llvm/Target/TargetOptions.h"
87 #include <algorithm>
88 #include <bitset>
89 #include <cassert>
90 #include <cctype>
91 #include <cstdint>
92 #include <cstdlib>
93 #include <iterator>
94 #include <limits>
95 #include <tuple>
96 #include <utility>
97 #include <vector>
99 using namespace llvm;
100 using namespace llvm::PatternMatch;
102 #define DEBUG_TYPE "aarch64-lower"
104 STATISTIC(NumTailCalls, "Number of tail calls");
105 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
106 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
108 // FIXME: The necessary dtprel relocations don't seem to be supported
109 // well in the GNU bfd and gold linkers at the moment. Therefore, by
110 // default, for now, fall back to GeneralDynamic code generation.
111 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
112 "aarch64-elf-ldtls-generation", cl::Hidden,
113 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
114 cl::init(false));
116 static cl::opt<bool>
117 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
118 cl::desc("Enable AArch64 logical imm instruction "
119 "optimization"),
120 cl::init(true));
122 // Temporary option added for the purpose of testing functionality added
123 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
124 // in the future, once both implementations are based on MGATHER rather
125 // than the GLD1 nodes added for the SVE gather load intrinsics.
126 static cl::opt<bool>
127 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
128 cl::desc("Combine extends of AArch64 masked "
129 "gather intrinsics"),
130 cl::init(true));
132 /// Value type used for condition codes.
133 static const MVT MVT_CC = MVT::i32;
135 static inline EVT getPackedSVEVectorVT(EVT VT) {
136 switch (VT.getSimpleVT().SimpleTy) {
137 default:
138 llvm_unreachable("unexpected element type for vector");
139 case MVT::i8:
140 return MVT::nxv16i8;
141 case MVT::i16:
142 return MVT::nxv8i16;
143 case MVT::i32:
144 return MVT::nxv4i32;
145 case MVT::i64:
146 return MVT::nxv2i64;
147 case MVT::f16:
148 return MVT::nxv8f16;
149 case MVT::f32:
150 return MVT::nxv4f32;
151 case MVT::f64:
152 return MVT::nxv2f64;
153 case MVT::bf16:
154 return MVT::nxv8bf16;
158 // NOTE: Currently there's only a need to return integer vector types. If this
159 // changes then just add an extra "type" parameter.
160 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
161 switch (EC.getKnownMinValue()) {
162 default:
163 llvm_unreachable("unexpected element count for vector");
164 case 16:
165 return MVT::nxv16i8;
166 case 8:
167 return MVT::nxv8i16;
168 case 4:
169 return MVT::nxv4i32;
170 case 2:
171 return MVT::nxv2i64;
175 static inline EVT getPromotedVTForPredicate(EVT VT) {
176 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
177 "Expected scalable predicate vector type!");
178 switch (VT.getVectorMinNumElements()) {
179 default:
180 llvm_unreachable("unexpected element count for vector");
181 case 2:
182 return MVT::nxv2i64;
183 case 4:
184 return MVT::nxv4i32;
185 case 8:
186 return MVT::nxv8i16;
187 case 16:
188 return MVT::nxv16i8;
192 /// Returns true if VT's elements occupy the lowest bit positions of its
193 /// associated register class without any intervening space.
195 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
196 /// same register class, but only nxv8f16 can be treated as a packed vector.
197 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
198 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
199 "Expected legal vector type!");
200 return VT.isFixedLengthVector() ||
201 VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
204 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
205 // predicate and end with a passthru value matching the result type.
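// For example, AArch64ISD::FNEG_MERGE_PASSTHRU takes (Pg, Op, Passthru): active
// lanes produce fneg(Op) and inactive lanes are taken unchanged from Passthru.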
206 static bool isMergePassthruOpcode(unsigned Opc) {
207 switch (Opc) {
208 default:
209 return false;
210 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
211 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
212 case AArch64ISD::REVH_MERGE_PASSTHRU:
213 case AArch64ISD::REVW_MERGE_PASSTHRU:
214 case AArch64ISD::REVD_MERGE_PASSTHRU:
215 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
216 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
217 case AArch64ISD::DUP_MERGE_PASSTHRU:
218 case AArch64ISD::ABS_MERGE_PASSTHRU:
219 case AArch64ISD::NEG_MERGE_PASSTHRU:
220 case AArch64ISD::FNEG_MERGE_PASSTHRU:
221 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
222 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
223 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
224 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
225 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
226 case AArch64ISD::FRINT_MERGE_PASSTHRU:
227 case AArch64ISD::FROUND_MERGE_PASSTHRU:
228 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
229 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
230 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
231 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
232 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
233 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
234 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
235 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
236 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
237 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
238 case AArch64ISD::FABS_MERGE_PASSTHRU:
239 return true;
243 // Returns true if inactive lanes are known to be zeroed by construction.
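// For example, an SVE compare only produces true bits for lanes that are active
// in its governing predicate, so every inactive lane of its result is zero.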
244 static bool isZeroingInactiveLanes(SDValue Op) {
245 switch (Op.getOpcode()) {
246 default:
247 // We guarantee i1 splat_vectors to zero the other lanes by
248 // implementing it with ptrue and possibly a punpklo for nxv1i1.
249 if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
250 return true;
251 return false;
252 case AArch64ISD::PTRUE:
253 case AArch64ISD::SETCC_MERGE_ZERO:
254 return true;
255 case ISD::INTRINSIC_WO_CHAIN:
256 switch (Op.getConstantOperandVal(0)) {
257 default:
258 return false;
259 case Intrinsic::aarch64_sve_ptrue:
260 case Intrinsic::aarch64_sve_pnext:
261 case Intrinsic::aarch64_sve_cmpeq:
262 case Intrinsic::aarch64_sve_cmpne:
263 case Intrinsic::aarch64_sve_cmpge:
264 case Intrinsic::aarch64_sve_cmpgt:
265 case Intrinsic::aarch64_sve_cmphs:
266 case Intrinsic::aarch64_sve_cmphi:
267 case Intrinsic::aarch64_sve_cmpeq_wide:
268 case Intrinsic::aarch64_sve_cmpne_wide:
269 case Intrinsic::aarch64_sve_cmpge_wide:
270 case Intrinsic::aarch64_sve_cmpgt_wide:
271 case Intrinsic::aarch64_sve_cmplt_wide:
272 case Intrinsic::aarch64_sve_cmple_wide:
273 case Intrinsic::aarch64_sve_cmphs_wide:
274 case Intrinsic::aarch64_sve_cmphi_wide:
275 case Intrinsic::aarch64_sve_cmplo_wide:
276 case Intrinsic::aarch64_sve_cmpls_wide:
277 case Intrinsic::aarch64_sve_fcmpeq:
278 case Intrinsic::aarch64_sve_fcmpne:
279 case Intrinsic::aarch64_sve_fcmpge:
280 case Intrinsic::aarch64_sve_fcmpgt:
281 case Intrinsic::aarch64_sve_fcmpuo:
282 return true;
287 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
288 const AArch64Subtarget &STI)
289 : TargetLowering(TM), Subtarget(&STI) {
290 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
291 // we have to make something up. Arbitrarily, choose ZeroOrOne.
292 setBooleanContents(ZeroOrOneBooleanContent);
293 // When comparing vectors the result sets the different elements in the
294 // vector to all-one or all-zero.
295 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
297 // Set up the register classes.
298 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
299 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
301 if (Subtarget->hasLS64()) {
302 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
303 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
304 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
307 if (Subtarget->hasFPARMv8()) {
308 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
309 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
310 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
311 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
312 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
315 if (Subtarget->hasNEON()) {
316 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
317 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
318 // Someone set us up the NEON.
319 addDRTypeForNEON(MVT::v2f32);
320 addDRTypeForNEON(MVT::v8i8);
321 addDRTypeForNEON(MVT::v4i16);
322 addDRTypeForNEON(MVT::v2i32);
323 addDRTypeForNEON(MVT::v1i64);
324 addDRTypeForNEON(MVT::v1f64);
325 addDRTypeForNEON(MVT::v4f16);
326 if (Subtarget->hasBF16())
327 addDRTypeForNEON(MVT::v4bf16);
329 addQRTypeForNEON(MVT::v4f32);
330 addQRTypeForNEON(MVT::v2f64);
331 addQRTypeForNEON(MVT::v16i8);
332 addQRTypeForNEON(MVT::v8i16);
333 addQRTypeForNEON(MVT::v4i32);
334 addQRTypeForNEON(MVT::v2i64);
335 addQRTypeForNEON(MVT::v8f16);
336 if (Subtarget->hasBF16())
337 addQRTypeForNEON(MVT::v8bf16);
340 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
341 // Add legal sve predicate types
342 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
343 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
344 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
345 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
346 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
348 // Add legal sve data types
349 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
350 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
351 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
352 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
354 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
355 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
356 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
357 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
358 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
359 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
361 if (Subtarget->hasBF16()) {
362 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
363 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
364 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
367 if (Subtarget->useSVEForFixedLengthVectors()) {
368 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
369 if (useSVEForFixedLengthVectorVT(VT))
370 addRegisterClass(VT, &AArch64::ZPRRegClass);
372 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
373 if (useSVEForFixedLengthVectorVT(VT))
374 addRegisterClass(VT, &AArch64::ZPRRegClass);
378 // Compute derived properties from the register classes
379 computeRegisterProperties(Subtarget->getRegisterInfo());
381 // Provide all sorts of operation actions
382 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
383 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
384 setOperationAction(ISD::SETCC, MVT::i32, Custom);
385 setOperationAction(ISD::SETCC, MVT::i64, Custom);
386 setOperationAction(ISD::SETCC, MVT::f16, Custom);
387 setOperationAction(ISD::SETCC, MVT::f32, Custom);
388 setOperationAction(ISD::SETCC, MVT::f64, Custom);
389 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
390 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
391 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
392 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
393 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
394 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
395 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
396 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
397 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
398 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
399 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
400 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
401 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
402 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
403 setOperationAction(ISD::SELECT, MVT::i32, Custom);
404 setOperationAction(ISD::SELECT, MVT::i64, Custom);
405 setOperationAction(ISD::SELECT, MVT::f16, Custom);
406 setOperationAction(ISD::SELECT, MVT::bf16, Custom);
407 setOperationAction(ISD::SELECT, MVT::f32, Custom);
408 setOperationAction(ISD::SELECT, MVT::f64, Custom);
409 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
410 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
411 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
412 setOperationAction(ISD::SELECT_CC, MVT::bf16, Expand);
413 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
414 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
415 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
416 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
417 setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);
419 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
420 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
421 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
423 setOperationAction(ISD::FREM, MVT::f32, Expand);
424 setOperationAction(ISD::FREM, MVT::f64, Expand);
425 setOperationAction(ISD::FREM, MVT::f80, Expand);
427 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
429 // Custom lowering hooks are needed for XOR
430 // to fold it into CSINC/CSINV.
431 setOperationAction(ISD::XOR, MVT::i32, Custom);
432 setOperationAction(ISD::XOR, MVT::i64, Custom);
434 // Virtually no operation on f128 is legal, but LLVM can't expand them when
435 // there's a valid register class, so we need custom operations in most cases.
436 setOperationAction(ISD::FABS, MVT::f128, Expand);
437 setOperationAction(ISD::FADD, MVT::f128, LibCall);
438 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
439 setOperationAction(ISD::FCOS, MVT::f128, Expand);
440 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
441 setOperationAction(ISD::FMA, MVT::f128, Expand);
442 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
443 setOperationAction(ISD::FNEG, MVT::f128, Expand);
444 setOperationAction(ISD::FPOW, MVT::f128, Expand);
445 setOperationAction(ISD::FREM, MVT::f128, Expand);
446 setOperationAction(ISD::FRINT, MVT::f128, Expand);
447 setOperationAction(ISD::FSIN, MVT::f128, Expand);
448 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
449 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
450 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
451 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
452 setOperationAction(ISD::SETCC, MVT::f128, Custom);
453 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
454 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
455 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
456 setOperationAction(ISD::SELECT, MVT::f128, Custom);
457 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
458 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
459 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
460 // aren't handled.
462 // Lowering for many of the conversions is actually specified by the non-f128
463 // type. The LowerXXX function will be trivial when f128 isn't involved.
464 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
465 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
466 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
467 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
468 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
469 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
470 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
471 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
472 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
473 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
474 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
475 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
476 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
477 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
478 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
479 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
480 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
481 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
482 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
483 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
484 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
485 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
486 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
487 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
488 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
489 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
490 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
491 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
492 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
493 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
495 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
496 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
497 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
498 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
500 // Variable arguments.
501 setOperationAction(ISD::VASTART, MVT::Other, Custom);
502 setOperationAction(ISD::VAARG, MVT::Other, Custom);
503 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
504 setOperationAction(ISD::VAEND, MVT::Other, Expand);
506 // Variable-sized objects.
507 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
508 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
510 if (Subtarget->isTargetWindows())
511 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
512 else
513 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
515 // Constant pool entries
516 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
518 // BlockAddress
519 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
521 // AArch64 lacks both left-rotate and popcount instructions.
522 setOperationAction(ISD::ROTL, MVT::i32, Expand);
523 setOperationAction(ISD::ROTL, MVT::i64, Expand);
524 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
525 setOperationAction(ISD::ROTL, VT, Expand);
526 setOperationAction(ISD::ROTR, VT, Expand);
529 // AArch64 doesn't have i32 MULH{S|U}.
530 setOperationAction(ISD::MULHU, MVT::i32, Expand);
531 setOperationAction(ISD::MULHS, MVT::i32, Expand);
533 // AArch64 doesn't have {U|S}MUL_LOHI.
534 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
535 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
537 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
538 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
539 setOperationAction(ISD::CTPOP, MVT::i128, Custom);
541 setOperationAction(ISD::PARITY, MVT::i64, Custom);
542 setOperationAction(ISD::PARITY, MVT::i128, Custom);
544 setOperationAction(ISD::ABS, MVT::i32, Custom);
545 setOperationAction(ISD::ABS, MVT::i64, Custom);
547 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
548 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
549 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
550 setOperationAction(ISD::SDIVREM, VT, Expand);
551 setOperationAction(ISD::UDIVREM, VT, Expand);
553 setOperationAction(ISD::SREM, MVT::i32, Expand);
554 setOperationAction(ISD::SREM, MVT::i64, Expand);
555 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
556 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
557 setOperationAction(ISD::UREM, MVT::i32, Expand);
558 setOperationAction(ISD::UREM, MVT::i64, Expand);
560 // Custom lower Add/Sub/Mul with overflow.
561 setOperationAction(ISD::SADDO, MVT::i32, Custom);
562 setOperationAction(ISD::SADDO, MVT::i64, Custom);
563 setOperationAction(ISD::UADDO, MVT::i32, Custom);
564 setOperationAction(ISD::UADDO, MVT::i64, Custom);
565 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
566 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
567 setOperationAction(ISD::USUBO, MVT::i32, Custom);
568 setOperationAction(ISD::USUBO, MVT::i64, Custom);
569 setOperationAction(ISD::SMULO, MVT::i32, Custom);
570 setOperationAction(ISD::SMULO, MVT::i64, Custom);
571 setOperationAction(ISD::UMULO, MVT::i32, Custom);
572 setOperationAction(ISD::UMULO, MVT::i64, Custom);
574 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
575 setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
576 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
577 setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
578 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
579 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
580 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
581 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
583 setOperationAction(ISD::FSIN, MVT::f32, Expand);
584 setOperationAction(ISD::FSIN, MVT::f64, Expand);
585 setOperationAction(ISD::FCOS, MVT::f32, Expand);
586 setOperationAction(ISD::FCOS, MVT::f64, Expand);
587 setOperationAction(ISD::FPOW, MVT::f32, Expand);
588 setOperationAction(ISD::FPOW, MVT::f64, Expand);
589 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
590 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
591 if (Subtarget->hasFullFP16())
592 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
593 else
594 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
596 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
597 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
598 ISD::FEXP, ISD::FEXP2, ISD::FLOG,
599 ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM,
600 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
601 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
602 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
603 setOperationAction(Op, MVT::f16, Promote);
604 setOperationAction(Op, MVT::v4f16, Expand);
605 setOperationAction(Op, MVT::v8f16, Expand);
608 if (!Subtarget->hasFullFP16()) {
609 for (auto Op :
610 {ISD::SETCC, ISD::SELECT_CC,
611 ISD::BR_CC, ISD::FADD, ISD::FSUB,
612 ISD::FMUL, ISD::FDIV, ISD::FMA,
613 ISD::FNEG, ISD::FABS, ISD::FCEIL,
614 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
615 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
616 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
617 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
618 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
619 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
620 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
621 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
622 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
623 ISD::STRICT_FMAXIMUM})
624 setOperationAction(Op, MVT::f16, Promote);
626 // Round-to-integer operations need custom lowering for fp16, as Promote
627 // doesn't work because the result type is integer.
628 for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
629 ISD::STRICT_LLRINT})
630 setOperationAction(Op, MVT::f16, Custom);
632 // promote v4f16 to v4f32 when that is known to be safe.
633 setOperationPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
634 setOperationPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
635 setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
636 setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
638 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
639 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
640 setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
641 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
642 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
643 setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
644 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
645 setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
646 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
647 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
648 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
649 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
650 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
651 setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
652 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
653 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
655 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
656 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
657 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
658 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
659 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
660 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
661 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
662 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
663 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
664 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
665 setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
666 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
667 setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
668 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
669 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
670 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
671 setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
672 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
673 setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
674 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
675 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
678 // AArch64 has implementations of a lot of rounding-like FP operations.
679 for (auto Op :
680 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
681 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
682 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
683 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
684 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
685 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
686 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
687 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
688 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
689 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
690 for (MVT Ty : {MVT::f32, MVT::f64})
691 setOperationAction(Op, Ty, Legal);
692 if (Subtarget->hasFullFP16())
693 setOperationAction(Op, MVT::f16, Legal);
696 // Basic strict FP operations are legal
697 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
698 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
699 for (MVT Ty : {MVT::f32, MVT::f64})
700 setOperationAction(Op, Ty, Legal);
701 if (Subtarget->hasFullFP16())
702 setOperationAction(Op, MVT::f16, Legal);
705 // Strict conversion to a larger type is legal
706 for (auto VT : {MVT::f32, MVT::f64})
707 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
709 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
711 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
712 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
714 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
715 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
716 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
717 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
718 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
720 // Generate outline atomics library calls only if LSE was not specified for
721 // the subtarget.
722 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
723 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
724 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
725 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
726 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
727 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
728 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
729 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
730 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
731 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
732 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
733 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
734 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
735 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
736 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
737 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
738 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
739 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
740 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
741 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
742 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
743 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
744 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
745 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
746 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
747 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
748 #define LCALLNAMES(A, B, N) \
749 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
750 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
751 setLibcallName(A##N##_REL, #B #N "_rel"); \
752 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
753 #define LCALLNAME4(A, B) \
754 LCALLNAMES(A, B, 1) \
755 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
756 #define LCALLNAME5(A, B) \
757 LCALLNAMES(A, B, 1) \
758 LCALLNAMES(A, B, 2) \
759 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
760 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
761 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
762 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
763 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
764 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
765 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
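// As an illustration, LCALLNAMES(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd, 4)
// expands to setLibcallName(RTLIB::OUTLINE_ATOMIC_LDADD4_RELAX,
// "__aarch64_ldadd4_relax") plus the _acq, _rel and _acq_rel variants, so each
// LCALLNAME4 above registers sixteen __aarch64_* outline-atomic helper names.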
766 #undef LCALLNAMES
767 #undef LCALLNAME4
768 #undef LCALLNAME5
771 // 128-bit loads and stores can be done without expanding
772 setOperationAction(ISD::LOAD, MVT::i128, Custom);
773 setOperationAction(ISD::STORE, MVT::i128, Custom);
775 // Aligned 128-bit loads and stores are single-copy atomic according to the
776 // v8.4a spec.
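// With LSE2 an aligned atomic i128 load or store can therefore be selected as a
// plain LDP/STP rather than a CAS loop or a library call.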
777 if (Subtarget->hasLSE2()) {
778 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
779 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
782 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
783 // custom lowering, as there are no un-paired non-temporal stores and
784 // legalization will break up 256 bit inputs.
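// For example, a v8i32 non-temporal store is emitted as a single STNP of two
// q registers instead of being split up by type legalization.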
785 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
786 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
787 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
788 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
789 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
790 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
791 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
793 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
794 // custom lowering, as there are no un-paired non-temporal loads and
795 // legalization will break up 256 bit inputs.
796 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
797 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
798 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
799 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
800 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
801 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
802 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
804 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
805 // This requires the Performance Monitors extension.
806 if (Subtarget->hasPerfMon())
807 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
809 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
810 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
811 // Issue __sincos_stret if available.
812 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
813 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
814 } else {
815 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
816 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
819 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
820 // MSVCRT doesn't have powi; fall back to pow
821 setLibcallName(RTLIB::POWI_F32, nullptr);
822 setLibcallName(RTLIB::POWI_F64, nullptr);
825 // Make floating-point constants legal for the large code model, so they don't
826 // become loads from the constant pool.
827 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
828 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
829 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
832 // AArch64 does not have floating-point extending loads, i1 sign-extending
833 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
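// e.g. an extending load from f32 to f64 is expanded into an ordinary f32 load
// followed by a separate FP_EXTEND (fcvt).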
834 for (MVT VT : MVT::fp_valuetypes()) {
835 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
836 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
837 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
838 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
840 for (MVT VT : MVT::integer_valuetypes())
841 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
843 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
844 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
845 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
846 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
847 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
848 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
849 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
851 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
852 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
853 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
855 // Indexed loads and stores are supported.
856 for (unsigned im = (unsigned)ISD::PRE_INC;
857 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
858 setIndexedLoadAction(im, MVT::i8, Legal);
859 setIndexedLoadAction(im, MVT::i16, Legal);
860 setIndexedLoadAction(im, MVT::i32, Legal);
861 setIndexedLoadAction(im, MVT::i64, Legal);
862 setIndexedLoadAction(im, MVT::f64, Legal);
863 setIndexedLoadAction(im, MVT::f32, Legal);
864 setIndexedLoadAction(im, MVT::f16, Legal);
865 setIndexedLoadAction(im, MVT::bf16, Legal);
866 setIndexedStoreAction(im, MVT::i8, Legal);
867 setIndexedStoreAction(im, MVT::i16, Legal);
868 setIndexedStoreAction(im, MVT::i32, Legal);
869 setIndexedStoreAction(im, MVT::i64, Legal);
870 setIndexedStoreAction(im, MVT::f64, Legal);
871 setIndexedStoreAction(im, MVT::f32, Legal);
872 setIndexedStoreAction(im, MVT::f16, Legal);
873 setIndexedStoreAction(im, MVT::bf16, Legal);
876 // Trap.
877 setOperationAction(ISD::TRAP, MVT::Other, Legal);
878 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
879 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
881 // We combine OR nodes for bitfield operations.
882 setTargetDAGCombine(ISD::OR);
883 // Try to create BICs for vector ANDs.
884 setTargetDAGCombine(ISD::AND);
886 // Vector add and sub nodes may conceal a high-half opportunity.
887 // Also, try to fold ADD into CSINC/CSINV.
888 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
889 ISD::UINT_TO_FP});
891 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
892 ISD::FP_TO_UINT_SAT, ISD::FDIV});
894 // Try and combine setcc with csel
895 setTargetDAGCombine(ISD::SETCC);
897 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
899 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
900 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
901 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
902 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
903 setTargetDAGCombine(ISD::LOAD);
905 setTargetDAGCombine(ISD::MSTORE);
907 setTargetDAGCombine(ISD::MUL);
909 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
911 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
912 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
913 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
915 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
917 setTargetDAGCombine(ISD::FP_EXTEND);
919 setTargetDAGCombine(ISD::GlobalAddress);
921 // In case of strict alignment, avoid an excessive number of byte wide stores.
922 MaxStoresPerMemsetOptSize = 8;
923 MaxStoresPerMemset =
924 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
926 MaxGluedStoresPerMemcpy = 4;
927 MaxStoresPerMemcpyOptSize = 4;
928 MaxStoresPerMemcpy =
929 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
931 MaxStoresPerMemmoveOptSize = 4;
932 MaxStoresPerMemmove = 4;
934 MaxLoadsPerMemcmpOptSize = 4;
935 MaxLoadsPerMemcmp =
936 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
938 setStackPointerRegisterToSaveRestore(AArch64::SP);
940 setSchedulingPreference(Sched::Hybrid);
942 EnableExtLdPromotion = true;
944 // Set required alignment.
945 setMinFunctionAlignment(Align(4));
946 // Set preferred alignments.
947 setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
948 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
949 setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
951 // Only change the limit for entries in a jump table if specified by
952 // the subtarget, but not at the command line.
953 unsigned MaxJT = STI.getMaximumJumpTableSize();
954 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
955 setMaximumJumpTableSize(MaxJT);
957 setHasExtractBitsInsn(true);
959 setMaxDivRemBitWidthSupported(128);
961 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
963 if (Subtarget->hasNEON()) {
964 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
965 // silliness like this:
966 for (auto Op :
967 {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
968 ISD::BR_CC, ISD::FADD, ISD::FSUB,
969 ISD::FMUL, ISD::FDIV, ISD::FMA,
970 ISD::FNEG, ISD::FABS, ISD::FCEIL,
971 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
972 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
973 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
974 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
975 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
976 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
977 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
978 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
979 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
980 ISD::STRICT_FMAXIMUM})
981 setOperationAction(Op, MVT::v1f64, Expand);
983 for (auto Op :
984 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
985 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
986 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
987 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
988 setOperationAction(Op, MVT::v1i64, Expand);
990 // AArch64 doesn't have direct vector -> f32 conversion instructions for
991 // elements smaller than i32, so promote the input to i32 first.
992 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
993 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
995 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
996 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
997 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
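// e.g. v4i32 -> v4f16 is lowered as an scvtf/ucvtf to v4f32 followed by an
// fcvtn down to v4f16.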
998 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
999 ISD::STRICT_UINT_TO_FP})
1000 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1001 setOperationAction(Op, VT, Custom);
1003 if (Subtarget->hasFullFP16()) {
1004 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
1006 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
1007 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1008 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
1009 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1010 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
1011 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1012 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1013 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1014 } else {
1015 // when AArch64 doesn't have fullfp16 support, promote the input
1016 // to i32 first.
1017 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1018 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1019 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1020 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1021 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1022 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1023 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1024 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
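// e.g. a v4i16 input is first extended to v4i32 and converted from there.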
1027 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1028 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1029 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1030 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1031 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1032 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1033 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1034 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1035 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1036 setOperationAction(ISD::UMAX, VT, Custom);
1037 setOperationAction(ISD::SMAX, VT, Custom);
1038 setOperationAction(ISD::UMIN, VT, Custom);
1039 setOperationAction(ISD::SMIN, VT, Custom);
1042 // AArch64 doesn't have MUL.2d:
1043 setOperationAction(ISD::MUL, MVT::v2i64, Expand);
1044 // Custom handling for some quad-vector types to detect MULL.
1045 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1046 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1047 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
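// e.g. a v2i64 multiply whose operands are sign-extended from v2i32 can be
// selected as a single smull instead of being scalarized.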
1049 // Saturates
1050 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1051 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1052 setOperationAction(ISD::SADDSAT, VT, Legal);
1053 setOperationAction(ISD::UADDSAT, VT, Legal);
1054 setOperationAction(ISD::SSUBSAT, VT, Legal);
1055 setOperationAction(ISD::USUBSAT, VT, Legal);
1058 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1059 MVT::v4i32}) {
1060 setOperationAction(ISD::AVGFLOORS, VT, Legal);
1061 setOperationAction(ISD::AVGFLOORU, VT, Legal);
1062 setOperationAction(ISD::AVGCEILS, VT, Legal);
1063 setOperationAction(ISD::AVGCEILU, VT, Legal);
1064 setOperationAction(ISD::ABDS, VT, Legal);
1065 setOperationAction(ISD::ABDU, VT, Legal);
1068 // Vector reductions
1069 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1070 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1071 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1072 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1073 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1075 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1078 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1079 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1080 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1081 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1082 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1083 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1084 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1086 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1088 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1089 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1090 // Likewise, narrowing and extending vector loads/stores aren't handled
1091 // directly.
1092 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1093 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1095 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1096 setOperationAction(ISD::MULHS, VT, Legal);
1097 setOperationAction(ISD::MULHU, VT, Legal);
1098 } else {
1099 setOperationAction(ISD::MULHS, VT, Expand);
1100 setOperationAction(ISD::MULHU, VT, Expand);
1102 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1103 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1105 setOperationAction(ISD::BSWAP, VT, Expand);
1106 setOperationAction(ISD::CTTZ, VT, Expand);
1108 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1109 setTruncStoreAction(VT, InnerVT, Expand);
1110 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1111 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1112 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1116 // AArch64 has implementations of a lot of rounding-like FP operations.
1117 for (auto Op :
1118 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1119 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1120 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1121 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1122 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1123 setOperationAction(Op, Ty, Legal);
1124 if (Subtarget->hasFullFP16())
1125 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1126 setOperationAction(Op, Ty, Legal);
1129 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1131 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1132 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1133 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1134 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1135 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1136 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1138 // ADDP custom lowering
1139 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1140 setOperationAction(ISD::ADD, VT, Custom);
1141 // FADDP custom lowering
1142 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1143 setOperationAction(ISD::FADD, VT, Custom);
1146 if (Subtarget->hasSME()) {
1147 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1150 // FIXME: Move lowering for more nodes here if those are common between
1151 // SVE and SME.
1152 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
1153 for (auto VT :
1154 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1155 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1156 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1160 if (Subtarget->hasSME())
1161 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1163 if (Subtarget->hasSVE()) {
1164 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1165 setOperationAction(ISD::BITREVERSE, VT, Custom);
1166 setOperationAction(ISD::BSWAP, VT, Custom);
1167 setOperationAction(ISD::CTLZ, VT, Custom);
1168 setOperationAction(ISD::CTPOP, VT, Custom);
1169 setOperationAction(ISD::CTTZ, VT, Custom);
1170 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1171 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1172 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1173 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1174 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1175 setOperationAction(ISD::MGATHER, VT, Custom);
1176 setOperationAction(ISD::MSCATTER, VT, Custom);
1177 setOperationAction(ISD::MLOAD, VT, Custom);
1178 setOperationAction(ISD::MUL, VT, Custom);
1179 setOperationAction(ISD::MULHS, VT, Custom);
1180 setOperationAction(ISD::MULHU, VT, Custom);
1181 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1182 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1183 setOperationAction(ISD::SELECT, VT, Custom);
1184 setOperationAction(ISD::SETCC, VT, Custom);
1185 setOperationAction(ISD::SDIV, VT, Custom);
1186 setOperationAction(ISD::UDIV, VT, Custom);
1187 setOperationAction(ISD::SMIN, VT, Custom);
1188 setOperationAction(ISD::UMIN, VT, Custom);
1189 setOperationAction(ISD::SMAX, VT, Custom);
1190 setOperationAction(ISD::UMAX, VT, Custom);
1191 setOperationAction(ISD::SHL, VT, Custom);
1192 setOperationAction(ISD::SRL, VT, Custom);
1193 setOperationAction(ISD::SRA, VT, Custom);
1194 setOperationAction(ISD::ABS, VT, Custom);
1195 setOperationAction(ISD::ABDS, VT, Custom);
1196 setOperationAction(ISD::ABDU, VT, Custom);
1197 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1198 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1199 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1200 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1201 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1202 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1203 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1204 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1205 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1207 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1208 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1209 setOperationAction(ISD::SELECT_CC, VT, Expand);
1210 setOperationAction(ISD::ROTL, VT, Expand);
1211 setOperationAction(ISD::ROTR, VT, Expand);
1213 setOperationAction(ISD::SADDSAT, VT, Legal);
1214 setOperationAction(ISD::UADDSAT, VT, Legal);
1215 setOperationAction(ISD::SSUBSAT, VT, Legal);
1216 setOperationAction(ISD::USUBSAT, VT, Legal);
1217 setOperationAction(ISD::UREM, VT, Expand);
1218 setOperationAction(ISD::SREM, VT, Expand);
1219 setOperationAction(ISD::SDIVREM, VT, Expand);
1220 setOperationAction(ISD::UDIVREM, VT, Expand);
1223 // Illegal unpacked integer vector types.
1224 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1225 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1226 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1229 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1230 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1231 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1232 setOperationAction(ISD::BITCAST, VT, Custom);
1234 for (auto VT :
1235 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1236 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1237 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1239 for (auto VT :
1240 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1241 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1242 setOperationAction(ISD::SELECT, VT, Custom);
1243 setOperationAction(ISD::SETCC, VT, Custom);
1244 setOperationAction(ISD::TRUNCATE, VT, Custom);
1245 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1246 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1247 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1249 setOperationAction(ISD::SELECT_CC, VT, Expand);
1250 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1251 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1253 // There are no legal MVT::nxv16f## based types.
1254 if (VT != MVT::nxv16i1) {
1255 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1256 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
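// (an fp conversion from nxv16i1 would need an nxv16 floating-point result
// type, for which there is no legal SVE container)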
1260 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1261 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1262 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1263 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1264 setOperationAction(ISD::MLOAD, VT, Custom);
1265 setOperationAction(ISD::MSTORE, VT, Custom);
1266 setOperationAction(ISD::MGATHER, VT, Custom);
1267 setOperationAction(ISD::MSCATTER, VT, Custom);
1270 // Firstly, exclude all scalable vector extending loads/truncating stores,
1271 // covering both integer and floating-point scalable vectors.
1272 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1273 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1274 setTruncStoreAction(VT, InnerVT, Expand);
1275 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1276 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1277 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1281 // Then, selectively enable those which we directly support.
1282 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1283 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1284 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1285 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1286 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1287 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1288 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1289 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1290 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1291 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1292 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1293 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1294 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1297 // SVE supports truncating stores of 64 and 128-bit vectors
1298 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1299 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1300 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1301 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1302 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1304 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1305 MVT::nxv4f32, MVT::nxv2f64}) {
1306 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1307 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1308 setOperationAction(ISD::MGATHER, VT, Custom);
1309 setOperationAction(ISD::MSCATTER, VT, Custom);
1310 setOperationAction(ISD::MLOAD, VT, Custom);
1311 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1312 setOperationAction(ISD::SELECT, VT, Custom);
1313 setOperationAction(ISD::FADD, VT, Custom);
1314 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1315 setOperationAction(ISD::FDIV, VT, Custom);
1316 setOperationAction(ISD::FMA, VT, Custom);
1317 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1318 setOperationAction(ISD::FMAXNUM, VT, Custom);
1319 setOperationAction(ISD::FMINIMUM, VT, Custom);
1320 setOperationAction(ISD::FMINNUM, VT, Custom);
1321 setOperationAction(ISD::FMUL, VT, Custom);
1322 setOperationAction(ISD::FNEG, VT, Custom);
1323 setOperationAction(ISD::FSUB, VT, Custom);
1324 setOperationAction(ISD::FCEIL, VT, Custom);
1325 setOperationAction(ISD::FFLOOR, VT, Custom);
1326 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1327 setOperationAction(ISD::FRINT, VT, Custom);
1328 setOperationAction(ISD::FROUND, VT, Custom);
1329 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1330 setOperationAction(ISD::FTRUNC, VT, Custom);
1331 setOperationAction(ISD::FSQRT, VT, Custom);
1332 setOperationAction(ISD::FABS, VT, Custom);
1333 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1334 setOperationAction(ISD::FP_ROUND, VT, Custom);
1335 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1336 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1337 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1338 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1339 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1341 setOperationAction(ISD::SELECT_CC, VT, Expand);
1342 setOperationAction(ISD::FREM, VT, Expand);
1343 setOperationAction(ISD::FPOW, VT, Expand);
1344 setOperationAction(ISD::FPOWI, VT, Expand);
1345 setOperationAction(ISD::FCOS, VT, Expand);
1346 setOperationAction(ISD::FSIN, VT, Expand);
1347 setOperationAction(ISD::FSINCOS, VT, Expand);
1348 setOperationAction(ISD::FEXP, VT, Expand);
1349 setOperationAction(ISD::FEXP2, VT, Expand);
1350 setOperationAction(ISD::FLOG, VT, Expand);
1351 setOperationAction(ISD::FLOG2, VT, Expand);
1352 setOperationAction(ISD::FLOG10, VT, Expand);
1354 setCondCodeAction(ISD::SETO, VT, Expand);
1355 setCondCodeAction(ISD::SETOLT, VT, Expand);
1356 setCondCodeAction(ISD::SETLT, VT, Expand);
1357 setCondCodeAction(ISD::SETOLE, VT, Expand);
1358 setCondCodeAction(ISD::SETLE, VT, Expand);
1359 setCondCodeAction(ISD::SETULT, VT, Expand);
1360 setCondCodeAction(ISD::SETULE, VT, Expand);
1361 setCondCodeAction(ISD::SETUGE, VT, Expand);
1362 setCondCodeAction(ISD::SETUGT, VT, Expand);
1363 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1364 setCondCodeAction(ISD::SETONE, VT, Expand);
1367 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1368 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1369 setOperationAction(ISD::MGATHER, VT, Custom);
1370 setOperationAction(ISD::MSCATTER, VT, Custom);
1371 setOperationAction(ISD::MLOAD, VT, Custom);
1372 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1373 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1376 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1377 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1379 // NEON doesn't support integer divides, but SVE does
1380 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1381 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1382 setOperationAction(ISD::SDIV, VT, Custom);
1383 setOperationAction(ISD::UDIV, VT, Custom);
1386 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1387 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1388 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1390 // NEON doesn't support across-vector reductions, but SVE does.
1391 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1392 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1394 // NOTE: Currently this has to happen after computeRegisterProperties rather
1395 // than the preferred option of combining it with the addRegisterClass call.
1396 if (Subtarget->useSVEForFixedLengthVectors()) {
1397 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1398 if (useSVEForFixedLengthVectorVT(VT))
1399 addTypeForFixedLengthSVE(VT);
1400 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1401 if (useSVEForFixedLengthVectorVT(VT))
1402 addTypeForFixedLengthSVE(VT);
1404 // A 64-bit result can come from an input wider than NEON supports.
1405 for (auto VT : {MVT::v8i8, MVT::v4i16})
1406 setOperationAction(ISD::TRUNCATE, VT, Custom);
1407 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1409 // A 128-bit result implies an input wider than NEON supports.
1410 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1411 setOperationAction(ISD::TRUNCATE, VT, Custom);
1412 for (auto VT : {MVT::v8f16, MVT::v4f32})
1413 setOperationAction(ISD::FP_ROUND, VT, Custom);
1415 // These operations are not supported on NEON but SVE can do them.
1416 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1417 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1418 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1419 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1420 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1421 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1422 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1423 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1424 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1425 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1426 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1427 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1428 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1429 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1430 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1431 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1432 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1433 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1434 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1435 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1437 // Int operations with no NEON support.
1438 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1439 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1440 setOperationAction(ISD::BITREVERSE, VT, Custom);
1441 setOperationAction(ISD::CTTZ, VT, Custom);
1442 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1443 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1444 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1448 // Use SVE for vectors with more than 2 elements.
1449 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1450 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1453 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1454 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1455 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1456 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
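// Editorial note (an assumption about the generic legalizer, not from the
// original source): promoting VECTOR_SPLICE for predicate types means the
// splice is expected to be performed in the matching integer vector type
// (e.g. an nxv16i1 splice is widened to nxv16i8, spliced there, and the
// result truncated back to a predicate).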
1458 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1461 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1462 // Only required for llvm.aarch64.mops.memset.tag
1463 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1466 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1468 IsStrictFPEnabled = true;
1471 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1472 assert(VT.isVector() && "VT should be a vector type");
1474 if (VT.isFloatingPoint()) {
1475 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1476 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1477 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1480 // Mark vector float intrinsics as expand.
1481 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1482 setOperationAction(ISD::FSIN, VT, Expand);
1483 setOperationAction(ISD::FCOS, VT, Expand);
1484 setOperationAction(ISD::FPOW, VT, Expand);
1485 setOperationAction(ISD::FLOG, VT, Expand);
1486 setOperationAction(ISD::FLOG2, VT, Expand);
1487 setOperationAction(ISD::FLOG10, VT, Expand);
1488 setOperationAction(ISD::FEXP, VT, Expand);
1489 setOperationAction(ISD::FEXP2, VT, Expand);
1492 // But we do support custom-lowering for FCOPYSIGN.
1493 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1494 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1495 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1497 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1498 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1499 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1500 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1501 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1502 setOperationAction(ISD::SRA, VT, Custom);
1503 setOperationAction(ISD::SRL, VT, Custom);
1504 setOperationAction(ISD::SHL, VT, Custom);
1505 setOperationAction(ISD::OR, VT, Custom);
1506 setOperationAction(ISD::SETCC, VT, Custom);
1507 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1509 setOperationAction(ISD::SELECT, VT, Expand);
1510 setOperationAction(ISD::SELECT_CC, VT, Expand);
1511 setOperationAction(ISD::VSELECT, VT, Expand);
1512 for (MVT InnerVT : MVT::all_valuetypes())
1513 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1515 // CNT only supports byte (B) element sizes; use UADDLP to widen to larger elements.
1516 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1517 setOperationAction(ISD::CTPOP, VT, Custom);
1519 setOperationAction(ISD::UDIV, VT, Expand);
1520 setOperationAction(ISD::SDIV, VT, Expand);
1521 setOperationAction(ISD::UREM, VT, Expand);
1522 setOperationAction(ISD::SREM, VT, Expand);
1523 setOperationAction(ISD::FREM, VT, Expand);
1525 for (unsigned Opcode :
1526 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1527 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1528 setOperationAction(Opcode, VT, Custom);
1530 if (!VT.isFloatingPoint())
1531 setOperationAction(ISD::ABS, VT, Legal);
1533 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1534 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1535 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1536 setOperationAction(Opcode, VT, Legal);
1538 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1539 // NEON types.
1540 if (VT.isFloatingPoint() &&
1541 VT.getVectorElementType() != MVT::bf16 &&
1542 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1543 for (unsigned Opcode :
1544 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1545 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1546 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1547 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1548 ISD::STRICT_FSQRT})
1549 setOperationAction(Opcode, VT, Legal);
1551 // Strict fp extend and trunc are legal
1552 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1553 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1554 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1555 setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1557 // FIXME: We could potentially make use of the vector comparison instructions
1558 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1559 // complications:
1560 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1561 // so we would need to expand when the condition code doesn't match the
1562 // kind of comparison.
1563 // * Some kinds of comparison require more than one FCMXY instruction so
1564 // would need to be expanded instead.
1565 // * The lowering of the non-strict versions involves target-specific ISD
1566 // nodes so we would likely need to add strict versions of all of them and
1567 // handle them appropriately.
1568 setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
1569 setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1571 if (Subtarget->isLittleEndian()) {
1572 for (unsigned im = (unsigned)ISD::PRE_INC;
1573 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1574 setIndexedLoadAction(im, VT, Legal);
1575 setIndexedStoreAction(im, VT, Legal);
1580 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1581 EVT OpVT) const {
1582 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1583 if (!Subtarget->hasSVE())
1584 return true;
1586 // We can only support legal predicate result types. We can use the SVE
1587 // whilelo instruction for generating fixed-width predicates too.
1588 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1589 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1590 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1591 return true;
1593 // The whilelo instruction only works with i32 or i64 scalar inputs.
1594 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1595 return true;
1597 return false;
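// Editorial sketch (not from the original source): a call such as
//   %m = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n)
// passes all of the checks above and is expected to select to a single
//   whilelo p0.s, x0, x1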
1600 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1601 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1603 // By default everything must be expanded.
1604 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1605 setOperationAction(Op, VT, Expand);
1607 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1608 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1610 if (VT.isFloatingPoint()) {
1611 setCondCodeAction(ISD::SETO, VT, Expand);
1612 setCondCodeAction(ISD::SETOLT, VT, Expand);
1613 setCondCodeAction(ISD::SETLT, VT, Expand);
1614 setCondCodeAction(ISD::SETOLE, VT, Expand);
1615 setCondCodeAction(ISD::SETLE, VT, Expand);
1616 setCondCodeAction(ISD::SETULT, VT, Expand);
1617 setCondCodeAction(ISD::SETULE, VT, Expand);
1618 setCondCodeAction(ISD::SETUGE, VT, Expand);
1619 setCondCodeAction(ISD::SETUGT, VT, Expand);
1620 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1621 setCondCodeAction(ISD::SETONE, VT, Expand);
1624 // Mark integer truncating stores/extending loads as having custom lowering
1625 if (VT.isInteger()) {
1626 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1627 while (InnerVT != VT) {
1628 setTruncStoreAction(VT, InnerVT, Custom);
1629 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1630 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1631 InnerVT = InnerVT.changeVectorElementType(
1632 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
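// Editorial worked example: for VT == v8i32 the loop above visits
// InnerVT == v8i8 and v8i16, marking the v8i32 -> v8i8/v8i16 truncating
// stores and the corresponding sign/zero-extending loads as Custom.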
1636 // Mark floating-point truncating stores/extending loads as having custom
1637 // lowering
1638 if (VT.isFloatingPoint()) {
1639 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1640 while (InnerVT != VT) {
1641 setTruncStoreAction(VT, InnerVT, Custom);
1642 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1643 InnerVT = InnerVT.changeVectorElementType(
1644 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1648 // Lower fixed length vector operations to scalable equivalents.
1649 setOperationAction(ISD::ABS, VT, Custom);
1650 setOperationAction(ISD::ADD, VT, Custom);
1651 setOperationAction(ISD::AND, VT, Custom);
1652 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1653 setOperationAction(ISD::BITCAST, VT, Custom);
1654 setOperationAction(ISD::BITREVERSE, VT, Custom);
1655 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1656 setOperationAction(ISD::BSWAP, VT, Custom);
1657 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1658 setOperationAction(ISD::CTLZ, VT, Custom);
1659 setOperationAction(ISD::CTPOP, VT, Custom);
1660 setOperationAction(ISD::CTTZ, VT, Custom);
1661 setOperationAction(ISD::FABS, VT, Custom);
1662 setOperationAction(ISD::FADD, VT, Custom);
1663 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1664 setOperationAction(ISD::FCEIL, VT, Custom);
1665 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1666 setOperationAction(ISD::FDIV, VT, Custom);
1667 setOperationAction(ISD::FFLOOR, VT, Custom);
1668 setOperationAction(ISD::FMA, VT, Custom);
1669 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1670 setOperationAction(ISD::FMAXNUM, VT, Custom);
1671 setOperationAction(ISD::FMINIMUM, VT, Custom);
1672 setOperationAction(ISD::FMINNUM, VT, Custom);
1673 setOperationAction(ISD::FMUL, VT, Custom);
1674 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1675 setOperationAction(ISD::FNEG, VT, Custom);
1676 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1677 setOperationAction(ISD::FP_ROUND, VT, Custom);
1678 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1679 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1680 setOperationAction(ISD::FRINT, VT, Custom);
1681 setOperationAction(ISD::FROUND, VT, Custom);
1682 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1683 setOperationAction(ISD::FSQRT, VT, Custom);
1684 setOperationAction(ISD::FSUB, VT, Custom);
1685 setOperationAction(ISD::FTRUNC, VT, Custom);
1686 setOperationAction(ISD::LOAD, VT, Custom);
1687 setOperationAction(ISD::MGATHER, VT, Custom);
1688 setOperationAction(ISD::MLOAD, VT, Custom);
1689 setOperationAction(ISD::MSCATTER, VT, Custom);
1690 setOperationAction(ISD::MSTORE, VT, Custom);
1691 setOperationAction(ISD::MUL, VT, Custom);
1692 setOperationAction(ISD::MULHS, VT, Custom);
1693 setOperationAction(ISD::MULHU, VT, Custom);
1694 setOperationAction(ISD::OR, VT, Custom);
1695 setOperationAction(ISD::SDIV, VT, Custom);
1696 setOperationAction(ISD::SELECT, VT, Custom);
1697 setOperationAction(ISD::SETCC, VT, Custom);
1698 setOperationAction(ISD::SHL, VT, Custom);
1699 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1700 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1701 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1702 setOperationAction(ISD::SMAX, VT, Custom);
1703 setOperationAction(ISD::SMIN, VT, Custom);
1704 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1705 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1706 setOperationAction(ISD::SRA, VT, Custom);
1707 setOperationAction(ISD::SRL, VT, Custom);
1708 setOperationAction(ISD::STORE, VT, Custom);
1709 setOperationAction(ISD::SUB, VT, Custom);
1710 setOperationAction(ISD::TRUNCATE, VT, Custom);
1711 setOperationAction(ISD::UDIV, VT, Custom);
1712 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1713 setOperationAction(ISD::UMAX, VT, Custom);
1714 setOperationAction(ISD::UMIN, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1719 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1720 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1721 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1722 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1723 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1724 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1725 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1726 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1727 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1728 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1729 setOperationAction(ISD::VSELECT, VT, Custom);
1730 setOperationAction(ISD::XOR, VT, Custom);
1731 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1734 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1735 addRegisterClass(VT, &AArch64::FPR64RegClass);
1736 addTypeForNEON(VT);
1739 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1740 addRegisterClass(VT, &AArch64::FPR128RegClass);
1741 addTypeForNEON(VT);
1744 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1745 LLVMContext &C, EVT VT) const {
1746 if (!VT.isVector())
1747 return MVT::i32;
1748 if (VT.isScalableVector())
1749 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1750 return VT.changeVectorElementTypeToInteger();
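// Editorial examples, derived from the three cases above: a scalar compare
// produces MVT::i32, a setcc on nxv4i32 produces nxv4i1, and a setcc on
// v4f32 produces v4i32.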
1753 // isIntImmediate - This method tests to see if the node is a constant
1754 // operand. If so, Imm will receive the value.
1755 static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
1756 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
1757 Imm = C->getZExtValue();
1758 return true;
1760 return false;
1763 // isOpcWithIntImmediate - This method tests to see if the node is a specific
1764 // opcode and that it has an immediate integer right operand.
1765 // If so, Imm will receive the value.
1766 static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
1767 uint64_t &Imm) {
1768 return N->getOpcode() == Opc &&
1769 isIntImmediate(N->getOperand(1).getNode(), Imm);
1772 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1773 const APInt &Demanded,
1774 TargetLowering::TargetLoweringOpt &TLO,
1775 unsigned NewOpc) {
1776 uint64_t OldImm = Imm, NewImm, Enc;
1777 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1779 // Return if the immediate is already all zeros, all ones, a bimm32 or a
1780 // bimm64.
1781 if (Imm == 0 || Imm == Mask ||
1782 AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1783 return false;
1785 unsigned EltSize = Size;
1786 uint64_t DemandedBits = Demanded.getZExtValue();
1788 // Clear bits that are not demanded.
1789 Imm &= DemandedBits;
1791 while (true) {
1792 // The goal here is to set the non-demanded bits in a way that minimizes
1793 // the number of transitions between 0 and 1. To achieve this, we set each
1794 // non-demanded bit to the value of the preceding demanded bit.
1795 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1796 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1797 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1798 // The final result is 0b11000011.
1799 uint64_t NonDemandedBits = ~DemandedBits;
1800 uint64_t InvertedImm = ~Imm & DemandedBits;
1801 uint64_t RotatedImm =
1802 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1803 NonDemandedBits;
1804 uint64_t Sum = RotatedImm + NonDemandedBits;
1805 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1806 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1807 NewImm = (Imm | Ones) & Mask;
1809 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1810 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1811 // we halve the element size and continue the search.
1812 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1813 break;
1815 // We cannot shrink the element size any further if it is 2-bits.
1816 if (EltSize == 2)
1817 return false;
1819 EltSize /= 2;
1820 Mask >>= EltSize;
1821 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1823 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1824 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1825 return false;
1827 // Merge the upper and lower halves of Imm and DemandedBits.
1828 Imm |= Hi;
1829 DemandedBits |= DemandedBitsHi;
1832 ++NumOptimizedImms;
1834 // Replicate the element across the register width.
1835 while (EltSize < Size) {
1836 NewImm |= NewImm << EltSize;
1837 EltSize *= 2;
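// Editorial trace (not from the original source): continuing the 8-bit
// example above, NewImm == 0b11000011 (0xC3) with Size == 32 replicates to
// 0xC3C3C3C3, which is encodable as a logical immediate.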
1840 (void)OldImm;
1841 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1842 "demanded bits should never be altered");
1843 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1845 // Create the new constant immediate node.
1846 EVT VT = Op.getValueType();
1847 SDLoc DL(Op);
1848 SDValue New;
1850 // If the new constant immediate is all-zeros or all-ones, let the target
1851 // independent DAG combine optimize this node.
1852 if (NewImm == 0 || NewImm == OrigMask) {
1853 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1854 TLO.DAG.getConstant(NewImm, DL, VT));
1855 // Otherwise, create a machine node so that target independent DAG combine
1856 // doesn't undo this optimization.
1857 } else {
1858 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1859 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1860 New = SDValue(
1861 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1864 return TLO.CombineTo(Op, New);
1867 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1868 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1869 TargetLoweringOpt &TLO) const {
1870 // Delay this optimization to as late as possible.
1871 if (!TLO.LegalOps)
1872 return false;
1874 if (!EnableOptimizeLogicalImm)
1875 return false;
1877 EVT VT = Op.getValueType();
1878 if (VT.isVector())
1879 return false;
1881 unsigned Size = VT.getSizeInBits();
1882 assert((Size == 32 || Size == 64) &&
1883 "i32 or i64 is expected after legalization.");
1885 // Exit early if we demand all bits.
1886 if (DemandedBits.countPopulation() == Size)
1887 return false;
1889 unsigned NewOpc;
1890 switch (Op.getOpcode()) {
1891 default:
1892 return false;
1893 case ISD::AND:
1894 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1895 break;
1896 case ISD::OR:
1897 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1898 break;
1899 case ISD::XOR:
1900 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1901 break;
1903 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1904 if (!C)
1905 return false;
1906 uint64_t Imm = C->getZExtValue();
1907 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
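// Editorial note: the net effect is that a bitwise-op constant which is not
// itself encodable may, after optimizeLogicalImm repopulates its
// non-demanded bits, become a valid logical immediate, letting ISel emit a
// single ANDWri/ORRWri/EORWri (or the 64-bit X-register form) instead of
// materializing the constant in a register first.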
1910 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1911 /// Mask are known to be either zero or one and return them in Known.
1912 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1913 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
1914 const SelectionDAG &DAG, unsigned Depth) const {
1915 switch (Op.getOpcode()) {
1916 default:
1917 break;
1918 case AArch64ISD::DUP: {
1919 SDValue SrcOp = Op.getOperand(0);
1920 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
1921 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
1922 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
1923 "Expected DUP implicit truncation");
1924 Known = Known.trunc(Op.getScalarValueSizeInBits());
1926 break;
1928 case AArch64ISD::CSEL: {
1929 KnownBits Known2;
1930 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1931 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1932 Known = KnownBits::commonBits(Known, Known2);
1933 break;
1935 case AArch64ISD::BICi: {
1936 // Compute the bit cleared value.
1937 uint64_t Mask =
1938 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1939 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1940 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
1941 break;
1943 case AArch64ISD::VLSHR: {
1944 KnownBits Known2;
1945 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1946 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1947 Known = KnownBits::lshr(Known, Known2);
1948 break;
1950 case AArch64ISD::VASHR: {
1951 KnownBits Known2;
1952 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1953 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1954 Known = KnownBits::ashr(Known, Known2);
1955 break;
1957 case AArch64ISD::LOADgot:
1958 case AArch64ISD::ADDlow: {
1959 if (!Subtarget->isTargetILP32())
1960 break;
1961 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1962 Known.Zero = APInt::getHighBitsSet(64, 32);
1963 break;
1965 case AArch64ISD::ASSERT_ZEXT_BOOL: {
1966 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1967 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1968 break;
1970 case ISD::INTRINSIC_W_CHAIN: {
1971 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1972 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1973 switch (IntID) {
1974 default: return;
1975 case Intrinsic::aarch64_ldaxr:
1976 case Intrinsic::aarch64_ldxr: {
1977 unsigned BitWidth = Known.getBitWidth();
1978 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1979 unsigned MemBits = VT.getScalarSizeInBits();
1980 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1981 return;
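      // Editorial example: an 8-bit ldxrb/ldaxrb zero-extends into the
      // intrinsic's 64-bit result, so the upper 56 bits are known zero.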
1984 break;
1986 case ISD::INTRINSIC_WO_CHAIN:
1987 case ISD::INTRINSIC_VOID: {
1988 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1989 switch (IntNo) {
1990 default:
1991 break;
1992 case Intrinsic::aarch64_neon_umaxv:
1993 case Intrinsic::aarch64_neon_uminv: {
1994 // Figure out the datatype of the vector operand. The UMINV instruction
1995 // will zero-extend the result, so we can mark as known zero all the
1996 // bits larger than the element datatype. 32-bit or larger types don't need
1997 // this as those are legal types and will be handled by isel directly.
1998 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1999 unsigned BitWidth = Known.getBitWidth();
2000 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2001 assert(BitWidth >= 8 && "Unexpected width!");
2002 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2003 Known.Zero |= Mask;
2004 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2005 assert(BitWidth >= 16 && "Unexpected width!");
2006 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2007 Known.Zero |= Mask;
2009 break;
2010 } break;
2016 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2017 EVT) const {
2018 return MVT::i64;
2021 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2022 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2023 bool *Fast) const {
2024 if (Subtarget->requiresStrictAlign())
2025 return false;
2027 if (Fast) {
2028 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2029 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2030 // See comments in performSTORECombine() for more details about
2031 // these conditions.
2033 // Code that uses clang vector extensions can mark that it
2034 // wants unaligned accesses to be treated as fast by
2035 // underspecifying alignment to be 1 or 2.
2036 Alignment <= 2 ||
2038 // Disregard v2i64. Memcpy lowering produces those and splitting
2039 // them regresses performance on micro-benchmarks and olden/bh.
2040 VT == MVT::v2i64;
2042 return true;
2045 // Same as above but handling LLTs instead.
2046 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2047 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2048 bool *Fast) const {
2049 if (Subtarget->requiresStrictAlign())
2050 return false;
2052 if (Fast) {
2053 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2054 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2055 Ty.getSizeInBytes() != 16 ||
2056 // See comments in performSTORECombine() for more details about
2057 // these conditions.
2059 // Code that uses clang vector extensions can mark that it
2060 // wants unaligned accesses to be treated as fast by
2061 // underspecifying alignment to be 1 or 2.
2062 Alignment <= 2 ||
2064 // Disregard v2i64. Memcpy lowering produces those and splitting
2065 // them regresses performance on micro-benchmarks and olden/bh.
2066 Ty == LLT::fixed_vector(2, 64);
2068 return true;
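// Editorial example: on a subtarget where misaligned 128-bit stores are slow,
// a 16-byte store with alignment 4 is still permitted (the function returns
// true) but *Fast is reported as false.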
2071 FastISel *
2072 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2073 const TargetLibraryInfo *libInfo) const {
2074 return AArch64::createFastISel(funcInfo, libInfo);
2077 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2078 #define MAKE_CASE(V) \
2079 case V: \
2080 return #V;
2081 switch ((AArch64ISD::NodeType)Opcode) {
2082 case AArch64ISD::FIRST_NUMBER:
2083 break;
2084 MAKE_CASE(AArch64ISD::OBSCURE_COPY)
2085 MAKE_CASE(AArch64ISD::SMSTART)
2086 MAKE_CASE(AArch64ISD::SMSTOP)
2087 MAKE_CASE(AArch64ISD::RESTORE_ZA)
2088 MAKE_CASE(AArch64ISD::CALL)
2089 MAKE_CASE(AArch64ISD::ADRP)
2090 MAKE_CASE(AArch64ISD::ADR)
2091 MAKE_CASE(AArch64ISD::ADDlow)
2092 MAKE_CASE(AArch64ISD::LOADgot)
2093 MAKE_CASE(AArch64ISD::RET_FLAG)
2094 MAKE_CASE(AArch64ISD::BRCOND)
2095 MAKE_CASE(AArch64ISD::CSEL)
2096 MAKE_CASE(AArch64ISD::CSINV)
2097 MAKE_CASE(AArch64ISD::CSNEG)
2098 MAKE_CASE(AArch64ISD::CSINC)
2099 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2100 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2101 MAKE_CASE(AArch64ISD::ABDS_PRED)
2102 MAKE_CASE(AArch64ISD::ABDU_PRED)
2103 MAKE_CASE(AArch64ISD::MUL_PRED)
2104 MAKE_CASE(AArch64ISD::MULHS_PRED)
2105 MAKE_CASE(AArch64ISD::MULHU_PRED)
2106 MAKE_CASE(AArch64ISD::SDIV_PRED)
2107 MAKE_CASE(AArch64ISD::SHL_PRED)
2108 MAKE_CASE(AArch64ISD::SMAX_PRED)
2109 MAKE_CASE(AArch64ISD::SMIN_PRED)
2110 MAKE_CASE(AArch64ISD::SRA_PRED)
2111 MAKE_CASE(AArch64ISD::SRL_PRED)
2112 MAKE_CASE(AArch64ISD::UDIV_PRED)
2113 MAKE_CASE(AArch64ISD::UMAX_PRED)
2114 MAKE_CASE(AArch64ISD::UMIN_PRED)
2115 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2116 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2117 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2118 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2119 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2120 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2121 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2122 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2123 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2124 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2125 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2126 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2127 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2128 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2129 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2130 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2131 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2132 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2133 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2134 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2135 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2136 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2137 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2138 MAKE_CASE(AArch64ISD::ADC)
2139 MAKE_CASE(AArch64ISD::SBC)
2140 MAKE_CASE(AArch64ISD::ADDS)
2141 MAKE_CASE(AArch64ISD::SUBS)
2142 MAKE_CASE(AArch64ISD::ADCS)
2143 MAKE_CASE(AArch64ISD::SBCS)
2144 MAKE_CASE(AArch64ISD::ANDS)
2145 MAKE_CASE(AArch64ISD::CCMP)
2146 MAKE_CASE(AArch64ISD::CCMN)
2147 MAKE_CASE(AArch64ISD::FCCMP)
2148 MAKE_CASE(AArch64ISD::FCMP)
2149 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2150 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2151 MAKE_CASE(AArch64ISD::DUP)
2152 MAKE_CASE(AArch64ISD::DUPLANE8)
2153 MAKE_CASE(AArch64ISD::DUPLANE16)
2154 MAKE_CASE(AArch64ISD::DUPLANE32)
2155 MAKE_CASE(AArch64ISD::DUPLANE64)
2156 MAKE_CASE(AArch64ISD::DUPLANE128)
2157 MAKE_CASE(AArch64ISD::MOVI)
2158 MAKE_CASE(AArch64ISD::MOVIshift)
2159 MAKE_CASE(AArch64ISD::MOVIedit)
2160 MAKE_CASE(AArch64ISD::MOVImsl)
2161 MAKE_CASE(AArch64ISD::FMOV)
2162 MAKE_CASE(AArch64ISD::MVNIshift)
2163 MAKE_CASE(AArch64ISD::MVNImsl)
2164 MAKE_CASE(AArch64ISD::BICi)
2165 MAKE_CASE(AArch64ISD::ORRi)
2166 MAKE_CASE(AArch64ISD::BSP)
2167 MAKE_CASE(AArch64ISD::EXTR)
2168 MAKE_CASE(AArch64ISD::ZIP1)
2169 MAKE_CASE(AArch64ISD::ZIP2)
2170 MAKE_CASE(AArch64ISD::UZP1)
2171 MAKE_CASE(AArch64ISD::UZP2)
2172 MAKE_CASE(AArch64ISD::TRN1)
2173 MAKE_CASE(AArch64ISD::TRN2)
2174 MAKE_CASE(AArch64ISD::REV16)
2175 MAKE_CASE(AArch64ISD::REV32)
2176 MAKE_CASE(AArch64ISD::REV64)
2177 MAKE_CASE(AArch64ISD::EXT)
2178 MAKE_CASE(AArch64ISD::SPLICE)
2179 MAKE_CASE(AArch64ISD::VSHL)
2180 MAKE_CASE(AArch64ISD::VLSHR)
2181 MAKE_CASE(AArch64ISD::VASHR)
2182 MAKE_CASE(AArch64ISD::VSLI)
2183 MAKE_CASE(AArch64ISD::VSRI)
2184 MAKE_CASE(AArch64ISD::CMEQ)
2185 MAKE_CASE(AArch64ISD::CMGE)
2186 MAKE_CASE(AArch64ISD::CMGT)
2187 MAKE_CASE(AArch64ISD::CMHI)
2188 MAKE_CASE(AArch64ISD::CMHS)
2189 MAKE_CASE(AArch64ISD::FCMEQ)
2190 MAKE_CASE(AArch64ISD::FCMGE)
2191 MAKE_CASE(AArch64ISD::FCMGT)
2192 MAKE_CASE(AArch64ISD::CMEQz)
2193 MAKE_CASE(AArch64ISD::CMGEz)
2194 MAKE_CASE(AArch64ISD::CMGTz)
2195 MAKE_CASE(AArch64ISD::CMLEz)
2196 MAKE_CASE(AArch64ISD::CMLTz)
2197 MAKE_CASE(AArch64ISD::FCMEQz)
2198 MAKE_CASE(AArch64ISD::FCMGEz)
2199 MAKE_CASE(AArch64ISD::FCMGTz)
2200 MAKE_CASE(AArch64ISD::FCMLEz)
2201 MAKE_CASE(AArch64ISD::FCMLTz)
2202 MAKE_CASE(AArch64ISD::SADDV)
2203 MAKE_CASE(AArch64ISD::UADDV)
2204 MAKE_CASE(AArch64ISD::SDOT)
2205 MAKE_CASE(AArch64ISD::UDOT)
2206 MAKE_CASE(AArch64ISD::SMINV)
2207 MAKE_CASE(AArch64ISD::UMINV)
2208 MAKE_CASE(AArch64ISD::SMAXV)
2209 MAKE_CASE(AArch64ISD::UMAXV)
2210 MAKE_CASE(AArch64ISD::SADDV_PRED)
2211 MAKE_CASE(AArch64ISD::UADDV_PRED)
2212 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2213 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2214 MAKE_CASE(AArch64ISD::SMINV_PRED)
2215 MAKE_CASE(AArch64ISD::UMINV_PRED)
2216 MAKE_CASE(AArch64ISD::ORV_PRED)
2217 MAKE_CASE(AArch64ISD::EORV_PRED)
2218 MAKE_CASE(AArch64ISD::ANDV_PRED)
2219 MAKE_CASE(AArch64ISD::CLASTA_N)
2220 MAKE_CASE(AArch64ISD::CLASTB_N)
2221 MAKE_CASE(AArch64ISD::LASTA)
2222 MAKE_CASE(AArch64ISD::LASTB)
2223 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2224 MAKE_CASE(AArch64ISD::LS64_BUILD)
2225 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2226 MAKE_CASE(AArch64ISD::TBL)
2227 MAKE_CASE(AArch64ISD::FADD_PRED)
2228 MAKE_CASE(AArch64ISD::FADDA_PRED)
2229 MAKE_CASE(AArch64ISD::FADDV_PRED)
2230 MAKE_CASE(AArch64ISD::FDIV_PRED)
2231 MAKE_CASE(AArch64ISD::FMA_PRED)
2232 MAKE_CASE(AArch64ISD::FMAX_PRED)
2233 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2234 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2235 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2236 MAKE_CASE(AArch64ISD::FMIN_PRED)
2237 MAKE_CASE(AArch64ISD::FMINV_PRED)
2238 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2239 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2240 MAKE_CASE(AArch64ISD::FMUL_PRED)
2241 MAKE_CASE(AArch64ISD::FSUB_PRED)
2242 MAKE_CASE(AArch64ISD::RDSVL)
2243 MAKE_CASE(AArch64ISD::BIC)
2244 MAKE_CASE(AArch64ISD::BIT)
2245 MAKE_CASE(AArch64ISD::CBZ)
2246 MAKE_CASE(AArch64ISD::CBNZ)
2247 MAKE_CASE(AArch64ISD::TBZ)
2248 MAKE_CASE(AArch64ISD::TBNZ)
2249 MAKE_CASE(AArch64ISD::TC_RETURN)
2250 MAKE_CASE(AArch64ISD::PREFETCH)
2251 MAKE_CASE(AArch64ISD::SITOF)
2252 MAKE_CASE(AArch64ISD::UITOF)
2253 MAKE_CASE(AArch64ISD::NVCAST)
2254 MAKE_CASE(AArch64ISD::MRS)
2255 MAKE_CASE(AArch64ISD::SQSHL_I)
2256 MAKE_CASE(AArch64ISD::UQSHL_I)
2257 MAKE_CASE(AArch64ISD::SRSHR_I)
2258 MAKE_CASE(AArch64ISD::URSHR_I)
2259 MAKE_CASE(AArch64ISD::SQSHLU_I)
2260 MAKE_CASE(AArch64ISD::WrapperLarge)
2261 MAKE_CASE(AArch64ISD::LD2post)
2262 MAKE_CASE(AArch64ISD::LD3post)
2263 MAKE_CASE(AArch64ISD::LD4post)
2264 MAKE_CASE(AArch64ISD::ST2post)
2265 MAKE_CASE(AArch64ISD::ST3post)
2266 MAKE_CASE(AArch64ISD::ST4post)
2267 MAKE_CASE(AArch64ISD::LD1x2post)
2268 MAKE_CASE(AArch64ISD::LD1x3post)
2269 MAKE_CASE(AArch64ISD::LD1x4post)
2270 MAKE_CASE(AArch64ISD::ST1x2post)
2271 MAKE_CASE(AArch64ISD::ST1x3post)
2272 MAKE_CASE(AArch64ISD::ST1x4post)
2273 MAKE_CASE(AArch64ISD::LD1DUPpost)
2274 MAKE_CASE(AArch64ISD::LD2DUPpost)
2275 MAKE_CASE(AArch64ISD::LD3DUPpost)
2276 MAKE_CASE(AArch64ISD::LD4DUPpost)
2277 MAKE_CASE(AArch64ISD::LD1LANEpost)
2278 MAKE_CASE(AArch64ISD::LD2LANEpost)
2279 MAKE_CASE(AArch64ISD::LD3LANEpost)
2280 MAKE_CASE(AArch64ISD::LD4LANEpost)
2281 MAKE_CASE(AArch64ISD::ST2LANEpost)
2282 MAKE_CASE(AArch64ISD::ST3LANEpost)
2283 MAKE_CASE(AArch64ISD::ST4LANEpost)
2284 MAKE_CASE(AArch64ISD::SMULL)
2285 MAKE_CASE(AArch64ISD::UMULL)
2286 MAKE_CASE(AArch64ISD::PMULL)
2287 MAKE_CASE(AArch64ISD::FRECPE)
2288 MAKE_CASE(AArch64ISD::FRECPS)
2289 MAKE_CASE(AArch64ISD::FRSQRTE)
2290 MAKE_CASE(AArch64ISD::FRSQRTS)
2291 MAKE_CASE(AArch64ISD::STG)
2292 MAKE_CASE(AArch64ISD::STZG)
2293 MAKE_CASE(AArch64ISD::ST2G)
2294 MAKE_CASE(AArch64ISD::STZ2G)
2295 MAKE_CASE(AArch64ISD::SUNPKHI)
2296 MAKE_CASE(AArch64ISD::SUNPKLO)
2297 MAKE_CASE(AArch64ISD::UUNPKHI)
2298 MAKE_CASE(AArch64ISD::UUNPKLO)
2299 MAKE_CASE(AArch64ISD::INSR)
2300 MAKE_CASE(AArch64ISD::PTEST)
2301 MAKE_CASE(AArch64ISD::PTRUE)
2302 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2303 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2304 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2305 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2306 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2307 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2308 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2309 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2310 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2311 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2312 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2313 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2314 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2315 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2316 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2317 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2318 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2319 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2320 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2321 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2322 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2323 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2324 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2325 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2326 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2327 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2328 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2329 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2330 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2331 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2332 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2333 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2334 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2335 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2336 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2337 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2338 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2339 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2340 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2341 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2342 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2343 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2344 MAKE_CASE(AArch64ISD::ST1_PRED)
2345 MAKE_CASE(AArch64ISD::SST1_PRED)
2346 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2347 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2348 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2349 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2350 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2351 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2352 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2353 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2354 MAKE_CASE(AArch64ISD::LDP)
2355 MAKE_CASE(AArch64ISD::LDNP)
2356 MAKE_CASE(AArch64ISD::STP)
2357 MAKE_CASE(AArch64ISD::STNP)
2358 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2359 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2360 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2361 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2362 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2363 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2364 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2365 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2366 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2367 MAKE_CASE(AArch64ISD::ADDP)
2368 MAKE_CASE(AArch64ISD::SADDLP)
2369 MAKE_CASE(AArch64ISD::UADDLP)
2370 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2371 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2372 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2373 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2374 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2375 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2376 MAKE_CASE(AArch64ISD::CALL_BTI)
2378 #undef MAKE_CASE
2379 return nullptr;
2382 MachineBasicBlock *
2383 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2384 MachineBasicBlock *MBB) const {
2385 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2386 // phi node:
2388 // OrigBB:
2389 // [... previous instrs leading to comparison ...]
2390 // b.ne TrueBB
2391 // b EndBB
2392 // TrueBB:
2393 // ; Fallthrough
2394 // EndBB:
2395 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2397 MachineFunction *MF = MBB->getParent();
2398 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2399 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2400 DebugLoc DL = MI.getDebugLoc();
2401 MachineFunction::iterator It = ++MBB->getIterator();
2403 Register DestReg = MI.getOperand(0).getReg();
2404 Register IfTrueReg = MI.getOperand(1).getReg();
2405 Register IfFalseReg = MI.getOperand(2).getReg();
2406 unsigned CondCode = MI.getOperand(3).getImm();
2407 bool NZCVKilled = MI.getOperand(4).isKill();
2409 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2410 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2411 MF->insert(It, TrueBB);
2412 MF->insert(It, EndBB);
2414 // Transfer the rest of the current basic block to EndBB.
2415 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2416 MBB->end());
2417 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2419 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2420 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2421 MBB->addSuccessor(TrueBB);
2422 MBB->addSuccessor(EndBB);
2424 // TrueBB falls through to the end.
2425 TrueBB->addSuccessor(EndBB);
2427 if (!NZCVKilled) {
2428 TrueBB->addLiveIn(AArch64::NZCV);
2429 EndBB->addLiveIn(AArch64::NZCV);
2432 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2433 .addReg(IfTrueReg)
2434 .addMBB(TrueBB)
2435 .addReg(IfFalseReg)
2436 .addMBB(MBB);
2438 MI.eraseFromParent();
2439 return EndBB;
2442 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2443 MachineInstr &MI, MachineBasicBlock *BB) const {
2444 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2445 BB->getParent()->getFunction().getPersonalityFn())) &&
2446 "SEH does not use catchret!");
2447 return BB;
2450 MachineBasicBlock *
2451 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2452 MachineInstr &MI,
2453 MachineBasicBlock *BB) const {
2454 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2455 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2457 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2458 MIB.add(MI.getOperand(1)); // slice index register
2459 MIB.add(MI.getOperand(2)); // slice index offset
2460 MIB.add(MI.getOperand(3)); // pg
2461 MIB.add(MI.getOperand(4)); // base
2462 MIB.add(MI.getOperand(5)); // offset
2464 MI.eraseFromParent(); // The pseudo is gone now.
2465 return BB;
2468 MachineBasicBlock *
2469 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2470 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2471 MachineInstrBuilder MIB =
2472 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2474 MIB.addReg(AArch64::ZA, RegState::Define);
2475 MIB.add(MI.getOperand(0)); // Vector select register
2476 MIB.add(MI.getOperand(1)); // Vector select offset
2477 MIB.add(MI.getOperand(2)); // Base
2478 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2480 MI.eraseFromParent(); // The pseudo is gone now.
2481 return BB;
2484 MachineBasicBlock *
2485 AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
2486 MachineInstr &MI, MachineBasicBlock *BB) const {
2487 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2488 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2490 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2491 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2492 MIB.add(MI.getOperand(1)); // pn
2493 MIB.add(MI.getOperand(2)); // pm
2494 MIB.add(MI.getOperand(3)); // zn
2495 MIB.add(MI.getOperand(4)); // zm
2497 MI.eraseFromParent(); // The pseudo is gone now.
2498 return BB;
2501 MachineBasicBlock *
2502 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
2503 MachineInstr &MI,
2504 MachineBasicBlock *BB) const {
2505 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2506 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2508 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2509 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2510 MIB.add(MI.getOperand(1)); // Slice index register
2511 MIB.add(MI.getOperand(2)); // Slice index offset
2512 MIB.add(MI.getOperand(3)); // pg
2513 MIB.add(MI.getOperand(4)); // zn
2515 MI.eraseFromParent(); // The pseudo is gone now.
2516 return BB;
2519 MachineBasicBlock *
2520 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2521 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2522 MachineInstrBuilder MIB =
2523 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2524 MIB.add(MI.getOperand(0)); // Mask
2526 unsigned Mask = MI.getOperand(0).getImm();
2527 for (unsigned I = 0; I < 8; I++) {
2528 if (Mask & (1 << I))
2529 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2532 MI.eraseFromParent(); // The pseudo is gone now.
2533 return BB;
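// Editorial example: a ZERO_M_PSEUDO with mask 0b00000101 adds implicit defs
// of ZAD0 and ZAD2, so later passes see exactly which ZA tiles are written.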
2536 MachineBasicBlock *
2537 AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
2538 MachineInstr &MI,
2539 MachineBasicBlock *BB) const {
2540 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2541 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2543 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2544 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2545 MIB.add(MI.getOperand(1)); // pn
2546 MIB.add(MI.getOperand(2)); // pm
2547 MIB.add(MI.getOperand(3)); // zn
2549 MI.eraseFromParent(); // The pseudo is gone now.
2550 return BB;
2553 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2554 MachineInstr &MI, MachineBasicBlock *BB) const {
2555 switch (MI.getOpcode()) {
2556 default:
2557 #ifndef NDEBUG
2558 MI.dump();
2559 #endif
2560 llvm_unreachable("Unexpected instruction for custom inserter!");
2562 case AArch64::F128CSEL:
2563 return EmitF128CSEL(MI, BB);
2564 case TargetOpcode::STATEPOINT:
2565 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2566 // while the BL call instruction (to which the statepoint is eventually
2567 // lowered) has an implicit def of LR. This def is early-clobber, as it is
2568 // set at the moment of the call, before any use is read.
2569 // Add this implicit dead def here as a workaround.
2570 MI.addOperand(*MI.getMF(),
2571 MachineOperand::CreateReg(
2572 AArch64::LR, /*isDef*/ true,
2573 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2574 /*isUndef*/ false, /*isEarlyClobber*/ true));
2575 [[fallthrough]];
2576 case TargetOpcode::STACKMAP:
2577 case TargetOpcode::PATCHPOINT:
2578 return emitPatchPoint(MI, BB);
2580 case AArch64::CATCHRET:
2581 return EmitLoweredCatchRet(MI, BB);
2582 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2583 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2584 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2585 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2586 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2587 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2588 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2589 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2590 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2591 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2592 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2593 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2594 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2595 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2596 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2597 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2598 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2599 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2600 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2601 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2602 case AArch64::LDR_ZA_PSEUDO:
2603 return EmitFill(MI, BB);
2604 case AArch64::BFMOPA_MPPZZ_PSEUDO:
2605 return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
2606 case AArch64::BFMOPS_MPPZZ_PSEUDO:
2607 return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
2608 case AArch64::FMOPAL_MPPZZ_PSEUDO:
2609 return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
2610 case AArch64::FMOPSL_MPPZZ_PSEUDO:
2611 return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
2612 case AArch64::FMOPA_MPPZZ_S_PSEUDO:
2613 return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2614 case AArch64::FMOPS_MPPZZ_S_PSEUDO:
2615 return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2616 case AArch64::FMOPA_MPPZZ_D_PSEUDO:
2617 return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2618 case AArch64::FMOPS_MPPZZ_D_PSEUDO:
2619 return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2620 case AArch64::SMOPA_MPPZZ_S_PSEUDO:
2621 return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2622 case AArch64::SMOPS_MPPZZ_S_PSEUDO:
2623 return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2624 case AArch64::UMOPA_MPPZZ_S_PSEUDO:
2625 return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2626 case AArch64::UMOPS_MPPZZ_S_PSEUDO:
2627 return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2628 case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
2629 return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2630 case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
2631 return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2632 case AArch64::USMOPA_MPPZZ_S_PSEUDO:
2633 return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2634 case AArch64::USMOPS_MPPZZ_S_PSEUDO:
2635 return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2636 case AArch64::SMOPA_MPPZZ_D_PSEUDO:
2637 return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2638 case AArch64::SMOPS_MPPZZ_D_PSEUDO:
2639 return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2640 case AArch64::UMOPA_MPPZZ_D_PSEUDO:
2641 return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2642 case AArch64::UMOPS_MPPZZ_D_PSEUDO:
2643 return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2644 case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
2645 return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2646 case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
2647 return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2648 case AArch64::USMOPA_MPPZZ_D_PSEUDO:
2649 return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2650 case AArch64::USMOPS_MPPZZ_D_PSEUDO:
2651 return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2652 case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
2653 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
2654 BB);
2655 case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
2656 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
2657 BB);
2658 case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
2659 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
2660 BB);
2661 case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
2662 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
2663 BB);
2664 case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
2665 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
2666 BB);
2667 case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
2668 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
2669 BB);
2670 case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
2671 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
2672 BB);
2673 case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
2674 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
2675 BB);
2676 case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
2677 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
2678 BB);
2679 case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
2680 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
2681 BB);
2682 case AArch64::ZERO_M_PSEUDO:
2683 return EmitZero(MI, BB);
2684 case AArch64::ADDHA_MPPZ_PSEUDO_S:
2685 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
2686 case AArch64::ADDVA_MPPZ_PSEUDO_S:
2687 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
2688 case AArch64::ADDHA_MPPZ_PSEUDO_D:
2689 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
2690 case AArch64::ADDVA_MPPZ_PSEUDO_D:
2691 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
2695 //===----------------------------------------------------------------------===//
2696 // AArch64 Lowering private implementation.
2697 //===----------------------------------------------------------------------===//
2699 //===----------------------------------------------------------------------===//
2700 // Lowering Code
2701 //===----------------------------------------------------------------------===//
2703 // Forward declarations of SVE fixed length lowering helpers
2704 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2705 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2706 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2707 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2708 SelectionDAG &DAG);
2709 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2710 EVT VT);
2712 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2713 static bool isZerosVector(const SDNode *N) {
2714 // Look through a bit convert.
2715 while (N->getOpcode() == ISD::BITCAST)
2716 N = N->getOperand(0).getNode();
2718 if (ISD::isConstantSplatVectorAllZeros(N))
2719 return true;
2721 if (N->getOpcode() != AArch64ISD::DUP)
2722 return false;
2724 auto Opnd0 = N->getOperand(0);
2725 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2728 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2729 /// CC
2730 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2731 switch (CC) {
2732 default:
2733 llvm_unreachable("Unknown condition code!");
2734 case ISD::SETNE:
2735 return AArch64CC::NE;
2736 case ISD::SETEQ:
2737 return AArch64CC::EQ;
2738 case ISD::SETGT:
2739 return AArch64CC::GT;
2740 case ISD::SETGE:
2741 return AArch64CC::GE;
2742 case ISD::SETLT:
2743 return AArch64CC::LT;
2744 case ISD::SETLE:
2745 return AArch64CC::LE;
2746 case ISD::SETUGT:
2747 return AArch64CC::HI;
2748 case ISD::SETUGE:
2749 return AArch64CC::HS;
2750 case ISD::SETULT:
2751 return AArch64CC::LO;
2752 case ISD::SETULE:
2753 return AArch64CC::LS;
2757 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2758 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2759 AArch64CC::CondCode &CondCode,
2760 AArch64CC::CondCode &CondCode2) {
2761 CondCode2 = AArch64CC::AL;
2762 switch (CC) {
2763 default:
2764 llvm_unreachable("Unknown FP condition!");
2765 case ISD::SETEQ:
2766 case ISD::SETOEQ:
2767 CondCode = AArch64CC::EQ;
2768 break;
2769 case ISD::SETGT:
2770 case ISD::SETOGT:
2771 CondCode = AArch64CC::GT;
2772 break;
2773 case ISD::SETGE:
2774 case ISD::SETOGE:
2775 CondCode = AArch64CC::GE;
2776 break;
2777 case ISD::SETOLT:
2778 CondCode = AArch64CC::MI;
2779 break;
2780 case ISD::SETOLE:
2781 CondCode = AArch64CC::LS;
2782 break;
2783 case ISD::SETONE:
2784 CondCode = AArch64CC::MI;
2785 CondCode2 = AArch64CC::GT;
2786 break;
2787 case ISD::SETO:
2788 CondCode = AArch64CC::VC;
2789 break;
2790 case ISD::SETUO:
2791 CondCode = AArch64CC::VS;
2792 break;
2793 case ISD::SETUEQ:
2794 CondCode = AArch64CC::EQ;
2795 CondCode2 = AArch64CC::VS;
2796 break;
2797 case ISD::SETUGT:
2798 CondCode = AArch64CC::HI;
2799 break;
2800 case ISD::SETUGE:
2801 CondCode = AArch64CC::PL;
2802 break;
2803 case ISD::SETLT:
2804 case ISD::SETULT:
2805 CondCode = AArch64CC::LT;
2806 break;
2807 case ISD::SETLE:
2808 case ISD::SETULE:
2809 CondCode = AArch64CC::LE;
2810 break;
2811 case ISD::SETNE:
2812 case ISD::SETUNE:
2813 CondCode = AArch64CC::NE;
2814 break;
2818 /// Convert a DAG fp condition code to an AArch64 CC.
2819 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2820 /// should be AND'ed instead of OR'ed.
2821 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2822 AArch64CC::CondCode &CondCode,
2823 AArch64CC::CondCode &CondCode2) {
2824 CondCode2 = AArch64CC::AL;
2825 switch (CC) {
2826 default:
2827 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2828 assert(CondCode2 == AArch64CC::AL);
2829 break;
2830 case ISD::SETONE:
2831 // (a one b)
2832 // == ((a olt b) || (a ogt b))
2833 // == ((a ord b) && (a une b))
2834 CondCode = AArch64CC::VC;
2835 CondCode2 = AArch64CC::NE;
2836 break;
2837 case ISD::SETUEQ:
2838 // (a ueq b)
2839 // == ((a uno b) || (a oeq b))
2840 // == ((a ule b) && (a uge b))
2841 CondCode = AArch64CC::PL;
2842 CondCode2 = AArch64CC::LE;
2843 break;
2847 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2848 /// CC usable with the vector instructions. Fewer operations are available
2849 /// without a real NZCV register, so we have to use less efficient combinations
2850 /// to get the same effect.
2851 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2852 AArch64CC::CondCode &CondCode,
2853 AArch64CC::CondCode &CondCode2,
2854 bool &Invert) {
2855 Invert = false;
2856 switch (CC) {
2857 default:
2858 // Mostly the scalar mappings work fine.
2859 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2860 break;
2861 case ISD::SETUO:
2862 Invert = true;
2863 [[fallthrough]];
2864 case ISD::SETO:
2865 CondCode = AArch64CC::MI;
2866 CondCode2 = AArch64CC::GE;
2867 break;
2868 case ISD::SETUEQ:
2869 case ISD::SETULT:
2870 case ISD::SETULE:
2871 case ISD::SETUGT:
2872 case ISD::SETUGE:
2873 // All of the compare-mask comparisons are ordered, but we can switch
2874 // between the two by a double inversion. E.g. ULE == !OGT.
2875 Invert = true;
2876 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2877 CondCode, CondCode2);
2878 break;
2882 static bool isLegalArithImmed(uint64_t C) {
2883 // Matches AArch64DAGToDAGISel::SelectArithImmed().
2884 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2885 LLVM_DEBUG(dbgs() << "Is imm " << C
2886 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2887 return IsLegal;
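// For illustration, a few values run through the check above:
//   C = 0xFFF  : C >> 12 == 0                       -> legal (plain 12-bit immediate)
//   C = 0x1000 : (C & 0xFFF) == 0 and C >> 24 == 0  -> legal (12-bit immediate, LSL #12)
//   C = 0x1001 : fails both tests                   -> not legal, must be materialized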
2890 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2891 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
2892 // can be set differently by this operation. It comes down to whether
2893 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are equal
2894 // then everything is fine; if not, the optimization is wrong. Thus general
2895 // comparisons are only valid if op2 != 0.
2897 // So, finally, the only LLVM-native comparisons that don't mention C and V
2898 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2899 // the absence of information about op2.
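// A concrete illustration of the caveat above (informal, not exhaustive): for
// 32-bit values with op1 = -1 and op2 = 0, "cmp w0, w1" computes -1 - 0 and
// sets C = 1 (no borrow), while "cmn w0, w1" computes -1 + 0 and sets C = 0.
// An unsigned test such as HS/LO would therefore observe different flags, but
// EQ/NE only read the Z flag, which is identical in both cases.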
2900 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2901 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2902 (CC == ISD::SETEQ || CC == ISD::SETNE);
2905 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2906 SelectionDAG &DAG, SDValue Chain,
2907 bool IsSignaling) {
2908 EVT VT = LHS.getValueType();
2909 assert(VT != MVT::f128);
2911 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2913 if (VT == MVT::f16 && !FullFP16) {
2914 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2915 {Chain, LHS});
2916 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2917 {LHS.getValue(1), RHS});
2918 Chain = RHS.getValue(1);
2919 VT = MVT::f32;
2921 unsigned Opcode =
2922 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2923 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2926 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2927 const SDLoc &dl, SelectionDAG &DAG) {
2928 EVT VT = LHS.getValueType();
2929 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2931 if (VT.isFloatingPoint()) {
2932 assert(VT != MVT::f128);
2933 if (VT == MVT::f16 && !FullFP16) {
2934 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2935 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2936 VT = MVT::f32;
2938 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2941 // The CMP instruction is just an alias for SUBS, and representing it as
2942 // SUBS means that it's possible to get CSE with subtract operations.
2943 // A later phase can perform the optimization of setting the destination
2944 // register to WZR/XZR if it ends up being unused.
2945 unsigned Opcode = AArch64ISD::SUBS;
2947 if (isCMN(RHS, CC)) {
2948 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2949 Opcode = AArch64ISD::ADDS;
2950 RHS = RHS.getOperand(1);
2951 } else if (isCMN(LHS, CC)) {
2952 // As we are looking for EQ/NE compares, the operands can be commuted; can
2953 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2954 Opcode = AArch64ISD::ADDS;
2955 LHS = LHS.getOperand(1);
2956 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2957 if (LHS.getOpcode() == ISD::AND) {
2958 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2959 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2960 // of the signed comparisons.
2961 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2962 DAG.getVTList(VT, MVT_CC),
2963 LHS.getOperand(0),
2964 LHS.getOperand(1));
2965 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2966 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2967 return ANDSNode.getValue(1);
2968 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2969 // Use result of ANDS
2970 return LHS.getValue(1);
2974 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2975 .getValue(1);
2978 /// \defgroup AArch64CCMP CMP;CCMP matching
2980 /// These functions deal with the formation of CMP;CCMP;... sequences.
2981 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2982 /// a comparison. They set the NZCV flags to a predefined value if their
2983 /// predicate is false. This makes it possible to express arbitrary
2984 /// conjunctions, for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2985 /// can be expressed as:
2986 /// cmp A
2987 /// ccmp B, inv(CB), CA
2988 /// check for CB flags
2990 /// This naturally lets us implement chains of AND operations with SETCC
2991 /// operands. And we can even implement some other situations by transforming
2992 /// them:
2993 /// - We can implement (NEG SETCC), i.e. negate a single comparison, by
2994 /// negating the flags used in the CCMP/FCCMP operation.
2995 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2996 /// by negating the flags we test for afterwards, i.e.
2997 /// NEG (CMP CCMP CCMP ...) can be implemented.
2998 /// - Note that we can only ever negate all previously processed results.
2999 /// What we cannot implement by flipping the flags to test is a negation
3000 /// of two sub-trees (because the negation affects all sub-trees emitted so
3001 /// far, so the 2nd sub-tree we emit would also affect the first).
3002 /// With those tools we can implement some OR operations:
3003 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
3004 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3005 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
3006 /// elimination rules from earlier to implement the whole thing as a
3007 /// CCMP/FCCMP chain.
3009 /// As complete example:
3010 /// or (or (setCA (cmp A)) (setCB (cmp B)))
3011 /// (and (setCC (cmp C)) (setCD (cmp D)))
3012 /// can be reassociated to:
3013 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3014 /// (or (setCA (cmp A)) (setCB (cmp B)))
3015 /// can be transformed to:
3016 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3017 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3018 /// which can be implemented as:
3019 /// cmp C
3020 /// ccmp D, inv(CD), CC
3021 /// ccmp A, CA, inv(CD)
3022 /// ccmp B, CB, inv(CA)
3023 /// check for CB flags
3025 /// A counterexample is "or (and A B) (and C D)", which translates to
3026 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3027 /// can only implement one of the inner (not) operations, but not both!
3028 /// @{
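// As a rough end-to-end illustration (the exact registers and immediates are
// only indicative), the C expression "a == 0 && b == 0" for two i32 values is
// typically selected as:
//   cmp  w0, #0            // set flags for a == 0
//   ccmp w1, #0, #0, eq    // if a == 0, compare b with 0; otherwise force NZCV = 0000
//   cset w0, eq            // the final result only reads the Z flag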
3030 /// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
3031 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3032 ISD::CondCode CC, SDValue CCOp,
3033 AArch64CC::CondCode Predicate,
3034 AArch64CC::CondCode OutCC,
3035 const SDLoc &DL, SelectionDAG &DAG) {
3036 unsigned Opcode = 0;
3037 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3039 if (LHS.getValueType().isFloatingPoint()) {
3040 assert(LHS.getValueType() != MVT::f128);
3041 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3042 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3043 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3045 Opcode = AArch64ISD::FCCMP;
3046 } else if (RHS.getOpcode() == ISD::SUB) {
3047 SDValue SubOp0 = RHS.getOperand(0);
3048 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3049 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3050 Opcode = AArch64ISD::CCMN;
3051 RHS = RHS.getOperand(1);
3054 if (Opcode == 0)
3055 Opcode = AArch64ISD::CCMP;
3057 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3058 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3059 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3060 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3061 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3064 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3065 /// expressed as a conjunction. See \ref AArch64CCMP.
3066 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
3067 /// changing the conditions on the SETCC tests.
3068 /// (this means we can call emitConjunctionRec() with
3069 /// Negate==true on this sub-tree)
3070 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
3071 /// cannot do the negation naturally. We are required to
3072 /// emit the subtree first in this case.
3073 /// \param WillNegate Is true if we are called when the result of this
3074 /// subexpression must be negated. This happens when the
3075 /// outer expression is an OR. We can use this fact to know
3076 /// that we have a double negation (or (or ...) ...) that
3077 /// can be implemented for free.
3078 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3079 bool &MustBeFirst, bool WillNegate,
3080 unsigned Depth = 0) {
3081 if (!Val.hasOneUse())
3082 return false;
3083 unsigned Opcode = Val->getOpcode();
3084 if (Opcode == ISD::SETCC) {
3085 if (Val->getOperand(0).getValueType() == MVT::f128)
3086 return false;
3087 CanNegate = true;
3088 MustBeFirst = false;
3089 return true;
3091 // Protect against exponential runtime and stack overflow.
3092 if (Depth > 6)
3093 return false;
3094 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3095 bool IsOR = Opcode == ISD::OR;
3096 SDValue O0 = Val->getOperand(0);
3097 SDValue O1 = Val->getOperand(1);
3098 bool CanNegateL;
3099 bool MustBeFirstL;
3100 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3101 return false;
3102 bool CanNegateR;
3103 bool MustBeFirstR;
3104 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3105 return false;
3107 if (MustBeFirstL && MustBeFirstR)
3108 return false;
3110 if (IsOR) {
3111 // For an OR expression we need to be able to naturally negate at least
3112 // one side or we cannot do the transformation at all.
3113 if (!CanNegateL && !CanNegateR)
3114 return false;
3115 // If the result of the OR will be negated and we can naturally negate
3116 // the leaves, then this sub-tree as a whole negates naturally.
3117 CanNegate = WillNegate && CanNegateL && CanNegateR;
3118 // If we cannot naturally negate the whole sub-tree, then this must be
3119 // emitted first.
3120 MustBeFirst = !CanNegate;
3121 } else {
3122 assert(Opcode == ISD::AND && "Must be OR or AND");
3123 // We cannot naturally negate an AND operation.
3124 CanNegate = false;
3125 MustBeFirst = MustBeFirstL || MustBeFirstR;
3127 return true;
3129 return false;
3132 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3133 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3134 /// Tries to transform the given i1-producing node @p Val into a series of
3135 /// compare and conditional compare operations. @returns an NZCV-flags-producing
3136 /// node and sets @p OutCC to the flags that should be tested, or returns
3137 /// SDValue() if the transformation was not possible.
3138 /// \p Negate is true if we want this sub-tree to be negated just by changing
3139 /// SETCC conditions.
3140 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3141 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3142 AArch64CC::CondCode Predicate) {
3143 // We're at a tree leaf, produce a conditional comparison operation.
3144 unsigned Opcode = Val->getOpcode();
3145 if (Opcode == ISD::SETCC) {
3146 SDValue LHS = Val->getOperand(0);
3147 SDValue RHS = Val->getOperand(1);
3148 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3149 bool isInteger = LHS.getValueType().isInteger();
3150 if (Negate)
3151 CC = getSetCCInverse(CC, LHS.getValueType());
3152 SDLoc DL(Val);
3153 // Determine OutCC and handle FP special case.
3154 if (isInteger) {
3155 OutCC = changeIntCCToAArch64CC(CC);
3156 } else {
3157 assert(LHS.getValueType().isFloatingPoint());
3158 AArch64CC::CondCode ExtraCC;
3159 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3160 // Some floating point conditions can't be tested with a single condition
3161 // code. Construct an additional comparison in this case.
3162 if (ExtraCC != AArch64CC::AL) {
3163 SDValue ExtraCmp;
3164 if (!CCOp.getNode())
3165 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3166 else
3167 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3168 ExtraCC, DL, DAG);
3169 CCOp = ExtraCmp;
3170 Predicate = ExtraCC;
3174 // Produce a normal comparison if we are first in the chain
3175 if (!CCOp)
3176 return emitComparison(LHS, RHS, CC, DL, DAG);
3177 // Otherwise produce a ccmp.
3178 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3179 DAG);
3181 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3183 bool IsOR = Opcode == ISD::OR;
3185 SDValue LHS = Val->getOperand(0);
3186 bool CanNegateL;
3187 bool MustBeFirstL;
3188 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3189 assert(ValidL && "Valid conjunction/disjunction tree");
3190 (void)ValidL;
3192 SDValue RHS = Val->getOperand(1);
3193 bool CanNegateR;
3194 bool MustBeFirstR;
3195 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3196 assert(ValidR && "Valid conjunction/disjunction tree");
3197 (void)ValidR;
3199 // Swap sub-tree that must come first to the right side.
3200 if (MustBeFirstL) {
3201 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3202 std::swap(LHS, RHS);
3203 std::swap(CanNegateL, CanNegateR);
3204 std::swap(MustBeFirstL, MustBeFirstR);
3207 bool NegateR;
3208 bool NegateAfterR;
3209 bool NegateL;
3210 bool NegateAfterAll;
3211 if (Opcode == ISD::OR) {
3212 // Swap the sub-tree that we can negate naturally to the left.
3213 if (!CanNegateL) {
3214 assert(CanNegateR && "at least one side must be negatable");
3215 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3216 assert(!Negate);
3217 std::swap(LHS, RHS);
3218 NegateR = false;
3219 NegateAfterR = true;
3220 } else {
3221 // Negate the left sub-tree if possible, otherwise negate the result.
3222 NegateR = CanNegateR;
3223 NegateAfterR = !CanNegateR;
3225 NegateL = true;
3226 NegateAfterAll = !Negate;
3227 } else {
3228 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3229 assert(!Negate && "Valid conjunction/disjunction tree");
3231 NegateL = false;
3232 NegateR = false;
3233 NegateAfterR = false;
3234 NegateAfterAll = false;
3237 // Emit sub-trees.
3238 AArch64CC::CondCode RHSCC;
3239 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3240 if (NegateAfterR)
3241 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3242 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3243 if (NegateAfterAll)
3244 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3245 return CmpL;
3248 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3249 /// In some cases this is even possible with OR operations in the expression.
3250 /// See \ref AArch64CCMP.
3251 /// \see emitConjunctionRec().
3252 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3253 AArch64CC::CondCode &OutCC) {
3254 bool DummyCanNegate;
3255 bool DummyMustBeFirst;
3256 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3257 return SDValue();
3259 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3262 /// @}
3264 /// Returns how profitable it is to fold a comparison's operand's shift and/or
3265 /// extension operations.
3266 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3267 auto isSupportedExtend = [&](SDValue V) {
3268 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3269 return true;
3271 if (V.getOpcode() == ISD::AND)
3272 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3273 uint64_t Mask = MaskCst->getZExtValue();
3274 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3277 return false;
3280 if (!Op.hasOneUse())
3281 return 0;
3283 if (isSupportedExtend(Op))
3284 return 1;
3286 unsigned Opc = Op.getOpcode();
3287 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3288 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3289 uint64_t Shift = ShiftCst->getZExtValue();
3290 if (isSupportedExtend(Op.getOperand(0)))
3291 return (Shift <= 4) ? 2 : 1;
3292 EVT VT = Op.getValueType();
3293 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3294 return 1;
3297 return 0;
3300 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3301 SDValue &AArch64cc, SelectionDAG &DAG,
3302 const SDLoc &dl) {
3303 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3304 EVT VT = RHS.getValueType();
3305 uint64_t C = RHSC->getZExtValue();
3306 if (!isLegalArithImmed(C)) {
3307 // Constant does not fit, try adjusting it by one?
3308 switch (CC) {
3309 default:
3310 break;
3311 case ISD::SETLT:
3312 case ISD::SETGE:
3313 if ((VT == MVT::i32 && C != 0x80000000 &&
3314 isLegalArithImmed((uint32_t)(C - 1))) ||
3315 (VT == MVT::i64 && C != 0x80000000ULL &&
3316 isLegalArithImmed(C - 1ULL))) {
3317 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3318 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3319 RHS = DAG.getConstant(C, dl, VT);
3321 break;
3322 case ISD::SETULT:
3323 case ISD::SETUGE:
3324 if ((VT == MVT::i32 && C != 0 &&
3325 isLegalArithImmed((uint32_t)(C - 1))) ||
3326 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3327 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3328 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3329 RHS = DAG.getConstant(C, dl, VT);
3331 break;
3332 case ISD::SETLE:
3333 case ISD::SETGT:
3334 if ((VT == MVT::i32 && C != INT32_MAX &&
3335 isLegalArithImmed((uint32_t)(C + 1))) ||
3336 (VT == MVT::i64 && C != INT64_MAX &&
3337 isLegalArithImmed(C + 1ULL))) {
3338 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3339 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3340 RHS = DAG.getConstant(C, dl, VT);
3342 break;
3343 case ISD::SETULE:
3344 case ISD::SETUGT:
3345 if ((VT == MVT::i32 && C != UINT32_MAX &&
3346 isLegalArithImmed((uint32_t)(C + 1))) ||
3347 (VT == MVT::i64 && C != UINT64_MAX &&
3348 isLegalArithImmed(C + 1ULL))) {
3349 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3350 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3351 RHS = DAG.getConstant(C, dl, VT);
3353 break;
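// As an example of the adjustment above: "x s< 0x1001" cannot use 0x1001 as an
// ADDS/SUBS immediate, but it is equivalent to "x s<= 0x1000", and 0x1000 is
// encodable as a shifted 12-bit immediate, so the condition and the constant
// are rewritten together.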
3358 // Comparisons are canonicalized so that the RHS operand is simpler than the
3359 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3360 // can fold some shift+extend operations on the RHS operand, so swap the
3361 // operands if that can be done.
3363 // For example:
3364 // lsl w13, w11, #1
3365 // cmp w13, w12
3366 // can be turned into:
3367 // cmp w12, w11, lsl #1
3368 if (!isa<ConstantSDNode>(RHS) ||
3369 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3370 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3372 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3373 std::swap(LHS, RHS);
3374 CC = ISD::getSetCCSwappedOperands(CC);
3378 SDValue Cmp;
3379 AArch64CC::CondCode AArch64CC;
3380 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3381 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3383 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3384 // For the i8 operand, the largest immediate is 255, so this can be easily
3385 // encoded in the compare instruction. For the i16 operand, however, the
3386 // largest immediate cannot be encoded in the compare.
3387 // Therefore, use a sign extending load and cmn to avoid materializing the
3388 // -1 constant. For example,
3389 // movz w1, #65535
3390 // ldrh w0, [x0, #0]
3391 // cmp w0, w1
3392 // >
3393 // ldrsh w0, [x0, #0]
3394 // cmn w0, #1
3395 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3396 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3397 // ensure both the LHS and RHS are truly zero extended and to make sure the
3398 // transformation is profitable.
3399 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3400 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3401 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3402 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3403 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3404 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3405 SDValue SExt =
3406 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3407 DAG.getValueType(MVT::i16));
3408 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3409 RHS.getValueType()),
3410 CC, dl, DAG);
3411 AArch64CC = changeIntCCToAArch64CC(CC);
3415 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3416 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3417 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3418 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3423 if (!Cmp) {
3424 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3425 AArch64CC = changeIntCCToAArch64CC(CC);
3427 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3428 return Cmp;
3431 static std::pair<SDValue, SDValue>
3432 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3433 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3434 "Unsupported value type");
3435 SDValue Value, Overflow;
3436 SDLoc DL(Op);
3437 SDValue LHS = Op.getOperand(0);
3438 SDValue RHS = Op.getOperand(1);
3439 unsigned Opc = 0;
3440 switch (Op.getOpcode()) {
3441 default:
3442 llvm_unreachable("Unknown overflow instruction!");
3443 case ISD::SADDO:
3444 Opc = AArch64ISD::ADDS;
3445 CC = AArch64CC::VS;
3446 break;
3447 case ISD::UADDO:
3448 Opc = AArch64ISD::ADDS;
3449 CC = AArch64CC::HS;
3450 break;
3451 case ISD::SSUBO:
3452 Opc = AArch64ISD::SUBS;
3453 CC = AArch64CC::VS;
3454 break;
3455 case ISD::USUBO:
3456 Opc = AArch64ISD::SUBS;
3457 CC = AArch64CC::LO;
3458 break;
3459 // Multiply needs a little bit of extra work.
3460 case ISD::SMULO:
3461 case ISD::UMULO: {
3462 CC = AArch64CC::NE;
3463 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3464 if (Op.getValueType() == MVT::i32) {
3465 // Extend to 64-bits, then perform a 64-bit multiply.
3466 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3467 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3468 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3469 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3470 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3472 // Check that the result fits into a 32-bit integer.
3473 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3474 if (IsSigned) {
3475 // cmp xreg, wreg, sxtw
3476 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3477 Overflow =
3478 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3479 } else {
3480 // tst xreg, #0xffffffff00000000
3481 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3482 Overflow =
3483 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3485 break;
3487 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3488 // For the 64-bit multiply, compute the low 64 bits of the product directly.
3489 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3490 if (IsSigned) {
3491 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3492 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3493 DAG.getConstant(63, DL, MVT::i64));
3494 // It is important that LowerBits is last, otherwise the arithmetic
3495 // shift will not be folded into the compare (SUBS).
3496 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3497 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3498 .getValue(1);
3499 } else {
3500 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3501 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3502 Overflow =
3503 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3504 DAG.getConstant(0, DL, MVT::i64),
3505 UpperBits).getValue(1);
3507 break;
3509 } // switch (...)
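// For reference, the i64 smul.with.overflow check above typically becomes
// something like (indicative only):
//   mul   x8, x0, x1        // low 64 bits of the product
//   smulh x9, x0, x1        // high 64 bits of the product
//   cmp   x9, x8, asr #63   // overflow iff the high half differs from the
//                           // sign-extension of the low half
// with the NE condition reporting overflow.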
3511 if (Opc) {
3512 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3514 // Emit the AArch64 operation with overflow check.
3515 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3516 Overflow = Value.getValue(1);
3518 return std::make_pair(Value, Overflow);
3521 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3522 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3523 return LowerToScalableOp(Op, DAG);
3525 SDValue Sel = Op.getOperand(0);
3526 SDValue Other = Op.getOperand(1);
3527 SDLoc dl(Sel);
3529 // If the operand is an overflow checking operation, invert the condition
3530 // code and kill the Not operation. I.e., transform:
3531 // (xor overflow_op_bool, 1)
3532 // -->
3533 // (csel 1, 0, invert(cc), overflow_op_bool)
3534 // ... which later gets transformed to just a cset instruction with an
3535 // inverted condition code, rather than a cset + eor sequence.
3536 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3537 // Only lower legal XALUO ops.
3538 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3539 return SDValue();
3541 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3542 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3543 AArch64CC::CondCode CC;
3544 SDValue Value, Overflow;
3545 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3546 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3547 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3548 CCVal, Overflow);
3550 // If neither operand is a SELECT_CC, give up.
3551 if (Sel.getOpcode() != ISD::SELECT_CC)
3552 std::swap(Sel, Other);
3553 if (Sel.getOpcode() != ISD::SELECT_CC)
3554 return Op;
3556 // The folding we want to perform is:
3557 // (xor x, (select_cc a, b, cc, 0, -1) )
3558 // -->
3559 // (csel x, (xor x, -1), cc ...)
3561 // The latter will get matched to a CSINV instruction.
3563 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3564 SDValue LHS = Sel.getOperand(0);
3565 SDValue RHS = Sel.getOperand(1);
3566 SDValue TVal = Sel.getOperand(2);
3567 SDValue FVal = Sel.getOperand(3);
3569 // FIXME: This could be generalized to non-integer comparisons.
3570 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3571 return Op;
3573 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3574 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3576 // The values aren't constants, this isn't the pattern we're looking for.
3577 if (!CFVal || !CTVal)
3578 return Op;
3580 // We can commute the SELECT_CC by inverting the condition. This
3581 // might be needed to make this fit into a CSINV pattern.
3582 if (CTVal->isAllOnes() && CFVal->isZero()) {
3583 std::swap(TVal, FVal);
3584 std::swap(CTVal, CFVal);
3585 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3588 // If the constants line up, perform the transform!
3589 if (CTVal->isZero() && CFVal->isAllOnes()) {
3590 SDValue CCVal;
3591 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3593 FVal = Other;
3594 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3595 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3597 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3598 CCVal, Cmp);
3601 return Op;
3604 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3605 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3606 // sets 'C' bit to 0.
3607 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3608 SDLoc DL(Value);
3609 EVT VT = Value.getValueType();
3610 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3611 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3612 SDValue Cmp =
3613 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3614 return Cmp.getValue(1);
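// A quick sanity check of the non-inverted case: SUBS computes Value - 1 and
// sets C to "no borrow". Value == 0 gives 0 - 1, which borrows, so C == 0; any
// Value >= 1 gives no borrow, so C == 1. The inverted case computes 0 - Value
// and behaves the other way around, matching the comment above.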
3617 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3618 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3619 static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
3620 bool Invert) {
3621 assert(Flag.getResNo() == 1);
3622 SDLoc DL(Flag);
3623 SDValue Zero = DAG.getConstant(0, DL, VT);
3624 SDValue One = DAG.getConstant(1, DL, VT);
3625 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3626 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3627 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3630 // Value is 1 if 'V' bit of NZCV is 1, else 0
3631 static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
3632 assert(Flag.getResNo() == 1);
3633 SDLoc DL(Flag);
3634 SDValue Zero = DAG.getConstant(0, DL, VT);
3635 SDValue One = DAG.getConstant(1, DL, VT);
3636 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3637 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3640 // This lowering is inefficient, but it will get cleaned up by
3641 // `foldOverflowCheck`
3642 static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
3643 bool IsSigned) {
3644 EVT VT0 = Op.getValue(0).getValueType();
3645 EVT VT1 = Op.getValue(1).getValueType();
3647 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3648 return SDValue();
3650 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3651 SDValue OpLHS = Op.getOperand(0);
3652 SDValue OpRHS = Op.getOperand(1);
3653 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3655 SDLoc DL(Op);
3656 SDVTList VTs = DAG.getVTList(VT0, VT1);
3658 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3659 OpRHS, OpCarryIn);
3661 SDValue OutFlag =
3662 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3663 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3665 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3668 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3669 // Let legalize expand this if it isn't a legal type yet.
3670 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3671 return SDValue();
3673 SDLoc dl(Op);
3674 AArch64CC::CondCode CC;
3675 // The actual operation that sets the overflow or carry flag.
3676 SDValue Value, Overflow;
3677 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3679 // We use 0 and 1 as false and true values.
3680 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3681 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3683 // We use an inverted condition, because the conditional select is inverted
3684 // too. This will allow it to be selected to a single instruction:
3685 // CSINC Wd, WZR, WZR, invert(cond).
3686 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3687 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3688 CCVal, Overflow);
3690 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3691 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
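// For example (register choice indicative only), an i32 uadd.with.overflow
// usually ends up as:
//   adds w0, w0, w1        // the value plus the flags
//   cset w1, hs            // the carry flag becomes the i1 overflow result
// where the inverted-condition CSEL built above is what later folds into the
// single CSET/CSINC.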
3694 // Prefetch operands are:
3695 // 1: Address to prefetch
3696 // 2: bool isWrite
3697 // 3: int locality (0 = no locality ... 3 = extreme locality)
3698 // 4: bool isDataCache
3699 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3700 SDLoc DL(Op);
3701 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3702 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3703 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3705 bool IsStream = !Locality;
3706 // When the locality number is set
3707 if (Locality) {
3708 // The front-end should have filtered out the out-of-range values
3709 assert(Locality <= 3 && "Prefetch locality out-of-range");
3710 // The locality degree runs opposite to the target cache level, so put
3711 // the number the other way around: the encoding starts at 0 for the
3712 // L1 cache and increases for the slower levels.
3713 Locality = 3 - Locality;
3716 // Build the mask value encoding the expected behavior.
3717 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3718 (!IsData << 3) | // IsDataCache bit
3719 (Locality << 1) | // Cache level bits
3720 (unsigned)IsStream; // Stream bit
3721 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3722 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
3723 Op.getOperand(1));
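// Worked example of the encoding above: a read prefetch of data with
// locality 3 gives IsWrite = 0, IsStream = 0, !IsData = 0 and a flipped
// Locality of 3 - 3 = 0, so PrfOp = (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0,
// i.e. PLDL1KEEP.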
3726 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3727 SelectionDAG &DAG) const {
3728 EVT VT = Op.getValueType();
3729 if (VT.isScalableVector())
3730 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3732 if (useSVEForFixedLengthVectorVT(VT))
3733 return LowerFixedLengthFPExtendToSVE(Op, DAG);
3735 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3736 return SDValue();
3739 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3740 SelectionDAG &DAG) const {
3741 if (Op.getValueType().isScalableVector())
3742 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3744 bool IsStrict = Op->isStrictFPOpcode();
3745 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3746 EVT SrcVT = SrcVal.getValueType();
3748 if (useSVEForFixedLengthVectorVT(SrcVT))
3749 return LowerFixedLengthFPRoundToSVE(Op, DAG);
3751 if (SrcVT != MVT::f128) {
3752 // Expand cases where the input is a vector bigger than NEON.
3753 if (useSVEForFixedLengthVectorVT(SrcVT))
3754 return SDValue();
3756 // It's legal except when f128 is involved
3757 return Op;
3760 return SDValue();
3763 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3764 SelectionDAG &DAG) const {
3765 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3766 // Any additional optimization in this function should be recorded
3767 // in the cost tables.
3768 bool IsStrict = Op->isStrictFPOpcode();
3769 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
3770 EVT VT = Op.getValueType();
3772 if (VT.isScalableVector()) {
3773 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3774 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3775 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3776 return LowerToPredicatedOp(Op, DAG, Opcode);
3779 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3780 return LowerFixedLengthFPToIntToSVE(Op, DAG);
3782 unsigned NumElts = InVT.getVectorNumElements();
3784 // f16 conversions are promoted to f32 when full fp16 is not supported.
3785 if (InVT.getVectorElementType() == MVT::f16 &&
3786 !Subtarget->hasFullFP16()) {
3787 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3788 SDLoc dl(Op);
3789 if (IsStrict) {
3790 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
3791 {Op.getOperand(0), Op.getOperand(1)});
3792 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3793 {Ext.getValue(1), Ext.getValue(0)});
3795 return DAG.getNode(
3796 Op.getOpcode(), dl, Op.getValueType(),
3797 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3800 uint64_t VTSize = VT.getFixedSizeInBits();
3801 uint64_t InVTSize = InVT.getFixedSizeInBits();
3802 if (VTSize < InVTSize) {
3803 SDLoc dl(Op);
3804 if (IsStrict) {
3805 InVT = InVT.changeVectorElementTypeToInteger();
3806 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
3807 {Op.getOperand(0), Op.getOperand(1)});
3808 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3809 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
3811 SDValue Cv =
3812 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3813 Op.getOperand(0));
3814 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3817 if (VTSize > InVTSize) {
3818 SDLoc dl(Op);
3819 MVT ExtVT =
3820 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3821 VT.getVectorNumElements());
3822 if (IsStrict) {
3823 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
3824 {Op.getOperand(0), Op.getOperand(1)});
3825 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3826 {Ext.getValue(1), Ext.getValue(0)});
3828 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3829 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3832 // Use a scalar operation for conversions between single-element vectors of
3833 // the same size.
3834 if (NumElts == 1) {
3835 SDLoc dl(Op);
3836 SDValue Extract = DAG.getNode(
3837 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
3838 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
3839 EVT ScalarVT = VT.getScalarType();
3840 if (IsStrict)
3841 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
3842 {Op.getOperand(0), Extract});
3843 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
3846 // Type changing conversions are illegal.
3847 return Op;
3850 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3851 SelectionDAG &DAG) const {
3852 bool IsStrict = Op->isStrictFPOpcode();
3853 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3855 if (SrcVal.getValueType().isVector())
3856 return LowerVectorFP_TO_INT(Op, DAG);
3858 // f16 conversions are promoted to f32 when full fp16 is not supported.
3859 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3860 SDLoc dl(Op);
3861 if (IsStrict) {
3862 SDValue Ext =
3863 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3864 {Op.getOperand(0), SrcVal});
3865 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
3866 {Ext.getValue(1), Ext.getValue(0)});
3868 return DAG.getNode(
3869 Op.getOpcode(), dl, Op.getValueType(),
3870 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3873 if (SrcVal.getValueType() != MVT::f128) {
3874 // It's legal except when f128 is involved
3875 return Op;
3878 return SDValue();
3881 SDValue
3882 AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
3883 SelectionDAG &DAG) const {
3884 // AArch64 FP-to-int conversions saturate to the destination element size, so
3885 // we can lower common saturating conversions to simple instructions.
3886 SDValue SrcVal = Op.getOperand(0);
3887 EVT SrcVT = SrcVal.getValueType();
3888 EVT DstVT = Op.getValueType();
3889 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3891 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
3892 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
3893 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3894 assert(SatWidth <= DstElementWidth &&
3895 "Saturation width cannot exceed result width");
3897 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
3898 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
3899 // types, so this is hard to reach.
3900 if (DstVT.isScalableVector())
3901 return SDValue();
3903 EVT SrcElementVT = SrcVT.getVectorElementType();
3905 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3906 if (SrcElementVT == MVT::f16 &&
3907 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
3908 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
3909 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
3910 SrcVT = F32VT;
3911 SrcElementVT = MVT::f32;
3912 SrcElementWidth = 32;
3913 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
3914 SrcElementVT != MVT::f16)
3915 return SDValue();
3917 SDLoc DL(Op);
3918 // Cases that we can emit directly.
3919 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
3920 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3921 DAG.getValueType(DstVT.getScalarType()));
3923 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3924 // result. This is only valid if the legal cvt is larger than the saturate
3925 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
3926 // (at least until sqxtn is selected).
3927 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
3928 return SDValue();
3930 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
3931 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
3932 DAG.getValueType(IntVT.getScalarType()));
3933 SDValue Sat;
3934 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3935 SDValue MinC = DAG.getConstant(
3936 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3937 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
3938 SDValue MaxC = DAG.getConstant(
3939 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3940 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
3941 } else {
3942 SDValue MinC = DAG.getConstant(
3943 APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
3944 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
3947 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
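// For instance (informal sketch): fptosi.sat from v4f32 to v4i16 with an i16
// saturation width first converts with the native saturating fcvtzs to v4i32,
// then clamps with SMIN(32767)/SMAX(-32768) and truncates the result to v4i16.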
3950 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3951 SelectionDAG &DAG) const {
3952 // AArch64 FP-to-int conversions saturate to the destination register size, so
3953 // we can lower common saturating conversions to simple instructions.
3954 SDValue SrcVal = Op.getOperand(0);
3955 EVT SrcVT = SrcVal.getValueType();
3957 if (SrcVT.isVector())
3958 return LowerVectorFP_TO_INT_SAT(Op, DAG);
3960 EVT DstVT = Op.getValueType();
3961 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3962 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3963 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3964 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3966 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3967 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
3968 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
3969 SrcVT = MVT::f32;
3970 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
3971 return SDValue();
3973 SDLoc DL(Op);
3974 // Cases that we can emit directly.
3975 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3976 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3977 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
3978 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3979 DAG.getValueType(DstVT));
3981 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3982 // result. This is only valid if the legal cvt is larger than the saturate
3983 // width.
3984 if (DstWidth < SatWidth)
3985 return SDValue();
3987 SDValue NativeCvt =
3988 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
3989 SDValue Sat;
3990 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3991 SDValue MinC = DAG.getConstant(
3992 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
3993 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
3994 SDValue MaxC = DAG.getConstant(
3995 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
3996 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
3997 } else {
3998 SDValue MinC = DAG.getConstant(
3999 APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
4000 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4003 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4006 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4007 SelectionDAG &DAG) const {
4008 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4009 // Any additional optimization in this function should be recorded
4010 // in the cost tables.
4011 bool IsStrict = Op->isStrictFPOpcode();
4012 EVT VT = Op.getValueType();
4013 SDLoc dl(Op);
4014 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4015 EVT InVT = In.getValueType();
4016 unsigned Opc = Op.getOpcode();
4017 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4019 if (VT.isScalableVector()) {
4020 if (InVT.getVectorElementType() == MVT::i1) {
4021 // We can't convert directly from an SVE predicate; extend it to an integer vector first.
4022 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4023 EVT CastVT = getPromotedVTForPredicate(InVT);
4024 In = DAG.getNode(CastOpc, dl, CastVT, In);
4025 return DAG.getNode(Opc, dl, VT, In);
4028 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4029 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4030 return LowerToPredicatedOp(Op, DAG, Opcode);
4033 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
4034 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4036 uint64_t VTSize = VT.getFixedSizeInBits();
4037 uint64_t InVTSize = InVT.getFixedSizeInBits();
4038 if (VTSize < InVTSize) {
4039 MVT CastVT =
4040 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4041 InVT.getVectorNumElements());
4042 if (IsStrict) {
4043 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4044 {Op.getOperand(0), In});
4045 return DAG.getNode(
4046 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4047 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4049 In = DAG.getNode(Opc, dl, CastVT, In);
4050 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4051 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4054 if (VTSize > InVTSize) {
4055 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4056 EVT CastVT = VT.changeVectorElementTypeToInteger();
4057 In = DAG.getNode(CastOpc, dl, CastVT, In);
4058 if (IsStrict)
4059 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4060 return DAG.getNode(Opc, dl, VT, In);
4063 // Use a scalar operation for conversions between single-element vectors of
4064 // the same size.
4065 if (VT.getVectorNumElements() == 1) {
4066 SDValue Extract = DAG.getNode(
4067 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4068 In, DAG.getConstant(0, dl, MVT::i64));
4069 EVT ScalarVT = VT.getScalarType();
4070 if (IsStrict)
4071 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4072 {Op.getOperand(0), Extract});
4073 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4076 return Op;
4079 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4080 SelectionDAG &DAG) const {
4081 if (Op.getValueType().isVector())
4082 return LowerVectorINT_TO_FP(Op, DAG);
4084 bool IsStrict = Op->isStrictFPOpcode();
4085 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4087 // f16 conversions are promoted to f32 when full fp16 is not supported.
4088 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4089 SDLoc dl(Op);
4090 if (IsStrict) {
4091 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4092 {Op.getOperand(0), SrcVal});
4093 return DAG.getNode(
4094 ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4095 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4097 return DAG.getNode(
4098 ISD::FP_ROUND, dl, MVT::f16,
4099 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4100 DAG.getIntPtrConstant(0, dl));
4103 // i128 conversions are libcalls.
4104 if (SrcVal.getValueType() == MVT::i128)
4105 return SDValue();
4107 // Other conversions are legal, unless it's to the completely software-based
4108 // fp128.
4109 if (Op.getValueType() != MVT::f128)
4110 return Op;
4111 return SDValue();
4114 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4115 SelectionDAG &DAG) const {
4116 // For iOS, we want to call an alternative entry point: __sincos_stret,
4117 // which returns the values in two S / D registers.
4118 SDLoc dl(Op);
4119 SDValue Arg = Op.getOperand(0);
4120 EVT ArgVT = Arg.getValueType();
4121 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4123 ArgListTy Args;
4124 ArgListEntry Entry;
4126 Entry.Node = Arg;
4127 Entry.Ty = ArgTy;
4128 Entry.IsSExt = false;
4129 Entry.IsZExt = false;
4130 Args.push_back(Entry);
4132 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4133 : RTLIB::SINCOS_STRET_F32;
4134 const char *LibcallName = getLibcallName(LC);
4135 SDValue Callee =
4136 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4138 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4139 TargetLowering::CallLoweringInfo CLI(DAG);
4140 CLI.setDebugLoc(dl)
4141 .setChain(DAG.getEntryNode())
4142 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4144 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4145 return CallResult.first;
4148 static MVT getSVEContainerType(EVT ContentTy);
4150 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4151 SelectionDAG &DAG) const {
4152 EVT OpVT = Op.getValueType();
4153 EVT ArgVT = Op.getOperand(0).getValueType();
4155 if (useSVEForFixedLengthVectorVT(OpVT))
4156 return LowerFixedLengthBitcastToSVE(Op, DAG);
4158 if (OpVT.isScalableVector()) {
4159 // Bitcasting between unpacked vector types of different element counts is
4160 // not a NOP because the live elements are laid out differently.
4161 // 01234567
4162 // e.g. nxv2i32 = XX??XX??
4163 // nxv4f16 = X?X?X?X?
4164 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4165 return SDValue();
4167 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4168 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4169 "Expected int->fp bitcast!");
4170 SDValue ExtResult =
4171 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4172 Op.getOperand(0));
4173 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4175 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4178 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4179 return SDValue();
4181 // Bitcasts between f16 and bf16 are legal.
4182 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4183 return Op;
4185 assert(ArgVT == MVT::i16);
4186 SDLoc DL(Op);
4188 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4189 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4190 return SDValue(
4191 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
4192 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
4196 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4197 if (OrigVT.getSizeInBits() >= 64)
4198 return OrigVT;
4200 assert(OrigVT.isSimple() && "Expecting a simple value type");
4202 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4203 switch (OrigSimpleTy) {
4204 default: llvm_unreachable("Unexpected Vector Type");
4205 case MVT::v2i8:
4206 case MVT::v2i16:
4207 return MVT::v2i32;
4208 case MVT::v4i8:
4209 return MVT::v4i16;
4213 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4214 const EVT &OrigTy,
4215 const EVT &ExtTy,
4216 unsigned ExtOpcode) {
4217 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4218 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4219 // 64-bits we need to insert a new extension so that it will be 64-bits.
4220 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4221 if (OrigTy.getSizeInBits() >= 64)
4222 return N;
4224 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4225 EVT NewVT = getExtensionTo64Bits(OrigTy);
4227 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4230 // Returns lane if Op extracts from a two-element vector and lane is constant
4231 // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and None otherwise.
4232 static Optional<uint64_t> getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4233 SDNode *OpNode = Op.getNode();
4234 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4235 return None;
4237 EVT VT = OpNode->getOperand(0).getValueType();
4238 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4239 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4240 return None;
4242 return C->getZExtValue();
4245 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4246 bool isSigned) {
4247 EVT VT = N->getValueType(0);
4249 if (N->getOpcode() != ISD::BUILD_VECTOR)
4250 return false;
4252 for (const SDValue &Elt : N->op_values()) {
4253 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4254 unsigned EltSize = VT.getScalarSizeInBits();
4255 unsigned HalfSize = EltSize / 2;
4256 if (isSigned) {
4257 if (!isIntN(HalfSize, C->getSExtValue()))
4258 return false;
4259 } else {
4260 if (!isUIntN(HalfSize, C->getZExtValue()))
4261 return false;
4263 continue;
4265 return false;
4268 return true;
4271 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
4272 if (N->getOpcode() == ISD::SIGN_EXTEND ||
4273 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
4274 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
4275 N->getOperand(0)->getValueType(0),
4276 N->getValueType(0),
4277 N->getOpcode());
4279 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4280 EVT VT = N->getValueType(0);
4281 SDLoc dl(N);
4282 unsigned EltSize = VT.getScalarSizeInBits() / 2;
4283 unsigned NumElts = VT.getVectorNumElements();
4284 MVT TruncVT = MVT::getIntegerVT(EltSize);
4285 SmallVector<SDValue, 8> Ops;
4286 for (unsigned i = 0; i != NumElts; ++i) {
4287 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4288 const APInt &CInt = C->getAPIntValue();
4289 // Element types smaller than 32 bits are not legal, so use i32 elements.
4290 // The values are implicitly truncated so sext vs. zext doesn't matter.
4291 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4293 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
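// Returns true if N can be treated as a sign-extended value when forming an
// SMULL: an explicit SIGN_EXTEND/ANY_EXTEND, or a BUILD_VECTOR of suitably
// small constants.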
4296 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4297 return N->getOpcode() == ISD::SIGN_EXTEND ||
4298 N->getOpcode() == ISD::ANY_EXTEND ||
4299 isExtendedBUILD_VECTOR(N, DAG, true);
4302 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4303 return N->getOpcode() == ISD::ZERO_EXTEND ||
4304 N->getOpcode() == ISD::ANY_EXTEND ||
4305 isExtendedBUILD_VECTOR(N, DAG, false);
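// Returns true if N is an ADD/SUB whose two single-use operands are both
// sign-extended, so a multiply by N can be distributed into two SMULLs.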
4308 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4309 unsigned Opcode = N->getOpcode();
4310 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4311 SDNode *N0 = N->getOperand(0).getNode();
4312 SDNode *N1 = N->getOperand(1).getNode();
4313 return N0->hasOneUse() && N1->hasOneUse() &&
4314 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4316 return false;
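// As above, but checks for zero-extended operands (feeding UMULL).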
4319 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4320 unsigned Opcode = N->getOpcode();
4321 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4322 SDNode *N0 = N->getOperand(0).getNode();
4323 SDNode *N1 = N->getOperand(1).getNode();
4324 return N0->hasOneUse() && N1->hasOneUse() &&
4325 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4327 return false;
4330 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4331 SelectionDAG &DAG) const {
4332 // On AArch64 the rounding mode is in bits 23:22 of the FPCR.
4333 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4334 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4335 // so that the shift and the mask get folded into a bitfield extract.
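  // For example, if FPCR[23:22] == 0b10 (round toward -infinity), the result is
  // (((2 << 22) + (1 << 22)) >> 22) & 3 == 3, which is the FLT_ROUNDS value for
  // rounding toward -infinity.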
4336 SDLoc dl(Op);
4338 SDValue Chain = Op.getOperand(0);
4339 SDValue FPCR_64 = DAG.getNode(
4340 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4341 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4342 Chain = FPCR_64.getValue(1);
4343 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4344 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4345 DAG.getConstant(1U << 22, dl, MVT::i32));
4346 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4347 DAG.getConstant(22, dl, MVT::i32));
4348 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4349 DAG.getConstant(3, dl, MVT::i32));
4350 return DAG.getMergeValues({AND, Chain}, dl);
4353 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4354 SelectionDAG &DAG) const {
4355 SDLoc DL(Op);
4356 SDValue Chain = Op->getOperand(0);
4357 SDValue RMValue = Op->getOperand(1);
4359 // The rounding mode is in bits 23:22 of the FPCR.
4360 // The mapping from the llvm.set.rounding argument value to the rounding mode
4361 // in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4362 // ((arg - 1) & 3) << 22.
4364 // The argument of llvm.set.rounding must be within the range [0, 3], so
4365 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4366 // code that generates llvm.set.rounding to ensure this condition.
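  // For example, an argument of 0 (round toward zero) becomes ((0 - 1) & 3) == 3,
  // i.e. FPCR.RMode == 0b11, the round-toward-zero encoding.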
4368 // Calculate new value of FPCR[23:22].
4369 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4370 DAG.getConstant(1, DL, MVT::i32));
4371 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4372 DAG.getConstant(0x3, DL, MVT::i32));
4373 RMValue =
4374 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4375 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4376 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4378 // Get current value of FPCR.
4379 SDValue Ops[] = {
4380 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4381 SDValue FPCR =
4382 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4383 Chain = FPCR.getValue(1);
4384 FPCR = FPCR.getValue(0);
4386 // Put the new rounding mode into FPCR[23:22].
4387 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4388 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4389 DAG.getConstant(RMMask, DL, MVT::i64));
4390 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4391 SDValue Ops2[] = {
4392 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4393 FPCR};
4394 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
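// Selects SMULL or UMULL (or returns 0 if neither applies) for a MUL whose
// operands N0 and N1 are extended values. IsMLA is set, and the operands are
// possibly swapped, when the multiply should instead be distributed over an
// extended ADD/SUB.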
4397 static unsigned selectUmullSmull(SDNode *&N0, SDNode *&N1, SelectionDAG &DAG,
4398 bool &IsMLA) {
4399 bool IsN0SExt = isSignExtended(N0, DAG);
4400 bool IsN1SExt = isSignExtended(N1, DAG);
4401 if (IsN0SExt && IsN1SExt)
4402 return AArch64ISD::SMULL;
4404 bool IsN0ZExt = isZeroExtended(N0, DAG);
4405 bool IsN1ZExt = isZeroExtended(N1, DAG);
4407 if (IsN0ZExt && IsN1ZExt)
4408 return AArch64ISD::UMULL;
4410 if (!IsN1SExt && !IsN1ZExt)
4411 return 0;
4412 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4413 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4414 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
4415 IsMLA = true;
4416 return AArch64ISD::SMULL;
4418 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
4419 IsMLA = true;
4420 return AArch64ISD::UMULL;
4422 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
4423 std::swap(N0, N1);
4424 IsMLA = true;
4425 return AArch64ISD::UMULL;
4427 return 0;
4430 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4431 EVT VT = Op.getValueType();
4433 // If SVE is available then i64 vector multiplications can also be made legal.
4434 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
4436 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4437 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4439 // Multiplications are only custom-lowered for 128-bit vectors so that
4440 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4441 assert(VT.is128BitVector() && VT.isInteger() &&
4442 "unexpected type for custom-lowering ISD::MUL");
4443 SDNode *N0 = Op.getOperand(0).getNode();
4444 SDNode *N1 = Op.getOperand(1).getNode();
4445 bool isMLA = false;
4446 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, isMLA);
4448 if (!NewOpc) {
4449 if (VT == MVT::v2i64)
4450 // Fall through to expand this. It is not legal.
4451 return SDValue();
4452 else
4453 // Other vector multiplications are legal.
4454 return Op;
4457 // Legalize to a S/UMULL instruction
4458 SDLoc DL(Op);
4459 SDValue Op0;
4460 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4461 if (!isMLA) {
4462 Op0 = skipExtensionForVectorMULL(N0, DAG);
4463 assert(Op0.getValueType().is64BitVector() &&
4464 Op1.getValueType().is64BitVector() &&
4465 "unexpected types for extended operands to VMULL");
4466 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4468 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
4469 // isel lowering to take advantage of no-stall back-to-back s/umull + s/umlal.
4470 // This helps on CPUs with accumulate forwarding such as Cortex-A53/A57.
4471 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
4472 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
4473 EVT Op1VT = Op1.getValueType();
4474 return DAG.getNode(N0->getOpcode(), DL, VT,
4475 DAG.getNode(NewOpc, DL, VT,
4476 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4477 DAG.getNode(NewOpc, DL, VT,
4478 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
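// Materializes an SVE predicate that is all-true for the given PTRUE pattern;
// the all-active nxv1i1 case is emitted directly as a constant.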
4481 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4482 int Pattern) {
4483 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4484 return DAG.getConstant(1, DL, MVT::nxv1i1);
4485 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4486 DAG.getTargetConstant(Pattern, DL, MVT::i32));
4489 // Returns a safe bitcast between two scalable vector predicates, where
4490 // any newly created lanes from a widening bitcast are defined as zero.
4491 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
4492 SDLoc DL(Op);
4493 EVT InVT = Op.getValueType();
4495 assert(InVT.getVectorElementType() == MVT::i1 &&
4496 VT.getVectorElementType() == MVT::i1 &&
4497 "Expected a predicate-to-predicate bitcast");
4498 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
4499 InVT.isScalableVector() &&
4500 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
4501 "Only expect to cast between legal scalable predicate types!");
4503 // Return the operand if the cast isn't changing type,
4504 // e.g. <n x 16 x i1> -> <n x 16 x i1>
4505 if (InVT == VT)
4506 return Op;
4508 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
4510 // We only have to zero the lanes if new lanes are being defined, e.g. when
4511 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
4512 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
4513 // we can return here.
4514 if (InVT.bitsGT(VT))
4515 return Reinterpret;
4517 // Check if the other lanes are already known to be zeroed by
4518 // construction.
4519 if (isZeroingInactiveLanes(Op))
4520 return Reinterpret;
4522 // Zero the newly introduced lanes.
4523 SDValue Mask = DAG.getConstant(1, DL, InVT);
4524 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
4525 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
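// Returns a value of type VT holding the current value of PSTATE.SM: a
// constant 1 or 0 when the function's SME attributes pin the streaming mode,
// otherwise the result of calling __arm_sme_state masked down to bit 0.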
4528 SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
4529 SMEAttrs Attrs, SDLoc DL,
4530 EVT VT) const {
4531 if (Attrs.hasStreamingInterfaceOrBody())
4532 return DAG.getConstant(1, DL, VT);
4534 if (Attrs.hasNonStreamingInterfaceAndBody())
4535 return DAG.getConstant(0, DL, VT);
4537 assert(Attrs.hasStreamingCompatibleInterface() && "Unexpected interface");
4539 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
4540 getPointerTy(DAG.getDataLayout()));
4541 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
4542 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
4543 TargetLowering::CallLoweringInfo CLI(DAG);
4544 ArgListTy Args;
4545 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
4546 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
4547 RetTy, Callee, std::move(Args));
4548 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4549 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
4550 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
4551 Mask);
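// SME ABI support routines are external symbols rather than IR Functions, so
// their SME attributes cannot be queried directly; return the known attributes
// for the recognised symbols, or None otherwise.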
4554 static Optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) {
4555 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) {
4556 StringRef S(ES->getSymbol());
4557 if (S == "__arm_sme_state" || S == "__arm_tpidr2_save")
4558 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved);
4559 if (S == "__arm_tpidr2_restore")
4560 return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared);
4562 return None;
4565 SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4566 SelectionDAG &DAG) const {
4567 unsigned IntNo = Op.getConstantOperandVal(1);
4568 SDLoc DL(Op);
4569 switch (IntNo) {
4570 default:
4571 return SDValue(); // Don't custom lower most intrinsics.
4572 case Intrinsic::aarch64_mops_memset_tag: {
4573 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
4574 SDValue Chain = Node->getChain();
4575 SDValue Dst = Op.getOperand(2);
4576 SDValue Val = Op.getOperand(3);
4577 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
4578 SDValue Size = Op.getOperand(4);
4579 auto Alignment = Node->getMemOperand()->getAlign();
4580 bool IsVol = Node->isVolatile();
4581 auto DstPtrInfo = Node->getPointerInfo();
4583 const auto &SDI =
4584 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
4585 SDValue MS =
4586 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
4587 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
4589 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
4590 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
4591 // LowerOperationWrapper will complain that the number of results has
4592 // changed.
4593 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
4595 case Intrinsic::aarch64_sme_za_enable:
4596 return DAG.getNode(
4597 AArch64ISD::SMSTART, DL, MVT::Other,
4598 Op->getOperand(0), // Chain
4599 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4600 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4601 case Intrinsic::aarch64_sme_za_disable:
4602 return DAG.getNode(
4603 AArch64ISD::SMSTOP, DL, MVT::Other,
4604 Op->getOperand(0), // Chain
4605 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
4606 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
4610 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4611 SelectionDAG &DAG) const {
4612 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4613 SDLoc dl(Op);
4614 switch (IntNo) {
4615 default: return SDValue(); // Don't custom lower most intrinsics.
4616 case Intrinsic::thread_pointer: {
4617 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4618 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
4620 case Intrinsic::aarch64_neon_abs: {
4621 EVT Ty = Op.getValueType();
4622 if (Ty == MVT::i64) {
4623 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
4624 Op.getOperand(1));
4625 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
4626 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
4627 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
4628 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
4629 } else {
4630 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
4633 case Intrinsic::aarch64_neon_pmull64: {
4634 SDValue LHS = Op.getOperand(1);
4635 SDValue RHS = Op.getOperand(2);
4637 Optional<uint64_t> LHSLane = getConstantLaneNumOfExtractHalfOperand(LHS);
4638 Optional<uint64_t> RHSLane = getConstantLaneNumOfExtractHalfOperand(RHS);
4640 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
4641 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
4643 // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
4644 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
4645 // which ISel recognizes better. For example, this generates a ldr into a d*
4646 // register as opposed to a GPR load followed by a fmov.
4647 auto TryVectorizeOperand =
4648 [](SDValue N, Optional<uint64_t> NLane, Optional<uint64_t> OtherLane,
4649 const SDLoc &dl, SelectionDAG &DAG) -> SDValue {
4650 // If the operand is a higher half itself, rewrite it to
4651 // extract_high_v2i64; this way aarch64_neon_pmull64 can
4652 // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
4653 if (NLane && *NLane == 1)
4654 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4655 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
4657 // Operand N is not a higher half but the other operand is.
4658 if (OtherLane && *OtherLane == 1) {
4659 // If this operand is a lower half, rewrite it to
4660 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a round trip through
4661 // the GPRs to align the lanes of the two operands. Such a round trip (moving
4662 // from lane 1 to lane 0) looks like this:
4663 // mov x8, v0.d[1]
4664 // fmov d0, x8
4665 if (NLane && *NLane == 0)
4666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
4667 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
4668 N.getOperand(0),
4669 DAG.getConstant(0, dl, MVT::i64)),
4670 DAG.getConstant(1, dl, MVT::i64));
4672 // Otherwise just dup the scalar operand to all lanes.
4673 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
4676 // Neither operand is an extract of the higher half, so codegen may just use
4677 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
4678 assert(N.getValueType() == MVT::i64 &&
4679 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
4680 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
4683 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
4684 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
4686 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
4688 case Intrinsic::aarch64_neon_smax:
4689 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
4690 Op.getOperand(1), Op.getOperand(2));
4691 case Intrinsic::aarch64_neon_umax:
4692 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
4693 Op.getOperand(1), Op.getOperand(2));
4694 case Intrinsic::aarch64_neon_smin:
4695 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
4696 Op.getOperand(1), Op.getOperand(2));
4697 case Intrinsic::aarch64_neon_umin:
4698 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
4699 Op.getOperand(1), Op.getOperand(2));
4700 case Intrinsic::aarch64_neon_scalar_sqxtn:
4701 case Intrinsic::aarch64_neon_scalar_sqxtun:
4702 case Intrinsic::aarch64_neon_scalar_uqxtn: {
4703 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
4704 if (Op.getValueType() == MVT::i32)
4705 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
4706 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
4707 Op.getOperand(0),
4708 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
4709 Op.getOperand(1))));
4710 return SDValue();
4712 case Intrinsic::aarch64_sve_sunpkhi:
4713 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
4714 Op.getOperand(1));
4715 case Intrinsic::aarch64_sve_sunpklo:
4716 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
4717 Op.getOperand(1));
4718 case Intrinsic::aarch64_sve_uunpkhi:
4719 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
4720 Op.getOperand(1));
4721 case Intrinsic::aarch64_sve_uunpklo:
4722 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
4723 Op.getOperand(1));
4724 case Intrinsic::aarch64_sve_clasta_n:
4725 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
4726 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4727 case Intrinsic::aarch64_sve_clastb_n:
4728 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
4729 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4730 case Intrinsic::aarch64_sve_lasta:
4731 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
4732 Op.getOperand(1), Op.getOperand(2));
4733 case Intrinsic::aarch64_sve_lastb:
4734 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
4735 Op.getOperand(1), Op.getOperand(2));
4736 case Intrinsic::aarch64_sve_rev:
4737 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
4738 Op.getOperand(1));
4739 case Intrinsic::aarch64_sve_tbl:
4740 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
4741 Op.getOperand(1), Op.getOperand(2));
4742 case Intrinsic::aarch64_sve_trn1:
4743 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
4744 Op.getOperand(1), Op.getOperand(2));
4745 case Intrinsic::aarch64_sve_trn2:
4746 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
4747 Op.getOperand(1), Op.getOperand(2));
4748 case Intrinsic::aarch64_sve_uzp1:
4749 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
4750 Op.getOperand(1), Op.getOperand(2));
4751 case Intrinsic::aarch64_sve_uzp2:
4752 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
4753 Op.getOperand(1), Op.getOperand(2));
4754 case Intrinsic::aarch64_sve_zip1:
4755 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
4756 Op.getOperand(1), Op.getOperand(2));
4757 case Intrinsic::aarch64_sve_zip2:
4758 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
4759 Op.getOperand(1), Op.getOperand(2));
4760 case Intrinsic::aarch64_sve_splice:
4761 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
4762 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4763 case Intrinsic::aarch64_sve_ptrue:
4764 return getPTrue(DAG, dl, Op.getValueType(),
4765 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
4766 case Intrinsic::aarch64_sve_clz:
4767 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
4768 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4769 case Intrinsic::aarch64_sme_cntsb:
4770 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4771 DAG.getConstant(1, dl, MVT::i32));
4772 case Intrinsic::aarch64_sme_cntsh: {
4773 SDValue One = DAG.getConstant(1, dl, MVT::i32);
4774 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
4775 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
4777 case Intrinsic::aarch64_sme_cntsw: {
4778 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4779 DAG.getConstant(1, dl, MVT::i32));
4780 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4781 DAG.getConstant(2, dl, MVT::i32));
4783 case Intrinsic::aarch64_sme_cntsd: {
4784 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4785 DAG.getConstant(1, dl, MVT::i32));
4786 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4787 DAG.getConstant(3, dl, MVT::i32));
4789 case Intrinsic::aarch64_sve_cnt: {
4790 SDValue Data = Op.getOperand(3);
4791 // CTPOP only supports integer operands.
4792 if (Data.getValueType().isFloatingPoint())
4793 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
4794 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
4795 Op.getOperand(2), Data, Op.getOperand(1));
4797 case Intrinsic::aarch64_sve_dupq_lane:
4798 return LowerDUPQLane(Op, DAG);
4799 case Intrinsic::aarch64_sve_convert_from_svbool:
4800 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
4801 case Intrinsic::aarch64_sve_convert_to_svbool:
4802 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
4803 case Intrinsic::aarch64_sve_fneg:
4804 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4805 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4806 case Intrinsic::aarch64_sve_frintp:
4807 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
4808 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4809 case Intrinsic::aarch64_sve_frintm:
4810 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
4811 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4812 case Intrinsic::aarch64_sve_frinti:
4813 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4814 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4815 case Intrinsic::aarch64_sve_frintx:
4816 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4817 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4818 case Intrinsic::aarch64_sve_frinta:
4819 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
4820 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4821 case Intrinsic::aarch64_sve_frintn:
4822 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
4823 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4824 case Intrinsic::aarch64_sve_frintz:
4825 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
4826 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4827 case Intrinsic::aarch64_sve_ucvtf:
4828 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
4829 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4830 Op.getOperand(1));
4831 case Intrinsic::aarch64_sve_scvtf:
4832 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
4833 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4834 Op.getOperand(1));
4835 case Intrinsic::aarch64_sve_fcvtzu:
4836 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
4837 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4838 Op.getOperand(1));
4839 case Intrinsic::aarch64_sve_fcvtzs:
4840 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
4841 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4842 Op.getOperand(1));
4843 case Intrinsic::aarch64_sve_fsqrt:
4844 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4845 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4846 case Intrinsic::aarch64_sve_frecpx:
4847 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4848 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4849 case Intrinsic::aarch64_sve_frecpe_x:
4850 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
4851 Op.getOperand(1));
4852 case Intrinsic::aarch64_sve_frecps_x:
4853 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
4854 Op.getOperand(1), Op.getOperand(2));
4855 case Intrinsic::aarch64_sve_frsqrte_x:
4856 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
4857 Op.getOperand(1));
4858 case Intrinsic::aarch64_sve_frsqrts_x:
4859 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
4860 Op.getOperand(1), Op.getOperand(2));
4861 case Intrinsic::aarch64_sve_fabs:
4862 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4863 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4864 case Intrinsic::aarch64_sve_abs:
4865 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4866 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4867 case Intrinsic::aarch64_sve_neg:
4868 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4869 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4870 case Intrinsic::aarch64_sve_insr: {
4871 SDValue Scalar = Op.getOperand(2);
4872 EVT ScalarTy = Scalar.getValueType();
4873 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4874 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4876 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4877 Op.getOperand(1), Scalar);
4879 case Intrinsic::aarch64_sve_rbit:
4880 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
4881 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4882 Op.getOperand(1));
4883 case Intrinsic::aarch64_sve_revb:
4884 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4885 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4886 case Intrinsic::aarch64_sve_revh:
4887 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
4888 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4889 case Intrinsic::aarch64_sve_revw:
4890 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
4891 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4892 case Intrinsic::aarch64_sve_revd:
4893 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
4894 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4895 case Intrinsic::aarch64_sve_sxtb:
4896 return DAG.getNode(
4897 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4898 Op.getOperand(2), Op.getOperand(3),
4899 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4900 Op.getOperand(1));
4901 case Intrinsic::aarch64_sve_sxth:
4902 return DAG.getNode(
4903 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4904 Op.getOperand(2), Op.getOperand(3),
4905 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4906 Op.getOperand(1));
4907 case Intrinsic::aarch64_sve_sxtw:
4908 return DAG.getNode(
4909 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4910 Op.getOperand(2), Op.getOperand(3),
4911 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4912 Op.getOperand(1));
4913 case Intrinsic::aarch64_sve_uxtb:
4914 return DAG.getNode(
4915 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4916 Op.getOperand(2), Op.getOperand(3),
4917 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4918 Op.getOperand(1));
4919 case Intrinsic::aarch64_sve_uxth:
4920 return DAG.getNode(
4921 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4922 Op.getOperand(2), Op.getOperand(3),
4923 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4924 Op.getOperand(1));
4925 case Intrinsic::aarch64_sve_uxtw:
4926 return DAG.getNode(
4927 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4928 Op.getOperand(2), Op.getOperand(3),
4929 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4930 Op.getOperand(1));
4931 case Intrinsic::localaddress: {
4932 const auto &MF = DAG.getMachineFunction();
4933 const auto *RegInfo = Subtarget->getRegisterInfo();
4934 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4935 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4936 Op.getSimpleValueType());
4939 case Intrinsic::eh_recoverfp: {
4940 // FIXME: This needs to be implemented to correctly handle highly aligned
4941 // stack objects. For now we simply return the incoming FP. Refer to D53541
4942 // for more details.
4943 SDValue FnOp = Op.getOperand(1);
4944 SDValue IncomingFPOp = Op.getOperand(2);
4945 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
4946 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4947 if (!Fn)
4948 report_fatal_error(
4949 "llvm.eh.recoverfp must take a function as the first argument");
4950 return IncomingFPOp;
4953 case Intrinsic::aarch64_neon_vsri:
4954 case Intrinsic::aarch64_neon_vsli: {
4955 EVT Ty = Op.getValueType();
4957 if (!Ty.isVector())
4958 report_fatal_error("Unexpected type for aarch64_neon_vsli");
4960 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4962 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4963 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4964 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4965 Op.getOperand(3));
4968 case Intrinsic::aarch64_neon_srhadd:
4969 case Intrinsic::aarch64_neon_urhadd:
4970 case Intrinsic::aarch64_neon_shadd:
4971 case Intrinsic::aarch64_neon_uhadd: {
4972 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4973 IntNo == Intrinsic::aarch64_neon_shadd);
4974 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4975 IntNo == Intrinsic::aarch64_neon_urhadd);
4976 unsigned Opcode = IsSignedAdd
4977 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
4978 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
4979 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4980 Op.getOperand(2));
4982 case Intrinsic::aarch64_neon_sabd:
4983 case Intrinsic::aarch64_neon_uabd: {
4984 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4985 : ISD::ABDS;
4986 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4987 Op.getOperand(2));
4989 case Intrinsic::aarch64_neon_saddlp:
4990 case Intrinsic::aarch64_neon_uaddlp: {
4991 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
4992 ? AArch64ISD::UADDLP
4993 : AArch64ISD::SADDLP;
4994 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4996 case Intrinsic::aarch64_neon_sdot:
4997 case Intrinsic::aarch64_neon_udot:
4998 case Intrinsic::aarch64_sve_sdot:
4999 case Intrinsic::aarch64_sve_udot: {
5000 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5001 IntNo == Intrinsic::aarch64_sve_udot)
5002 ? AArch64ISD::UDOT
5003 : AArch64ISD::SDOT;
5004 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5005 Op.getOperand(2), Op.getOperand(3));
5007 case Intrinsic::get_active_lane_mask: {
5008 SDValue ID =
5009 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5010 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
5011 Op.getOperand(1), Op.getOperand(2));
5016 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5017 if (VT.getVectorElementType() == MVT::i8 ||
5018 VT.getVectorElementType() == MVT::i16) {
5019 EltTy = MVT::i32;
5020 return true;
5022 return false;
5025 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
5026 EVT DataVT) const {
5027 // SVE only supports implicit extension of 32-bit indices.
5028 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5029 return false;
5031 // Indices cannot be smaller than the main data type.
5032 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5033 return false;
5035 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5036 // element container type, which would violate the previous clause.
5037 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5040 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5041 return ExtVal.getValueType().isScalableVector() ||
5042 useSVEForFixedLengthVectorVT(
5043 ExtVal.getValueType(),
5044 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
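// Picks the (zero-extending) SVE gather-load opcode for the given addressing
// mode: scaled vs. unscaled offsets and, for offsets that need extension,
// signed (SXTW) vs. unsigned (UXTW) extension.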
5047 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5048 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5049 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5050 AArch64ISD::GLD1_MERGE_ZERO},
5051 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5052 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5053 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5054 AArch64ISD::GLD1_MERGE_ZERO},
5055 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5056 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5057 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5058 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5059 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5060 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5061 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5062 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5063 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5064 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5066 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5067 return AddrModes.find(Key)->second;
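// Maps a zero-extending gather opcode to its sign-extending (GLD1S*)
// counterpart.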
5070 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5071 switch (Opcode) {
5072 default:
5073 llvm_unreachable("unimplemented opcode");
5074 return Opcode;
5075 case AArch64ISD::GLD1_MERGE_ZERO:
5076 return AArch64ISD::GLD1S_MERGE_ZERO;
5077 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5078 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5079 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5080 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5081 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5082 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5083 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5084 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5085 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5086 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5087 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5088 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5092 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5093 SelectionDAG &DAG) const {
5094 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5096 SDLoc DL(Op);
5097 SDValue Chain = MGT->getChain();
5098 SDValue PassThru = MGT->getPassThru();
5099 SDValue Mask = MGT->getMask();
5100 SDValue BasePtr = MGT->getBasePtr();
5101 SDValue Index = MGT->getIndex();
5102 SDValue Scale = MGT->getScale();
5103 EVT VT = Op.getValueType();
5104 EVT MemVT = MGT->getMemoryVT();
5105 ISD::LoadExtType ExtType = MGT->getExtensionType();
5106 ISD::MemIndexType IndexType = MGT->getIndexType();
5108 // SVE supports zero (and so undef) passthrough values only; everything else
5109 // must be handled manually by an explicit select on the load's output.
5110 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5111 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5112 SDValue Load =
5113 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5114 MGT->getMemOperand(), IndexType, ExtType);
5115 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5116 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5119 bool IsScaled = MGT->isIndexScaled();
5120 bool IsSigned = MGT->isIndexSigned();
5122 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5123 // must be calculated beforehand.
5124 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5125 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5126 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5127 EVT IndexVT = Index.getValueType();
5128 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5129 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5130 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5132 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5133 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5134 MGT->getMemOperand(), IndexType, ExtType);
5137 // Lower fixed length gather to a scalable equivalent.
5138 if (VT.isFixedLengthVector()) {
5139 assert(Subtarget->useSVEForFixedLengthVectors() &&
5140 "Cannot lower when not using SVE for fixed vectors!");
5142 // NOTE: Handle floating-point as if integer then bitcast the result.
5143 EVT DataVT = VT.changeVectorElementTypeToInteger();
5144 MemVT = MemVT.changeVectorElementTypeToInteger();
5146 // Find the smallest integer fixed length vector we can use for the gather.
5147 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5148 if (DataVT.getVectorElementType() == MVT::i64 ||
5149 Index.getValueType().getVectorElementType() == MVT::i64 ||
5150 Mask.getValueType().getVectorElementType() == MVT::i64)
5151 PromotedVT = VT.changeVectorElementType(MVT::i64);
5153 // Promote vector operands except for passthrough, which we know is either
5154 // undef or zero, and thus best constructed directly.
5155 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5156 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5157 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5159 // A promoted result type forces the need for an extending load.
5160 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5161 ExtType = ISD::EXTLOAD;
5163 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5165 // Convert fixed length vector operands to scalable.
5166 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5167 Index = convertToScalableVector(DAG, ContainerVT, Index);
5168 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5169 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5170 : DAG.getConstant(0, DL, ContainerVT);
5172 // Emit equivalent scalable vector gather.
5173 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5174 SDValue Load =
5175 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5176 Ops, MGT->getMemOperand(), IndexType, ExtType);
5178 // Extract fixed length data then convert to the required result type.
5179 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5180 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5181 if (VT.isFloatingPoint())
5182 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5184 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5187 // Everything else is legal.
5188 return Op;
5191 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5192 SelectionDAG &DAG) const {
5193 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5195 SDLoc DL(Op);
5196 SDValue Chain = MSC->getChain();
5197 SDValue StoreVal = MSC->getValue();
5198 SDValue Mask = MSC->getMask();
5199 SDValue BasePtr = MSC->getBasePtr();
5200 SDValue Index = MSC->getIndex();
5201 SDValue Scale = MSC->getScale();
5202 EVT VT = StoreVal.getValueType();
5203 EVT MemVT = MSC->getMemoryVT();
5204 ISD::MemIndexType IndexType = MSC->getIndexType();
5205 bool Truncating = MSC->isTruncatingStore();
5207 bool IsScaled = MSC->isIndexScaled();
5208 bool IsSigned = MSC->isIndexSigned();
5210 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5211 // must be calculated beforehand.
5212 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5213 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5214 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5215 EVT IndexVT = Index.getValueType();
5216 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5217 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5218 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5220 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5221 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5222 MSC->getMemOperand(), IndexType, Truncating);
5225 // Lower fixed length scatter to a scalable equivalent.
5226 if (VT.isFixedLengthVector()) {
5227 assert(Subtarget->useSVEForFixedLengthVectors() &&
5228 "Cannot lower when not using SVE for fixed vectors!");
5230 // Once bitcast we treat floating-point scatters as if integer.
5231 if (VT.isFloatingPoint()) {
5232 VT = VT.changeVectorElementTypeToInteger();
5233 MemVT = MemVT.changeVectorElementTypeToInteger();
5234 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5237 // Find the smallest integer fixed length vector we can use for the scatter.
5238 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5239 if (VT.getVectorElementType() == MVT::i64 ||
5240 Index.getValueType().getVectorElementType() == MVT::i64 ||
5241 Mask.getValueType().getVectorElementType() == MVT::i64)
5242 PromotedVT = VT.changeVectorElementType(MVT::i64);
5244 // Promote vector operands.
5245 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5246 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5247 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5248 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5250 // A promoted value type forces the need for a truncating store.
5251 if (PromotedVT != VT)
5252 Truncating = true;
5254 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5256 // Convert fixed length vector operands to scalable.
5257 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5258 Index = convertToScalableVector(DAG, ContainerVT, Index);
5259 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5260 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5262 // Emit equivalent scalable vector scatter.
5263 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5264 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5265 MSC->getMemOperand(), IndexType, Truncating);
5268 // Everything else is legal.
5269 return Op;
5272 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5273 SDLoc DL(Op);
5274 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5275 assert(LoadNode && "Expected custom lowering of a masked load node");
5276 EVT VT = Op->getValueType(0);
5278 if (useSVEForFixedLengthVectorVT(
5280 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5281 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5283 SDValue PassThru = LoadNode->getPassThru();
5284 SDValue Mask = LoadNode->getMask();
5286 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5287 return Op;
5289 SDValue Load = DAG.getMaskedLoad(
5290 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5291 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5292 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5293 LoadNode->getExtensionType());
5295 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5297 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5300 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
5301 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
5302 EVT VT, EVT MemVT,
5303 SelectionDAG &DAG) {
5304 assert(VT.isVector() && "VT should be a vector type");
5305 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5307 SDValue Value = ST->getValue();
5309 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
5310 // the word lane which represents the v4i8 subvector. It optimizes the store
5311 // to:
5313 // xtn v0.8b, v0.8h
5314 // str s0, [x0]
5316 SDValue Undef = DAG.getUNDEF(MVT::i16);
5317 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5318 {Undef, Undef, Undef, Undef});
5320 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5321 Value, UndefVec);
5322 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5324 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5325 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5326 Trunc, DAG.getConstant(0, DL, MVT::i64));
5328 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5329 ST->getBasePtr(), ST->getMemOperand());
5332 // Custom lowering for any store, vector or scalar, truncating or not. The
5333 // main cases handled here are the truncating store from v4i16 to v4i8 and
5334 // volatile stores of i128.
5335 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5336 SelectionDAG &DAG) const {
5337 SDLoc Dl(Op);
5338 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
5339 assert (StoreNode && "Can only custom lower store nodes");
5341 SDValue Value = StoreNode->getValue();
5343 EVT VT = Value.getValueType();
5344 EVT MemVT = StoreNode->getMemoryVT();
5346 if (VT.isVector()) {
5347 if (useSVEForFixedLengthVectorVT(
5349 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5350 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5352 unsigned AS = StoreNode->getAddressSpace();
5353 Align Alignment = StoreNode->getAlign();
5354 if (Alignment < MemVT.getStoreSize() &&
5355 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5356 StoreNode->getMemOperand()->getFlags(),
5357 nullptr)) {
5358 return scalarizeVectorStore(StoreNode, DAG);
5361 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5362 MemVT == MVT::v4i8) {
5363 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5365 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
5366 // the custom lowering, as there are no un-paired non-temporal stores and
5367 // legalization will break up 256-bit inputs.
5368 ElementCount EC = MemVT.getVectorElementCount();
5369 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5370 EC.isKnownEven() &&
5371 ((MemVT.getScalarSizeInBits() == 8u ||
5372 MemVT.getScalarSizeInBits() == 16u ||
5373 MemVT.getScalarSizeInBits() == 32u ||
5374 MemVT.getScalarSizeInBits() == 64u))) {
5375 SDValue Lo =
5376 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5377 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5378 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5379 SDValue Hi =
5380 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5381 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5382 StoreNode->getValue(),
5383 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
5384 SDValue Result = DAG.getMemIntrinsicNode(
5385 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5386 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5387 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5388 return Result;
5390 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5391 return LowerStore128(Op, DAG);
5392 } else if (MemVT == MVT::i64x8) {
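  // Split a 512-bit LS64 value into eight consecutive i64 stores.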
5393 SDValue Value = StoreNode->getValue();
5394 assert(Value->getValueType(0) == MVT::i64x8);
5395 SDValue Chain = StoreNode->getChain();
5396 SDValue Base = StoreNode->getBasePtr();
5397 EVT PtrVT = Base.getValueType();
5398 for (unsigned i = 0; i < 8; i++) {
5399 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5400 Value, DAG.getConstant(i, Dl, MVT::i32));
5401 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5402 DAG.getConstant(i * 8, Dl, PtrVT));
5403 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5404 StoreNode->getOriginalAlign());
5406 return Chain;
5409 return SDValue();
5412 /// Lower atomic or volatile 128-bit stores to a single STP instruction.
5413 SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5414 SelectionDAG &DAG) const {
5415 MemSDNode *StoreNode = cast<MemSDNode>(Op);
5416 assert(StoreNode->getMemoryVT() == MVT::i128);
5417 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5418 assert(!StoreNode->isAtomic() ||
5419 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
5420 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
5422 SDValue Value = StoreNode->getOpcode() == ISD::STORE
5423 ? StoreNode->getOperand(1)
5424 : StoreNode->getOperand(2);
5425 SDLoc DL(Op);
5426 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5427 DAG.getConstant(0, DL, MVT::i64));
5428 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5429 DAG.getConstant(1, DL, MVT::i64));
5430 SDValue Result = DAG.getMemIntrinsicNode(
5431 AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
5432 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5433 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5434 return Result;
5437 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5438 SelectionDAG &DAG) const {
5439 SDLoc DL(Op);
5440 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5441 assert(LoadNode && "Expected custom lowering of a load node");
5443 if (LoadNode->getMemoryVT() == MVT::i64x8) {
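  // Reassemble a 512-bit LS64 value from eight consecutive i64 loads.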
5444 SmallVector<SDValue, 8> Ops;
5445 SDValue Base = LoadNode->getBasePtr();
5446 SDValue Chain = LoadNode->getChain();
5447 EVT PtrVT = Base.getValueType();
5448 for (unsigned i = 0; i < 8; i++) {
5449 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5450 DAG.getConstant(i * 8, DL, PtrVT));
5451 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5452 LoadNode->getPointerInfo(),
5453 LoadNode->getOriginalAlign());
5454 Ops.push_back(Part);
5455 Chain = SDValue(Part.getNode(), 1);
5457 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5458 return DAG.getMergeValues({Loaded, Chain}, DL);
5461 // Custom lowering for extending v4i8 vector loads.
5462 EVT VT = Op->getValueType(0);
5463 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
5465 if (LoadNode->getMemoryVT() != MVT::v4i8)
5466 return SDValue();
5468 unsigned ExtType;
5469 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
5470 ExtType = ISD::SIGN_EXTEND;
5471 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
5472 LoadNode->getExtensionType() == ISD::EXTLOAD)
5473 ExtType = ISD::ZERO_EXTEND;
5474 else
5475 return SDValue();
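  // Load the four i8 elements as a single 32-bit (f32) value so the data lands
  // in a SIMD register, then widen it in-vector.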
5477 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
5478 LoadNode->getBasePtr(), MachinePointerInfo());
5479 SDValue Chain = Load.getValue(1);
5480 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
5481 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
5482 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
5483 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
5484 DAG.getConstant(0, DL, MVT::i64));
5485 if (VT == MVT::v4i32)
5486 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
5487 return DAG.getMergeValues({Ext, Chain}, DL);
5490 // Generate SUBS and CSEL for integer abs.
5491 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5492 MVT VT = Op.getSimpleValueType();
5494 if (VT.isVector())
5495 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
5497 SDLoc DL(Op);
5498 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
5499 Op.getOperand(0));
5500 // Generate SUBS & CSEL.
5501 SDValue Cmp =
5502 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
5503 Op.getOperand(0), DAG.getConstant(0, DL, VT));
5504 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
5505 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
5506 Cmp.getValue(1));
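// Custom-lower BRCOND only when the condition can be emitted as a conjunction
// (a compare + conditional-compare chain); otherwise let generic lowering
// expand it.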
5509 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5510 SDValue Chain = Op.getOperand(0);
5511 SDValue Cond = Op.getOperand(1);
5512 SDValue Dest = Op.getOperand(2);
5514 AArch64CC::CondCode CC;
5515 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
5516 SDLoc dl(Op);
5517 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
5518 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5519 Cmp);
5522 return SDValue();
5525 SDValue AArch64TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
5526 assert(Op->getOpcode() == ISD::ZERO_EXTEND && "Expected ZERO_EXTEND");
5528 if (Op.getValueType().isFixedLengthVector())
5529 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
5531 // Try to lower to VSELECT to allow zext to transform into
5532 // a predicated instruction like add, sub or mul.
5533 SDValue Value = Op->getOperand(0);
5534 if (!Value->getValueType(0).isScalableVector() ||
5535 Value->getValueType(0).getScalarType() != MVT::i1)
5536 return SDValue();
5538 SDLoc DL = SDLoc(Op);
5539 EVT VT = Op->getValueType(0);
5540 SDValue Ones = DAG.getConstant(1, DL, VT);
5541 SDValue Zeros = DAG.getConstant(0, DL, VT);
5542 return DAG.getNode(ISD::VSELECT, DL, VT, Value, Ones, Zeros);
5545 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
5546 SelectionDAG &DAG) const {
5547 LLVM_DEBUG(dbgs() << "Custom lowering: ");
5548 LLVM_DEBUG(Op.dump());
5550 switch (Op.getOpcode()) {
5551 default:
5552 llvm_unreachable("unimplemented operand");
5553 return SDValue();
5554 case ISD::BITCAST:
5555 return LowerBITCAST(Op, DAG);
5556 case ISD::GlobalAddress:
5557 return LowerGlobalAddress(Op, DAG);
5558 case ISD::GlobalTLSAddress:
5559 return LowerGlobalTLSAddress(Op, DAG);
5560 case ISD::SETCC:
5561 case ISD::STRICT_FSETCC:
5562 case ISD::STRICT_FSETCCS:
5563 return LowerSETCC(Op, DAG);
5564 case ISD::SETCCCARRY:
5565 return LowerSETCCCARRY(Op, DAG);
5566 case ISD::BRCOND:
5567 return LowerBRCOND(Op, DAG);
5568 case ISD::BR_CC:
5569 return LowerBR_CC(Op, DAG);
5570 case ISD::SELECT:
5571 return LowerSELECT(Op, DAG);
5572 case ISD::SELECT_CC:
5573 return LowerSELECT_CC(Op, DAG);
5574 case ISD::JumpTable:
5575 return LowerJumpTable(Op, DAG);
5576 case ISD::BR_JT:
5577 return LowerBR_JT(Op, DAG);
5578 case ISD::ConstantPool:
5579 return LowerConstantPool(Op, DAG);
5580 case ISD::BlockAddress:
5581 return LowerBlockAddress(Op, DAG);
5582 case ISD::VASTART:
5583 return LowerVASTART(Op, DAG);
5584 case ISD::VACOPY:
5585 return LowerVACOPY(Op, DAG);
5586 case ISD::VAARG:
5587 return LowerVAARG(Op, DAG);
5588 case ISD::ADDCARRY:
5589 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
5590 case ISD::SUBCARRY:
5591 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
5592 case ISD::SADDO_CARRY:
5593 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
5594 case ISD::SSUBO_CARRY:
5595 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
5596 case ISD::SADDO:
5597 case ISD::UADDO:
5598 case ISD::SSUBO:
5599 case ISD::USUBO:
5600 case ISD::SMULO:
5601 case ISD::UMULO:
5602 return LowerXALUO(Op, DAG);
5603 case ISD::FADD:
5604 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
5605 case ISD::FSUB:
5606 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
5607 case ISD::FMUL:
5608 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
5609 case ISD::FMA:
5610 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
5611 case ISD::FDIV:
5612 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
5613 case ISD::FNEG:
5614 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
5615 case ISD::FCEIL:
5616 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
5617 case ISD::FFLOOR:
5618 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
5619 case ISD::FNEARBYINT:
5620 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
5621 case ISD::FRINT:
5622 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
5623 case ISD::FROUND:
5624 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
5625 case ISD::FROUNDEVEN:
5626 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
5627 case ISD::FTRUNC:
5628 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
5629 case ISD::FSQRT:
5630 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
5631 case ISD::FABS:
5632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
5633 case ISD::FP_ROUND:
5634 case ISD::STRICT_FP_ROUND:
5635 return LowerFP_ROUND(Op, DAG);
5636 case ISD::FP_EXTEND:
5637 return LowerFP_EXTEND(Op, DAG);
5638 case ISD::FRAMEADDR:
5639 return LowerFRAMEADDR(Op, DAG);
5640 case ISD::SPONENTRY:
5641 return LowerSPONENTRY(Op, DAG);
5642 case ISD::RETURNADDR:
5643 return LowerRETURNADDR(Op, DAG);
5644 case ISD::ADDROFRETURNADDR:
5645 return LowerADDROFRETURNADDR(Op, DAG);
5646 case ISD::CONCAT_VECTORS:
5647 return LowerCONCAT_VECTORS(Op, DAG);
5648 case ISD::INSERT_VECTOR_ELT:
5649 return LowerINSERT_VECTOR_ELT(Op, DAG);
5650 case ISD::EXTRACT_VECTOR_ELT:
5651 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5652 case ISD::BUILD_VECTOR:
5653 return LowerBUILD_VECTOR(Op, DAG);
5654 case ISD::VECTOR_SHUFFLE:
5655 return LowerVECTOR_SHUFFLE(Op, DAG);
5656 case ISD::SPLAT_VECTOR:
5657 return LowerSPLAT_VECTOR(Op, DAG);
5658 case ISD::EXTRACT_SUBVECTOR:
5659 return LowerEXTRACT_SUBVECTOR(Op, DAG);
5660 case ISD::INSERT_SUBVECTOR:
5661 return LowerINSERT_SUBVECTOR(Op, DAG);
5662 case ISD::SDIV:
5663 case ISD::UDIV:
5664 return LowerDIV(Op, DAG);
5665 case ISD::SMIN:
5666 case ISD::UMIN:
5667 case ISD::SMAX:
5668 case ISD::UMAX:
5669 return LowerMinMax(Op, DAG);
5670 case ISD::SRA:
5671 case ISD::SRL:
5672 case ISD::SHL:
5673 return LowerVectorSRA_SRL_SHL(Op, DAG);
5674 case ISD::SHL_PARTS:
5675 case ISD::SRL_PARTS:
5676 case ISD::SRA_PARTS:
5677 return LowerShiftParts(Op, DAG);
5678 case ISD::CTPOP:
5679 case ISD::PARITY:
5680 return LowerCTPOP_PARITY(Op, DAG);
5681 case ISD::FCOPYSIGN:
5682 return LowerFCOPYSIGN(Op, DAG);
5683 case ISD::OR:
5684 return LowerVectorOR(Op, DAG);
5685 case ISD::XOR:
5686 return LowerXOR(Op, DAG);
5687 case ISD::PREFETCH:
5688 return LowerPREFETCH(Op, DAG);
5689 case ISD::SINT_TO_FP:
5690 case ISD::UINT_TO_FP:
5691 case ISD::STRICT_SINT_TO_FP:
5692 case ISD::STRICT_UINT_TO_FP:
5693 return LowerINT_TO_FP(Op, DAG);
5694 case ISD::FP_TO_SINT:
5695 case ISD::FP_TO_UINT:
5696 case ISD::STRICT_FP_TO_SINT:
5697 case ISD::STRICT_FP_TO_UINT:
5698 return LowerFP_TO_INT(Op, DAG);
5699 case ISD::FP_TO_SINT_SAT:
5700 case ISD::FP_TO_UINT_SAT:
5701 return LowerFP_TO_INT_SAT(Op, DAG);
5702 case ISD::FSINCOS:
5703 return LowerFSINCOS(Op, DAG);
5704 case ISD::FLT_ROUNDS_:
5705 return LowerFLT_ROUNDS_(Op, DAG);
5706 case ISD::SET_ROUNDING:
5707 return LowerSET_ROUNDING(Op, DAG);
5708 case ISD::MUL:
5709 return LowerMUL(Op, DAG);
5710 case ISD::MULHS:
5711 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
5712 case ISD::MULHU:
5713 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
5714 case ISD::INTRINSIC_VOID:
5715 case ISD::INTRINSIC_W_CHAIN:
5716 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5717 case ISD::INTRINSIC_WO_CHAIN:
5718 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5719 case ISD::ATOMIC_STORE:
5720 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
5721 assert(Subtarget->hasLSE2());
5722 return LowerStore128(Op, DAG);
5724 return SDValue();
5725 case ISD::STORE:
5726 return LowerSTORE(Op, DAG);
5727 case ISD::MSTORE:
5728 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
5729 case ISD::MGATHER:
5730 return LowerMGATHER(Op, DAG);
5731 case ISD::MSCATTER:
5732 return LowerMSCATTER(Op, DAG);
5733 case ISD::VECREDUCE_SEQ_FADD:
5734 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
5735 case ISD::VECREDUCE_ADD:
5736 case ISD::VECREDUCE_AND:
5737 case ISD::VECREDUCE_OR:
5738 case ISD::VECREDUCE_XOR:
5739 case ISD::VECREDUCE_SMAX:
5740 case ISD::VECREDUCE_SMIN:
5741 case ISD::VECREDUCE_UMAX:
5742 case ISD::VECREDUCE_UMIN:
5743 case ISD::VECREDUCE_FADD:
5744 case ISD::VECREDUCE_FMAX:
5745 case ISD::VECREDUCE_FMIN:
5746 return LowerVECREDUCE(Op, DAG);
5747 case ISD::ATOMIC_LOAD_SUB:
5748 return LowerATOMIC_LOAD_SUB(Op, DAG);
5749 case ISD::ATOMIC_LOAD_AND:
5750 return LowerATOMIC_LOAD_AND(Op, DAG);
5751 case ISD::DYNAMIC_STACKALLOC:
5752 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5753 case ISD::VSCALE:
5754 return LowerVSCALE(Op, DAG);
5755 case ISD::ANY_EXTEND:
5756 case ISD::SIGN_EXTEND:
5757 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
5758 case ISD::ZERO_EXTEND:
5759 return LowerZERO_EXTEND(Op, DAG);
5760 case ISD::SIGN_EXTEND_INREG: {
5761 // Only custom lower when ExtraVT has a legal byte-based element type.
5762 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5763 EVT ExtraEltVT = ExtraVT.getVectorElementType();
5764 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
5765 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
5766 return SDValue();
5768 return LowerToPredicatedOp(Op, DAG,
5769 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
5771 case ISD::TRUNCATE:
5772 return LowerTRUNCATE(Op, DAG);
5773 case ISD::MLOAD:
5774 return LowerMLOAD(Op, DAG);
5775 case ISD::LOAD:
5776 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
5777 Subtarget->forceStreamingCompatibleSVE()))
5778 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
5779 return LowerLOAD(Op, DAG);
5780 case ISD::ADD:
5781 case ISD::AND:
5782 case ISD::SUB:
5783 return LowerToScalableOp(Op, DAG);
5784 case ISD::FMAXIMUM:
5785 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
5786 case ISD::FMAXNUM:
5787 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
5788 case ISD::FMINIMUM:
5789 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
5790 case ISD::FMINNUM:
5791 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
5792 case ISD::VSELECT:
5793 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
5794 case ISD::ABS:
5795 return LowerABS(Op, DAG);
5796 case ISD::ABDS:
5797 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
5798 case ISD::ABDU:
5799 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
5800 case ISD::BITREVERSE:
5801 return LowerBitreverse(Op, DAG);
5802 case ISD::BSWAP:
5803 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
5804 case ISD::CTLZ:
5805 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
5806 case ISD::CTTZ:
5807 return LowerCTTZ(Op, DAG);
5808 case ISD::VECTOR_SPLICE:
5809 return LowerVECTOR_SPLICE(Op, DAG);
5810 case ISD::STRICT_LROUND:
5811 case ISD::STRICT_LLROUND:
5812 case ISD::STRICT_LRINT:
5813 case ISD::STRICT_LLRINT: {
5814 assert(Op.getOperand(1).getValueType() == MVT::f16 &&
5815 "Expected custom lowering of rounding operations only for f16");
5816 SDLoc DL(Op);
5817 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5818 {Op.getOperand(0), Op.getOperand(1)});
5819 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5820 {Ext.getValue(1), Ext.getValue(0)});
5825 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
5826 return !Subtarget->useSVEForFixedLengthVectors();
5829 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
5830 EVT VT, bool OverrideNEON) const {
5831 if (!VT.isFixedLengthVector() || !VT.isSimple())
5832 return false;
5834 // Don't use SVE for vectors we cannot scalarize if required.
5835 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
5836 // Fixed length predicates should be promoted to i8.
5837 // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) works.
5838 case MVT::i1:
5839 default:
5840 return false;
5841 case MVT::i8:
5842 case MVT::i16:
5843 case MVT::i32:
5844 case MVT::i64:
5845 case MVT::f16:
5846 case MVT::f32:
5847 case MVT::f64:
5848 break;
5851 // All SVE implementations support NEON sized vectors.
5852 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5853 return Subtarget->hasSVE();
5855 // Ensure NEON MVTs only belong to a single register class.
5856 if (VT.getFixedSizeInBits() <= 128)
5857 return false;
5859 // Ensure wider than NEON code generation is enabled.
5860 if (!Subtarget->useSVEForFixedLengthVectors())
5861 return false;
5863 // Don't use SVE for types that don't fit.
5864 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5865 return false;
5867 // TODO: Perhaps an artificial restriction, but worth having whilst getting
5868 // the base fixed length SVE support in place.
5869 if (!VT.isPow2VectorType())
5870 return false;
5872 return true;
5875 //===----------------------------------------------------------------------===//
5876 // Calling Convention Implementation
5877 //===----------------------------------------------------------------------===//
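/// Return the intrinsic ID carried by an ISD::INTRINSIC_WO_CHAIN node, or
/// Intrinsic::not_intrinsic for any other node.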
5879 static unsigned getIntrinsicID(const SDNode *N) {
5880 unsigned Opcode = N->getOpcode();
5881 switch (Opcode) {
5882 default:
5883 return Intrinsic::not_intrinsic;
5884 case ISD::INTRINSIC_WO_CHAIN: {
5885 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5886 if (IID < Intrinsic::num_intrinsics)
5887 return IID;
5888 return Intrinsic::not_intrinsic;
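/// Avoid reassociating an expression whose operand is a [su]mull feeding an
/// add, since that pattern can otherwise be matched to smlal/umlal.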
5893 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
5894 SDValue N1) const {
5895 if (!N0.hasOneUse())
5896 return false;
5898 unsigned IID = getIntrinsicID(N1.getNode());
5899 // Avoid reassociating expressions that can be lowered to smlal/umlal.
5900 if (IID == Intrinsic::aarch64_neon_umull ||
5901 N1.getOpcode() == AArch64ISD::UMULL ||
5902 IID == Intrinsic::aarch64_neon_smull ||
5903 N1.getOpcode() == AArch64ISD::SMULL)
5904 return N0.getOpcode() != ISD::ADD;
5906 return true;
5909 /// Selects the correct CCAssignFn for a given CallingConvention value.
5910 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5911 bool IsVarArg) const {
5912 switch (CC) {
5913 default:
5914 report_fatal_error("Unsupported calling convention.");
5915 case CallingConv::WebKit_JS:
5916 return CC_AArch64_WebKit_JS;
5917 case CallingConv::GHC:
5918 return CC_AArch64_GHC;
5919 case CallingConv::C:
5920 case CallingConv::Fast:
5921 case CallingConv::PreserveMost:
5922 case CallingConv::CXX_FAST_TLS:
5923 case CallingConv::Swift:
5924 case CallingConv::SwiftTail:
5925 case CallingConv::Tail:
5926 if (Subtarget->isTargetWindows() && IsVarArg) {
5927 if (Subtarget->isWindowsArm64EC())
5928 return CC_AArch64_Arm64EC_VarArg;
5929 return CC_AArch64_Win64_VarArg;
5931 if (!Subtarget->isTargetDarwin())
5932 return CC_AArch64_AAPCS;
5933 if (!IsVarArg)
5934 return CC_AArch64_DarwinPCS;
5935 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5936 : CC_AArch64_DarwinPCS_VarArg;
5937 case CallingConv::Win64:
5938 if (IsVarArg) {
5939 if (Subtarget->isWindowsArm64EC())
5940 return CC_AArch64_Arm64EC_VarArg;
5941 return CC_AArch64_Win64_VarArg;
5943 return CC_AArch64_AAPCS;
5944 case CallingConv::CFGuard_Check:
5945 return CC_AArch64_Win64_CFGuard_Check;
5946 case CallingConv::AArch64_VectorCall:
5947 case CallingConv::AArch64_SVE_VectorCall:
5948 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
5949 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
5950 return CC_AArch64_AAPCS;
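/// Selects the correct return-value CCAssignFn for a given CallingConvention
/// value.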
5954 CCAssignFn *
5955 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5956 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5957 : RetCC_AArch64_AAPCS;
5961 /// Returns true if the Function has ZA state and contains at least one call to
5962 /// a function that requires setting up a lazy-save buffer.
5963 static bool requiresBufferForLazySave(const Function &F) {
5964 SMEAttrs CallerAttrs(F);
5965 if (!CallerAttrs.hasZAState())
5966 return false;
5968 for (const BasicBlock &BB : F)
5969 for (const Instruction &I : BB)
5970 if (const CallInst *Call = dyn_cast<CallInst>(&I))
5971 if (CallerAttrs.requiresLazySave(SMEAttrs(*Call)))
5972 return true;
5973 return false;
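/// Allocate a worst-case SVL.B * SVL.B lazy-save buffer on the stack, store
/// its address, and create an additional 16-byte TPIDR2 stack object. Returns
/// the frame index of the TPIDR2 object; Reg receives a virtual register
/// holding the buffer address.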
5976 unsigned AArch64TargetLowering::allocateLazySaveBuffer(
5977 SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG, Register &Reg) const {
5978 MachineFunction &MF = DAG.getMachineFunction();
5979 MachineFrameInfo &MFI = MF.getFrameInfo();
5981 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
5982 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5983 DAG.getConstant(1, DL, MVT::i32));
5984 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
5985 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
5986 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
5987 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
5988 unsigned FI = MFI.CreateVariableSizedObject(Align(1), nullptr);
5989 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
5990 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, Reg, Buffer.getValue(0));
5992 // Allocate an additional TPIDR2 object on the stack (16 bytes)
5993 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
5995 // Store the buffer pointer to the TPIDR2 stack object.
5996 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, FI);
5997 SDValue Ptr = DAG.getFrameIndex(
5998 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
5999 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6001 return TPIDR2Obj;
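/// Lower the incoming (formal) arguments, described by the Ins array, into the
/// specified DAG, appending one SDValue per argument to InVals.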
6004 SDValue AArch64TargetLowering::LowerFormalArguments(
6005 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6006 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6007 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6008 MachineFunction &MF = DAG.getMachineFunction();
6009 const Function &F = MF.getFunction();
6010 MachineFrameInfo &MFI = MF.getFrameInfo();
6011 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6012 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6014 SmallVector<ISD::OutputArg, 4> Outs;
6015 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6016 DAG.getTargetLoweringInfo(), MF.getDataLayout());
6017 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6018 FuncInfo->setIsSVECC(true);
6020 // Assign locations to all of the incoming arguments.
6021 SmallVector<CCValAssign, 16> ArgLocs;
6022 DenseMap<unsigned, SDValue> CopiedRegs;
6023 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6025 // At this point, Ins[].VT may already be promoted to i32. To correctly
6026 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6027 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6028 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6029 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6030 // LocVT.
6031 unsigned NumArgs = Ins.size();
6032 Function::const_arg_iterator CurOrigArg = F.arg_begin();
6033 unsigned CurArgIdx = 0;
6034 for (unsigned i = 0; i != NumArgs; ++i) {
6035 MVT ValVT = Ins[i].VT;
6036 if (Ins[i].isOrigArg()) {
6037 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
6038 CurArgIdx = Ins[i].getOrigArgIndex();
6040 // Get type of the original argument.
6041 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
6042 /*AllowUnknown*/ true);
6043 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
6044 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6045 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6046 ValVT = MVT::i8;
6047 else if (ActualMVT == MVT::i16)
6048 ValVT = MVT::i16;
6050 bool UseVarArgCC = false;
6051 if (IsWin64)
6052 UseVarArgCC = isVarArg;
6053 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
6054 bool Res =
6055 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
6056 assert(!Res && "Call operand has unhandled type");
6057 (void)Res;
6060 SMEAttrs Attrs(MF.getFunction());
6061 bool IsLocallyStreaming =
6062 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
6063 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
6064 SDValue Glue = Chain.getValue(1);
6066 SmallVector<SDValue, 16> ArgValues;
6067 unsigned ExtraArgLocs = 0;
6068 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
6069 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6071 if (Ins[i].Flags.isByVal()) {
6072 // Byval is used for HFAs in the PCS, but the system should work in a
6073 // non-compliant manner for larger structs.
6074 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6075 int Size = Ins[i].Flags.getByValSize();
6076 unsigned NumRegs = (Size + 7) / 8;
6078 // FIXME: This works on big-endian for composite byvals, which are the common
6079 // case. It should also work for fundamental types.
6080 unsigned FrameIdx =
6081 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
6082 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
6083 InVals.push_back(FrameIdxN);
6085 continue;
6088 if (Ins[i].Flags.isSwiftAsync())
6089 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6091 SDValue ArgValue;
6092 if (VA.isRegLoc()) {
6093 // Arguments stored in registers.
6094 EVT RegVT = VA.getLocVT();
6095 const TargetRegisterClass *RC;
6097 if (RegVT == MVT::i32)
6098 RC = &AArch64::GPR32RegClass;
6099 else if (RegVT == MVT::i64)
6100 RC = &AArch64::GPR64RegClass;
6101 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
6102 RC = &AArch64::FPR16RegClass;
6103 else if (RegVT == MVT::f32)
6104 RC = &AArch64::FPR32RegClass;
6105 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
6106 RC = &AArch64::FPR64RegClass;
6107 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
6108 RC = &AArch64::FPR128RegClass;
6109 else if (RegVT.isScalableVector() &&
6110 RegVT.getVectorElementType() == MVT::i1) {
6111 FuncInfo->setIsSVECC(true);
6112 RC = &AArch64::PPRRegClass;
6113 } else if (RegVT.isScalableVector()) {
6114 FuncInfo->setIsSVECC(true);
6115 RC = &AArch64::ZPRRegClass;
6116 } else
6117 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
6119 // Transform the arguments in physical registers into virtual ones.
6120 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
6122 if (IsLocallyStreaming) {
6123 // LocallyStreamingFunctions must insert the SMSTART in the correct
6124 // position, so we use Glue to ensure no instructions can be scheduled
6125 // between the chain of:
6126 // t0: ch,glue = EntryNode
6127 // t1: res,ch,glue = CopyFromReg
6128 // ...
6129 // tn: res,ch,glue = CopyFromReg t(n-1), ..
6130 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
6131 // ^^^^^^
6132 // This will be the new Chain/Root node.
6133 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
6134 Glue = ArgValue.getValue(2);
6135 } else
6136 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
6138 // If this is an 8, 16 or 32-bit value, it is really passed promoted
6139 // to 64 bits. Insert an assert[sz]ext to capture this, then
6140 // truncate to the right size.
6141 switch (VA.getLocInfo()) {
6142 default:
6143 llvm_unreachable("Unknown loc info!");
6144 case CCValAssign::Full:
6145 break;
6146 case CCValAssign::Indirect:
6147 assert((VA.getValVT().isScalableVector() ||
6148 Subtarget->isWindowsArm64EC()) &&
6149 "Indirect arguments should be scalable on most subtargets");
6150 break;
6151 case CCValAssign::BCvt:
6152 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
6153 break;
6154 case CCValAssign::AExt:
6155 case CCValAssign::SExt:
6156 case CCValAssign::ZExt:
6157 break;
6158 case CCValAssign::AExtUpper:
6159 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
6160 DAG.getConstant(32, DL, RegVT));
6161 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
6162 break;
6164 } else { // VA.isRegLoc()
6165 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
6166 unsigned ArgOffset = VA.getLocMemOffset();
6167 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
6168 ? VA.getLocVT().getSizeInBits()
6169 : VA.getValVT().getSizeInBits()) / 8;
6171 uint32_t BEAlign = 0;
6172 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
6173 !Ins[i].Flags.isInConsecutiveRegs())
6174 BEAlign = 8 - ArgSize;
6176 SDValue FIN;
6177 MachinePointerInfo PtrInfo;
6178 if (isVarArg && Subtarget->isWindowsArm64EC()) {
6179 // In the ARM64EC varargs convention, fixed arguments on the stack are
6180 // accessed relative to x4, not sp.
6181 unsigned ObjOffset = ArgOffset + BEAlign;
6182 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6183 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6184 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
6185 DAG.getConstant(ObjOffset, DL, MVT::i64));
6186 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
6187 } else {
6188 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
6190 // Create load nodes to retrieve arguments from the stack.
6191 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
6192 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
6195 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
6196 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
6197 MVT MemVT = VA.getValVT();
6199 switch (VA.getLocInfo()) {
6200 default:
6201 break;
6202 case CCValAssign::Trunc:
6203 case CCValAssign::BCvt:
6204 MemVT = VA.getLocVT();
6205 break;
6206 case CCValAssign::Indirect:
6207 assert((VA.getValVT().isScalableVector() ||
6208 Subtarget->isWindowsArm64EC()) &&
6209 "Indirect arguments should be scalable on most subtargets");
6210 MemVT = VA.getLocVT();
6211 break;
6212 case CCValAssign::SExt:
6213 ExtType = ISD::SEXTLOAD;
6214 break;
6215 case CCValAssign::ZExt:
6216 ExtType = ISD::ZEXTLOAD;
6217 break;
6218 case CCValAssign::AExt:
6219 ExtType = ISD::EXTLOAD;
6220 break;
6223 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
6224 MemVT);
6227 if (VA.getLocInfo() == CCValAssign::Indirect) {
6228 assert(
6229 (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
6230 "Indirect arguments should be scalable on most subtargets");
6232 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
6233 unsigned NumParts = 1;
6234 if (Ins[i].Flags.isInConsecutiveRegs()) {
6235 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
6236 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
6237 ++NumParts;
6240 MVT PartLoad = VA.getValVT();
6241 SDValue Ptr = ArgValue;
6243 // Ensure we generate all loads for each tuple part, whilst updating the
6244 // pointer after each load correctly using vscale.
6245 while (NumParts > 0) {
6246 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
6247 InVals.push_back(ArgValue);
6248 NumParts--;
6249 if (NumParts > 0) {
6250 SDValue BytesIncrement;
6251 if (PartLoad.isScalableVector()) {
6252 BytesIncrement = DAG.getVScale(
6253 DL, Ptr.getValueType(),
6254 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
6255 } else {
6256 BytesIncrement = DAG.getConstant(
6257 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
6258 Ptr.getValueType());
6260 SDNodeFlags Flags;
6261 Flags.setNoUnsignedWrap(true);
6262 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6263 BytesIncrement, Flags);
6264 ExtraArgLocs++;
6265 i++;
6268 } else {
6269 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
6270 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
6271 ArgValue, DAG.getValueType(MVT::i32));
6273 // i1 arguments are zero-extended to i8 by the caller. Emit a
6274 // hint to reflect this.
6275 if (Ins[i].isOrigArg()) {
6276 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
6277 if (OrigArg->getType()->isIntegerTy(1)) {
6278 if (!Ins[i].Flags.isZExt()) {
6279 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
6280 ArgValue.getValueType(), ArgValue);
6285 InVals.push_back(ArgValue);
6288 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
6290 // Insert the SMSTART if this is a locally streaming function and
6291 // make sure it is Glued to the last CopyFromReg value.
6292 if (IsLocallyStreaming) {
6293 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6294 Chain = DAG.getNode(
6295 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6296 {DAG.getRoot(),
6297 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
6298 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
6299 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue});
6300 // Ensure that the SMSTART happens after the CopyWithChain such that its
6301 // chain result is used.
6302 for (unsigned I=0; I<InVals.size(); ++I) {
6303 Register Reg = MF.getRegInfo().createVirtualRegister(
6304 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
6305 SDValue X = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
6306 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
6307 InVals[I].getValueType());
6311 // varargs
6312 if (isVarArg) {
6313 if (!Subtarget->isTargetDarwin() || IsWin64) {
6314 // The AAPCS variadic function ABI is identical to the non-variadic
6315 // one. As a result there may be more arguments in registers and we should
6316 // save them for future reference.
6317 // Win64 variadic functions also pass arguments in registers, but all float
6318 // arguments are passed in integer registers.
6319 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
6322 // This will point to the next argument passed via stack.
6323 unsigned StackOffset = CCInfo.getNextStackOffset();
6324 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6325 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
6326 FuncInfo->setVarArgsStackOffset(StackOffset);
6327 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
6329 if (MFI.hasMustTailInVarArgFunc()) {
6330 SmallVector<MVT, 2> RegParmTypes;
6331 RegParmTypes.push_back(MVT::i64);
6332 RegParmTypes.push_back(MVT::f128);
6333 // Compute the set of forwarded registers. The rest are scratch.
6334 SmallVectorImpl<ForwardedRegister> &Forwards =
6335 FuncInfo->getForwardedMustTailRegParms();
6336 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
6337 CC_AArch64_AAPCS);
6339 // Conservatively forward X8, since it might be used for aggregate return.
6340 if (!CCInfo.isAllocated(AArch64::X8)) {
6341 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
6342 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
6347 // On Windows, InReg pointers must be returned, so record the pointer in a
6348 // virtual register at the start of the function so it can be returned in the
6349 // epilogue.
6350 if (IsWin64) {
6351 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
6352 if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) {
6353 assert(!FuncInfo->getSRetReturnReg());
6355 MVT PtrTy = getPointerTy(DAG.getDataLayout());
6356 Register Reg =
6357 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
6358 FuncInfo->setSRetReturnReg(Reg);
6360 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
6361 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
6362 break;
6367 unsigned StackArgSize = CCInfo.getNextStackOffset();
6368 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6369 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
6370 // This is a non-standard ABI so by fiat I say we're allowed to make full
6371 // use of the stack area to be popped, which must be aligned to 16 bytes in
6372 // any case:
6373 StackArgSize = alignTo(StackArgSize, 16);
6375 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
6376 // a multiple of 16.
6377 FuncInfo->setArgumentStackToRestore(StackArgSize);
6379 // This realignment carries over to the available bytes below. Our own
6380 // callers will guarantee the space is free by giving an aligned value to
6381 // CALLSEQ_START.
6383 // Even if we're not expected to free up the space, it's useful to know how
6384 // much is there while considering tail calls (because we can reuse it).
6385 FuncInfo->setBytesInStackArgArea(StackArgSize);
6387 if (Subtarget->hasCustomCallingConv())
6388 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
6390 if (requiresBufferForLazySave(MF.getFunction())) {
6391 // Set up a buffer once and store the buffer in the MachineFunctionInfo.
6392 Register Reg;
6393 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG, Reg);
6394 FuncInfo->setLazySaveBufferReg(Reg);
6395 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
6398 return Chain;
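/// Spill the remaining unallocated GPR (x0-x7, or x0-x3 for Arm64EC) and FPR
/// (q0-q7) argument registers to the stack so that va_start can find them.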
6401 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
6402 SelectionDAG &DAG,
6403 const SDLoc &DL,
6404 SDValue &Chain) const {
6405 MachineFunction &MF = DAG.getMachineFunction();
6406 MachineFrameInfo &MFI = MF.getFrameInfo();
6407 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6408 auto PtrVT = getPointerTy(DAG.getDataLayout());
6409 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
6411 SmallVector<SDValue, 8> MemOps;
6413 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
6414 AArch64::X3, AArch64::X4, AArch64::X5,
6415 AArch64::X6, AArch64::X7 };
6416 unsigned NumGPRArgRegs = std::size(GPRArgRegs);
6417 if (Subtarget->isWindowsArm64EC()) {
6418 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
6419 // functions.
6420 NumGPRArgRegs = 4;
6422 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
6424 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
6425 int GPRIdx = 0;
6426 if (GPRSaveSize != 0) {
6427 if (IsWin64) {
6428 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
6429 if (GPRSaveSize & 15)
6430 // The extra size here, if triggered, will always be 8.
6431 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
6432 } else
6433 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
6435 SDValue FIN;
6436 if (Subtarget->isWindowsArm64EC()) {
6437 // With the Arm64EC ABI, we reserve the save area as usual, but we
6438 // compute its address relative to x4. For a normal AArch64->AArch64
6439 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
6440 // different address.
6441 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
6442 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6443 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
6444 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
6445 } else {
6446 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
6449 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
6450 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
6451 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6452 SDValue Store =
6453 DAG.getStore(Val.getValue(1), DL, Val, FIN,
6454 IsWin64 ? MachinePointerInfo::getFixedStack(
6455 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
6456 : MachinePointerInfo::getStack(MF, i * 8));
6457 MemOps.push_back(Store);
6458 FIN =
6459 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
6462 FuncInfo->setVarArgsGPRIndex(GPRIdx);
6463 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
6465 if (Subtarget->hasFPARMv8() && !IsWin64) {
6466 static const MCPhysReg FPRArgRegs[] = {
6467 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
6468 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
6469 static const unsigned NumFPRArgRegs = std::size(FPRArgRegs);
6470 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
6472 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
6473 int FPRIdx = 0;
6474 if (FPRSaveSize != 0) {
6475 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
6477 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
6479 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
6480 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
6481 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
6483 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
6484 MachinePointerInfo::getStack(MF, i * 16));
6485 MemOps.push_back(Store);
6486 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
6487 DAG.getConstant(16, DL, PtrVT));
6490 FuncInfo->setVarArgsFPRIndex(FPRIdx);
6491 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
6494 if (!MemOps.empty()) {
6495 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
6499 /// LowerCallResult - Lower the result values of a call into the
6500 /// appropriate copies out of appropriate physical registers.
6501 SDValue AArch64TargetLowering::LowerCallResult(
6502 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
6503 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
6504 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
6505 SDValue ThisVal) const {
6506 DenseMap<unsigned, SDValue> CopiedRegs;
6507 // Copy all of the result registers out of their specified physreg.
6508 for (unsigned i = 0; i != RVLocs.size(); ++i) {
6509 CCValAssign VA = RVLocs[i];
6511 // Pass 'this' value directly from the argument to return value, to avoid
6512 // reg unit interference
6513 if (i == 0 && isThisReturn) {
6514 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
6515 "unexpected return calling convention register assignment");
6516 InVals.push_back(ThisVal);
6517 continue;
6520 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
6521 // allows one use of a physreg per block.
6522 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
6523 if (!Val) {
6524 Val =
6525 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
6526 Chain = Val.getValue(1);
6527 InFlag = Val.getValue(2);
6528 CopiedRegs[VA.getLocReg()] = Val;
6531 switch (VA.getLocInfo()) {
6532 default:
6533 llvm_unreachable("Unknown loc info!");
6534 case CCValAssign::Full:
6535 break;
6536 case CCValAssign::BCvt:
6537 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
6538 break;
6539 case CCValAssign::AExtUpper:
6540 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
6541 DAG.getConstant(32, DL, VA.getLocVT()));
6542 [[fallthrough]];
6543 case CCValAssign::AExt:
6544 [[fallthrough]];
6545 case CCValAssign::ZExt:
6546 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
6547 break;
6550 InVals.push_back(Val);
6553 return Chain;
6556 /// Return true if the calling convention is one that we can guarantee TCO for.
6557 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
6558 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
6559 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
6562 /// Return true if we might ever do TCO for calls with this calling convention.
6563 static bool mayTailCallThisCC(CallingConv::ID CC) {
6564 switch (CC) {
6565 case CallingConv::C:
6566 case CallingConv::AArch64_SVE_VectorCall:
6567 case CallingConv::PreserveMost:
6568 case CallingConv::Swift:
6569 case CallingConv::SwiftTail:
6570 case CallingConv::Tail:
6571 case CallingConv::Fast:
6572 return true;
6573 default:
6574 return false;
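/// Run the calling-convention assignment over a call's outgoing arguments,
/// switching to the vararg CC where required (all arguments of a Win64 vararg
/// call, or only the variadic arguments elsewhere).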
6578 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
6579 const AArch64Subtarget *Subtarget,
6580 const TargetLowering::CallLoweringInfo &CLI,
6581 CCState &CCInfo) {
6582 const SelectionDAG &DAG = CLI.DAG;
6583 CallingConv::ID CalleeCC = CLI.CallConv;
6584 bool IsVarArg = CLI.IsVarArg;
6585 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6586 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
6588 unsigned NumArgs = Outs.size();
6589 for (unsigned i = 0; i != NumArgs; ++i) {
6590 MVT ArgVT = Outs[i].VT;
6591 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6593 bool UseVarArgCC = false;
6594 if (IsVarArg) {
6595 // On Windows, the fixed arguments in a vararg call are passed in GPRs
6596 // too, so use the vararg CC to force them to integer registers.
6597 if (IsCalleeWin64) {
6598 UseVarArgCC = true;
6599 } else {
6600 UseVarArgCC = !Outs[i].IsFixed;
6602 } else {
6603 // Get type of the original argument.
6604 EVT ActualVT =
6605 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
6606 /*AllowUnknown*/ true);
6607 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
6608 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6609 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6610 ArgVT = MVT::i8;
6611 else if (ActualMVT == MVT::i16)
6612 ArgVT = MVT::i16;
6615 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
6616 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
6617 assert(!Res && "Call operand has unhandled type");
6618 (void)Res;
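/// Check whether the call described by CLI can be lowered as a tail call,
/// given the caller's and callee's calling conventions, preserved registers
/// and argument assignments.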
6622 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
6623 const CallLoweringInfo &CLI) const {
6624 CallingConv::ID CalleeCC = CLI.CallConv;
6625 if (!mayTailCallThisCC(CalleeCC))
6626 return false;
6628 SDValue Callee = CLI.Callee;
6629 bool IsVarArg = CLI.IsVarArg;
6630 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6631 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6632 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6633 const SelectionDAG &DAG = CLI.DAG;
6634 MachineFunction &MF = DAG.getMachineFunction();
6635 const Function &CallerF = MF.getFunction();
6636 CallingConv::ID CallerCC = CallerF.getCallingConv();
6638 // SME Streaming functions are not eligible for TCO as they may require
6639 // the streaming mode or ZA to be restored after returning from the call.
6640 SMEAttrs CallerAttrs(MF.getFunction());
6641 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
6642 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
6643 CallerAttrs.requiresLazySave(CalleeAttrs))
6644 return false;
6646 // Functions using the C or Fast calling convention that have an SVE signature
6647 // preserve more registers and should assume the SVE_VectorCall CC.
6648 // The check for matching callee-saved regs will determine whether it is
6649 // eligible for TCO.
6650 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
6651 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
6652 CallerCC = CallingConv::AArch64_SVE_VectorCall;
6654 bool CCMatch = CallerCC == CalleeCC;
6656 // When using the Windows calling convention on a non-windows OS, we want
6657 // to back up and restore X18 in such functions; we can't do a tail call
6658 // from those functions.
6659 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
6660 CalleeCC != CallingConv::Win64)
6661 return false;
6663 // Byval parameters hand the function a pointer directly into the stack area
6664 // we want to reuse during a tail call. Working around this *is* possible (see
6665 // X86) but less efficient and uglier in LowerCall.
6666 for (Function::const_arg_iterator i = CallerF.arg_begin(),
6667 e = CallerF.arg_end();
6668 i != e; ++i) {
6669 if (i->hasByValAttr())
6670 return false;
6672 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
6673 // In this case, it is necessary to save/restore X0 in the callee. Tail
6674 // call opt interferes with this. So we disable tail call opt when the
6675 // caller has an argument with "inreg" attribute.
6677 // FIXME: Check whether the callee also has an "inreg" argument.
6678 if (i->hasInRegAttr())
6679 return false;
6682 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
6683 return CCMatch;
6685 // Externally-defined functions with weak linkage should not be
6686 // tail-called on AArch64 when the OS does not support dynamic
6687 // pre-emption of symbols, as the AAELF spec requires normal calls
6688 // to undefined weak functions to be replaced with a NOP or jump to the
6689 // next instruction. The behaviour of branch instructions in this
6690 // situation (as used for tail calls) is implementation-defined, so we
6691 // cannot rely on the linker replacing the tail call with a return.
6692 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6693 const GlobalValue *GV = G->getGlobal();
6694 const Triple &TT = getTargetMachine().getTargetTriple();
6695 if (GV->hasExternalWeakLinkage() &&
6696 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
6697 return false;
6700 // Now we search for cases where we can use a tail call without changing the
6701 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
6702 // concept.
6704 // I want anyone implementing a new calling convention to think long and hard
6705 // about this assert.
6706 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
6707 "Unexpected variadic calling convention");
6709 LLVMContext &C = *DAG.getContext();
6710 // Check that the call results are passed in the same way.
6711 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
6712 CCAssignFnForCall(CalleeCC, IsVarArg),
6713 CCAssignFnForCall(CallerCC, IsVarArg)))
6714 return false;
6715 // The callee has to preserve all registers the caller needs to preserve.
6716 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6717 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
6718 if (!CCMatch) {
6719 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
6720 if (Subtarget->hasCustomCallingConv()) {
6721 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
6722 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
6724 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
6725 return false;
6728 // Nothing more to check if the callee is taking no arguments
6729 if (Outs.empty())
6730 return true;
6732 SmallVector<CCValAssign, 16> ArgLocs;
6733 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
6735 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6737 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
6738 // When we are musttail, additional checks have already been done, so we can
6739 // safely skip this check.
6739 // At least two cases here: if caller is fastcc then we can't have any
6740 // memory arguments (we'd be expected to clean up the stack afterwards). If
6741 // caller is C then we could potentially use its argument area.
6743 // FIXME: for now we take the most conservative of these in both cases:
6744 // disallow all variadic memory operands.
6745 for (const CCValAssign &ArgLoc : ArgLocs)
6746 if (!ArgLoc.isRegLoc())
6747 return false;
6750 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6752 // If any of the arguments is passed indirectly, it must be SVE, so the
6753 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
6754 // allocate space on the stack. That is why, when any argument is passed
6755 // indirectly, we explicitly decide here that the call cannot be a tail call.
6756 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
6757 assert((A.getLocInfo() != CCValAssign::Indirect ||
6758 A.getValVT().isScalableVector() ||
6759 Subtarget->isWindowsArm64EC()) &&
6760 "Expected value to be scalable");
6761 return A.getLocInfo() == CCValAssign::Indirect;
6763 return false;
6765 // If the stack arguments for this call do not fit into our own save area then
6766 // the call cannot be made tail.
6767 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
6768 return false;
6770 const MachineRegisterInfo &MRI = MF.getRegInfo();
6771 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
6772 return false;
6774 return true;
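/// Build a TokenFactor that makes stores to the clobbered fixed stack slot
/// (ClobberedFI) depend on any loads from overlapping incoming-argument slots,
/// so that a tail call's argument stores cannot overwrite values that are
/// still to be read.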
6777 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
6778 SelectionDAG &DAG,
6779 MachineFrameInfo &MFI,
6780 int ClobberedFI) const {
6781 SmallVector<SDValue, 8> ArgChains;
6782 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
6783 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
6785 // Include the original chain at the beginning of the list. When this is
6786 // used by target LowerCall hooks, this helps legalize find the
6787 // CALLSEQ_BEGIN node.
6788 ArgChains.push_back(Chain);
6790 // Add a chain value for each stack argument load that overlaps ClobberedFI.
6791 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
6792 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
6793 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
6794 if (FI->getIndex() < 0) {
6795 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
6796 int64_t InLastByte = InFirstByte;
6797 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
6799 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
6800 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
6801 ArgChains.push_back(SDValue(L, 1));
6804 // Build a tokenfactor for all the chains.
6805 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
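/// Return true if the callee pops its own stack arguments: fastcc with
/// GuaranteedTailCallOpt, and the tail/swifttail calling conventions.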
6808 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
6809 bool TailCallOpt) const {
6810 return (CallCC == CallingConv::Fast && TailCallOpt) ||
6811 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
6814 // Check if the value is zero-extended from i1 to i8
6815 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
6816 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
6817 if (SizeInBits < 8)
6818 return false;
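// For a bool zero-extended from i1 to i8, bits 1-7 (mask 0xFE) must be known
// to be zero.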
6820 APInt RequiredZero(SizeInBits, 0xFE);
6821 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
6822 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
6823 return ZExtBool;
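/// Emit a node to change the streaming mode (PSTATE.SM): AArch64ISD::SMSTART
/// when Enable is true, AArch64ISD::SMSTOP otherwise, using the SMSTART/SMSTOP
/// call-preserved register mask.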
6826 SDValue AArch64TargetLowering::changeStreamingMode(
6827 SelectionDAG &DAG, SDLoc DL, bool Enable,
6828 SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const {
6829 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6830 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
6831 SDValue MSROp =
6832 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
6834 SDValue ExpectedSMVal =
6835 DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64);
6836 SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask};
6838 if (InFlag)
6839 Ops.push_back(InFlag);
6841 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
6842 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
6845 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
6846 /// and add input and output parameter nodes.
6847 SDValue
6848 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
6849 SmallVectorImpl<SDValue> &InVals) const {
6850 SelectionDAG &DAG = CLI.DAG;
6851 SDLoc &DL = CLI.DL;
6852 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6853 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6854 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6855 SDValue Chain = CLI.Chain;
6856 SDValue Callee = CLI.Callee;
6857 bool &IsTailCall = CLI.IsTailCall;
6858 CallingConv::ID &CallConv = CLI.CallConv;
6859 bool IsVarArg = CLI.IsVarArg;
6861 MachineFunction &MF = DAG.getMachineFunction();
6862 MachineFunction::CallSiteInfo CSInfo;
6863 bool IsThisReturn = false;
6865 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6866 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6867 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
6868 bool IsSibCall = false;
6869 bool GuardWithBTI = false;
6871 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
6872 !Subtarget->noBTIAtReturnTwice()) {
6873 GuardWithBTI = FuncInfo->branchTargetEnforcement();
6876 // Analyze operands of the call, assigning locations to each operand.
6877 SmallVector<CCValAssign, 16> ArgLocs;
6878 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
6880 if (IsVarArg) {
6881 unsigned NumArgs = Outs.size();
6883 for (unsigned i = 0; i != NumArgs; ++i) {
6884 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
6885 report_fatal_error("Passing SVE types to variadic functions is "
6886 "currently not supported");
6890 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6892 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6893 // Assign locations to each value returned by this call.
6894 SmallVector<CCValAssign, 16> RVLocs;
6895 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
6896 *DAG.getContext());
6897 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
6899 // Check callee args/returns for SVE registers and set calling convention
6900 // accordingly.
6901 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
6902 auto HasSVERegLoc = [](CCValAssign &Loc) {
6903 if (!Loc.isRegLoc())
6904 return false;
6905 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
6906 AArch64::PPRRegClass.contains(Loc.getLocReg());
6908 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
6909 CallConv = CallingConv::AArch64_SVE_VectorCall;
6912 if (IsTailCall) {
6913 // Check if it's really possible to do a tail call.
6914 IsTailCall = isEligibleForTailCallOptimization(CLI);
6916 // A sibling call is one where we're under the usual C ABI and not planning
6917 // to change that but can still do a tail call:
6918 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
6919 CallConv != CallingConv::SwiftTail)
6920 IsSibCall = true;
6922 if (IsTailCall)
6923 ++NumTailCalls;
6926 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
6927 report_fatal_error("failed to perform tail call elimination on a call "
6928 "site marked musttail");
6930 // Get a count of how many bytes are to be pushed on the stack.
6931 unsigned NumBytes = CCInfo.getNextStackOffset();
6933 if (IsSibCall) {
6934 // Since we're not changing the ABI to make this a tail call, the memory
6935 // operands are already available in the caller's incoming argument space.
6936 NumBytes = 0;
6939 // FPDiff is the byte offset of the call's argument area from the callee's.
6940 // Stores to callee stack arguments will be placed in FixedStackSlots offset
6941 // by this amount for a tail call. In a sibling call it must be 0 because the
6942 // caller will deallocate the entire stack and the callee still expects its
6943 // arguments to begin at SP+0. Completely unused for non-tail calls.
6944 int FPDiff = 0;
6946 if (IsTailCall && !IsSibCall) {
6947 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
6949 // Since callee will pop argument stack as a tail call, we must keep the
6950 // popped size 16-byte aligned.
6951 NumBytes = alignTo(NumBytes, 16);
6953 // FPDiff will be negative if this tail call requires more space than we
6954 // would automatically have in our incoming argument space. Positive if we
6955 // can actually shrink the stack.
6956 FPDiff = NumReusableBytes - NumBytes;
6958 // Update the required reserved area if this is the tail call requiring the
6959 // most argument stack space.
6960 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
6961 FuncInfo->setTailCallReservedStack(-FPDiff);
6963 // The stack pointer must be 16-byte aligned at all times it's used for a
6964 // memory operation, which in practice means at *all* times and in
6965 // particular across call boundaries. Therefore our own arguments started at
6966 // a 16-byte aligned SP and the delta applied for the tail call should
6967 // satisfy the same constraint.
6968 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
6971 // Determine whether we need any streaming mode changes.
6972 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
6973 if (CLI.CB)
6974 CalleeAttrs = SMEAttrs(*CLI.CB);
6975 else if (Optional<SMEAttrs> Attrs =
6976 getCalleeAttrsFromExternalFunction(CLI.Callee))
6977 CalleeAttrs = *Attrs;
6979 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
6981 MachineFrameInfo &MFI = MF.getFrameInfo();
6982 if (RequiresLazySave) {
6983 // Set up a lazy save mechanism by storing the runtime live slices
6984 // (worst-case N*N) to the TPIDR2 stack object.
6985 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6986 DAG.getConstant(1, DL, MVT::i32));
6987 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6988 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
6990 if (!TPIDR2Obj) {
6991 Register Reg;
6992 TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG, Reg);
6995 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
6996 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
6997 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
6998 SDValue BufferPtrAddr =
6999 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
7000 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
7001 Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
7002 Chain = DAG.getNode(
7003 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
7004 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7005 TPIDR2ObjAddr);
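// If this call requires a streaming-mode change, first materialise the current
// value of PSTATE.SM so the change can be made relative to it.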
7008 SDValue PStateSM;
7009 Optional<bool> RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
7010 if (RequiresSMChange)
7011 PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64);
7013 // Adjust the stack pointer for the new arguments...
7014 // These operations are automatically eliminated by the prolog/epilog pass
7015 if (!IsSibCall)
7016 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
7018 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
7019 getPointerTy(DAG.getDataLayout()));
7021 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7022 SmallSet<unsigned, 8> RegsUsed;
7023 SmallVector<SDValue, 8> MemOpChains;
7024 auto PtrVT = getPointerTy(DAG.getDataLayout());
7026 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
7027 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
7028 for (const auto &F : Forwards) {
7029 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
7030 RegsToPass.emplace_back(F.PReg, Val);
7034 // Walk the register/memloc assignments, inserting copies/loads.
7035 unsigned ExtraArgLocs = 0;
7036 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
7037 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7038 SDValue Arg = OutVals[i];
7039 ISD::ArgFlagsTy Flags = Outs[i].Flags;
7041 // Promote the value if needed.
7042 switch (VA.getLocInfo()) {
7043 default:
7044 llvm_unreachable("Unknown loc info!");
7045 case CCValAssign::Full:
7046 break;
7047 case CCValAssign::SExt:
7048 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
7049 break;
7050 case CCValAssign::ZExt:
7051 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7052 break;
7053 case CCValAssign::AExt:
7054 if (Outs[i].ArgVT == MVT::i1) {
7055 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
7057 // Check if we actually have to do this, because the value may
7058 // already be zero-extended.
7060 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
7061 // and rely on DAGCombiner to fold this, because the following
7062 // (anyext i32) is combined with (zext i8) in DAG.getNode:
7064 // (ext (zext x)) -> (zext x)
7066 // This will give us (zext i32), which we cannot remove, so
7067 // try to check this beforehand.
7068 if (!checkZExtBool(Arg, DAG)) {
7069 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7070 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
7073 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7074 break;
7075 case CCValAssign::AExtUpper:
7076 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7077 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
7078 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7079 DAG.getConstant(32, DL, VA.getLocVT()));
7080 break;
7081 case CCValAssign::BCvt:
7082 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
7083 break;
7084 case CCValAssign::Trunc:
7085 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7086 break;
7087 case CCValAssign::FPExt:
7088 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
7089 break;
7090 case CCValAssign::Indirect:
7091 bool isScalable = VA.getValVT().isScalableVector();
7092 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
7093 "Indirect arguments should be scalable on most subtargets");
7095 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
7096 uint64_t PartSize = StoreSize;
7097 unsigned NumParts = 1;
7098 if (Outs[i].Flags.isInConsecutiveRegs()) {
7099 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
7100 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7101 ++NumParts;
7102 StoreSize *= NumParts;
7105 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
7106 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
7107 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
7108 if (isScalable)
7109 MFI.setStackID(FI, TargetStackID::ScalableVector);
7111 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
7112 SDValue Ptr = DAG.getFrameIndex(
7113 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7114 SDValue SpillSlot = Ptr;
7116 // Ensure we generate all stores for each tuple part, whilst updating the
7117 // pointer after each store correctly using vscale.
7118 while (NumParts) {
7119 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
7120 NumParts--;
7121 if (NumParts > 0) {
7122 SDValue BytesIncrement;
7123 if (isScalable) {
7124 BytesIncrement = DAG.getVScale(
7125 DL, Ptr.getValueType(),
7126 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
7127 } else {
7128 BytesIncrement = DAG.getConstant(
7129 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
7130 Ptr.getValueType());
7132 SDNodeFlags Flags;
7133 Flags.setNoUnsignedWrap(true);
7135 MPI = MachinePointerInfo(MPI.getAddrSpace());
7136 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7137 BytesIncrement, Flags);
7138 ExtraArgLocs++;
7139 i++;
7143 Arg = SpillSlot;
7144 break;
7147 if (VA.isRegLoc()) {
7148 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
7149 Outs[0].VT == MVT::i64) {
7150 assert(VA.getLocVT() == MVT::i64 &&
7151 "unexpected calling convention register assignment");
7152 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
7153 "unexpected use of 'returned'");
7154 IsThisReturn = true;
7156 if (RegsUsed.count(VA.getLocReg())) {
7157 // If this register has already been used then we're trying to pack
7158 // parts of an [N x i32] into an X-register. The extension type will
7159 // take care of putting the two halves in the right place but we have to
7160 // combine them.
7161 SDValue &Bits =
7162 llvm::find_if(RegsToPass,
7163 [=](const std::pair<unsigned, SDValue> &Elt) {
7164 return Elt.first == VA.getLocReg();
7166 ->second;
7167 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7168       // Call site info is used for a function's parameter entry-value
7169 // tracking. For now we track only the simple cases where a parameter
7170 // is transferred through a whole register.
7171 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
7172 return ArgReg.Reg == VA.getLocReg();
7174 } else {
7175         // Add an extra level of indirection for streaming mode changes by
7176 // using a pseudo copy node that the simple register coalescer cannot
7177 // rematerialise between a smstart/smstop and the call.
7178 if (RequiresSMChange && isa<FrameIndexSDNode>(Arg))
7179 Arg = DAG.getNode(AArch64ISD::OBSCURE_COPY, DL, MVT::i64, Arg);
7180 RegsToPass.emplace_back(VA.getLocReg(), Arg);
7181 RegsUsed.insert(VA.getLocReg());
7182 const TargetOptions &Options = DAG.getTarget().Options;
7183 if (Options.EmitCallSiteInfo)
7184 CSInfo.emplace_back(VA.getLocReg(), i);
7186 } else {
7187 assert(VA.isMemLoc());
7189 SDValue DstAddr;
7190 MachinePointerInfo DstInfo;
7192 // FIXME: This works on big-endian for composite byvals, which are the
7193 // common case. It should also work for fundamental types.
7194 uint32_t BEAlign = 0;
7195 unsigned OpSize;
7196 if (VA.getLocInfo() == CCValAssign::Indirect ||
7197 VA.getLocInfo() == CCValAssign::Trunc)
7198 OpSize = VA.getLocVT().getFixedSizeInBits();
7199 else
7200 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
7201 : VA.getValVT().getSizeInBits();
7202 OpSize = (OpSize + 7) / 8;
7203 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
7204 !Flags.isInConsecutiveRegs()) {
7205 if (OpSize < 8)
7206 BEAlign = 8 - OpSize;
7208 unsigned LocMemOffset = VA.getLocMemOffset();
7209 int32_t Offset = LocMemOffset + BEAlign;
7210 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7211 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7213 if (IsTailCall) {
7214 Offset = Offset + FPDiff;
7215 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
7217 DstAddr = DAG.getFrameIndex(FI, PtrVT);
7218 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
7220 // Make sure any stack arguments overlapping with where we're storing
7221 // are loaded before this eventual operation. Otherwise they'll be
7222 // clobbered.
7223 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
7224 } else {
7225 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
7227 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
7228 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
7231 if (Outs[i].Flags.isByVal()) {
7232 SDValue SizeNode =
7233 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
7234 SDValue Cpy = DAG.getMemcpy(
7235 Chain, DL, DstAddr, Arg, SizeNode,
7236 Outs[i].Flags.getNonZeroByValAlign(),
7237 /*isVol = */ false, /*AlwaysInline = */ false,
7238 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
7240 MemOpChains.push_back(Cpy);
7241 } else {
7242         // Since we pass i1/i8/i16 as i1/i8/i16 on the stack and Arg is already
7243 // promoted to a legal register type i32, we should truncate Arg back to
7244 // i1/i8/i16.
7245 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
7246 VA.getValVT() == MVT::i16)
7247 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
7249 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
7250 MemOpChains.push_back(Store);
7255 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
7256 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
7257 // describing the argument list. x4 contains the address of the
7258 // first stack parameter. x5 contains the size in bytes of all parameters
7259 // passed on the stack.
7260 RegsToPass.emplace_back(AArch64::X4, StackPtr);
7261 RegsToPass.emplace_back(AArch64::X5,
7262 DAG.getConstant(NumBytes, DL, MVT::i64));
7265 if (!MemOpChains.empty())
7266 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
7268 SDValue InFlag;
7269 if (RequiresSMChange) {
7270 SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain,
7271 InFlag, PStateSM, true);
7272 Chain = NewChain.getValue(0);
7273 InFlag = NewChain.getValue(1);
7276 // Build a sequence of copy-to-reg nodes chained together with token chain
7277 // and flag operands which copy the outgoing args into the appropriate regs.
7278 for (auto &RegToPass : RegsToPass) {
7279 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
7280 RegToPass.second, InFlag);
7281 InFlag = Chain.getValue(1);
7284 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
7285 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
7286 // node so that legalize doesn't hack it.
7287 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7288 auto GV = G->getGlobal();
7289 unsigned OpFlags =
7290 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
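    // Callees that must be referenced indirectly (for example dllimport'ed
    // functions on Windows and Arm64EC targets) are classified with MO_GOT,
    // so their address is loaded via LOADgot rather than taken directly.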
7291 if (OpFlags & AArch64II::MO_GOT) {
7292 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
7293 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7294 } else {
7295 const GlobalValue *GV = G->getGlobal();
7296 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
7298 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
7299 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7300 Subtarget->isTargetMachO()) {
7301 const char *Sym = S->getSymbol();
7302 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
7303 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
7304 } else {
7305 const char *Sym = S->getSymbol();
7306 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
7310   // We don't usually want to end the call-sequence here because we would tidy
7311 // the frame up *after* the call. However, in the ABI-changing tail-call case
7312 // we've carefully laid out the parameters so that when sp is reset they'll be
7313 // in the correct location.
7314 if (IsTailCall && !IsSibCall) {
7315 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL);
7316 InFlag = Chain.getValue(1);
7319 std::vector<SDValue> Ops;
7320 Ops.push_back(Chain);
7321 Ops.push_back(Callee);
7323 if (IsTailCall) {
7324 // Each tail call may have to adjust the stack by a different amount, so
7325 // this information must travel along with the operation for eventual
7326 // consumption by emitEpilogue.
7327 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
7330 // Add argument registers to the end of the list so that they are known live
7331 // into the call.
7332 for (auto &RegToPass : RegsToPass)
7333 Ops.push_back(DAG.getRegister(RegToPass.first,
7334 RegToPass.second.getValueType()));
7336 // Add a register mask operand representing the call-preserved registers.
7337 const uint32_t *Mask;
7338 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7339 if (IsThisReturn) {
7340 // For 'this' returns, use the X0-preserving mask if applicable
7341 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
7342 if (!Mask) {
7343 IsThisReturn = false;
7344 Mask = TRI->getCallPreservedMask(MF, CallConv);
7346 } else
7347 Mask = TRI->getCallPreservedMask(MF, CallConv);
7349 if (Subtarget->hasCustomCallingConv())
7350 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
7352 if (TRI->isAnyArgRegReserved(MF))
7353 TRI->emitReservedArgRegCallError(MF);
7355 assert(Mask && "Missing call preserved mask for calling convention");
7356 Ops.push_back(DAG.getRegisterMask(Mask));
7358 if (InFlag.getNode())
7359 Ops.push_back(InFlag);
7361 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7363   // If we're doing a tail call, use a TC_RETURN here rather than an
7364 // actual call instruction.
7365 if (IsTailCall) {
7366 MF.getFrameInfo().setHasTailCall();
7367 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
7369 if (IsCFICall)
7370 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7372 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
7373 return Ret;
7376 unsigned CallOpc = AArch64ISD::CALL;
7377 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
7378 // be expanded to the call, directly followed by a special marker sequence and
7379 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
7380 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
7381 assert(!IsTailCall &&
7382 "tail calls cannot be marked with clang.arc.attachedcall");
7383 CallOpc = AArch64ISD::CALL_RVMARKER;
7385 // Add a target global address for the retainRV/claimRV runtime function
7386 // just before the call target.
7387 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
7388 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
7389 Ops.insert(Ops.begin() + 1, GA);
7390 } else if (GuardWithBTI)
7391 CallOpc = AArch64ISD::CALL_BTI;
7393 // Returns a chain and a flag for retval copy to use.
7394 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
7396 if (IsCFICall)
7397 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
7399 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
7400 InFlag = Chain.getValue(1);
7401 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
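  // When the calling convention makes the callee responsible for popping its
  // stack arguments (e.g. the tail-call-optimised conventions), report the
  // 16-byte-aligned argument size to CALLSEQ_END; otherwise the caller cleans
  // up and no callee-popped bytes are reported.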
7403 uint64_t CalleePopBytes =
7404 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
7406 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL);
7407 InFlag = Chain.getValue(1);
7409 // Handle result values, copying them out of physregs into vregs that we
7410 // return.
7411 SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs,
7412 DL, DAG, InVals, IsThisReturn,
7413 IsThisReturn ? OutVals[0] : SDValue());
7415 if (!Ins.empty())
7416 InFlag = Result.getValue(Result->getNumValues() - 1);
7418 if (RequiresSMChange) {
7419 assert(PStateSM && "Expected a PStateSM to be set");
7420 Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag,
7421 PStateSM, false);
7424 if (RequiresLazySave) {
7425 // Unconditionally resume ZA.
7426 Result = DAG.getNode(
7427 AArch64ISD::SMSTART, DL, MVT::Other, Result,
7428 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
7429 DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
7431 // Conditionally restore the lazy save using a pseudo node.
7432 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
7433 SDValue RegMask = DAG.getRegisterMask(
7434 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
7435 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
7436 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
7437 SDValue TPIDR2_EL0 = DAG.getNode(
7438 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
7439 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
7441 // Copy the address of the TPIDR2 block into X0 before 'calling' the
7442 // RESTORE_ZA pseudo.
7443 SDValue Glue;
7444 SDValue TPIDR2Block = DAG.getFrameIndex(
7445 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7446 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
7447 Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
7448 {Result, TPIDR2_EL0,
7449 DAG.getRegister(AArch64::X0, MVT::i64),
7450 RestoreRoutine,
7451 RegMask,
7452 Result.getValue(1)});
7454 // Finally reset the TPIDR2_EL0 register to 0.
7455 Result = DAG.getNode(
7456 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
7457 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
7458 DAG.getConstant(0, DL, MVT::i64));
7461 if (RequiresSMChange || RequiresLazySave) {
7462 for (unsigned I = 0; I < InVals.size(); ++I) {
7463 // The smstart/smstop is chained as part of the call, but when the
7464 // resulting chain is discarded (which happens when the call is not part
7465 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
7466 // smstart/smstop is chained to the result value. We can do that by doing
7467 // a vreg -> vreg copy.
7468 Register Reg = MF.getRegInfo().createVirtualRegister(
7469 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7470 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
7471 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
7472 InVals[I].getValueType());
7476 return Result;
7479 bool AArch64TargetLowering::CanLowerReturn(
7480 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
7481 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
7482 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7483 SmallVector<CCValAssign, 16> RVLocs;
7484 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7485 return CCInfo.CheckReturn(Outs, RetCC);
7488 SDValue
7489 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7490 bool isVarArg,
7491 const SmallVectorImpl<ISD::OutputArg> &Outs,
7492 const SmallVectorImpl<SDValue> &OutVals,
7493 const SDLoc &DL, SelectionDAG &DAG) const {
7494 auto &MF = DAG.getMachineFunction();
7495 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7497 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7498 SmallVector<CCValAssign, 16> RVLocs;
7499 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
7500 CCInfo.AnalyzeReturn(Outs, RetCC);
7502 // Copy the result values into the output registers.
7503 SDValue Flag;
7504 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
7505 SmallSet<unsigned, 4> RegsUsed;
7506 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
7507 ++i, ++realRVLocIdx) {
7508 CCValAssign &VA = RVLocs[i];
7509 assert(VA.isRegLoc() && "Can only return in registers!");
7510 SDValue Arg = OutVals[realRVLocIdx];
7512 switch (VA.getLocInfo()) {
7513 default:
7514 llvm_unreachable("Unknown loc info!");
7515 case CCValAssign::Full:
7516 if (Outs[i].ArgVT == MVT::i1) {
7517 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
7518 // value. This is strictly redundant on Darwin (which uses "zeroext
7519 // i1"), but will be optimised out before ISel.
7520 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7521 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7523 break;
7524 case CCValAssign::BCvt:
7525 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
7526 break;
7527 case CCValAssign::AExt:
7528 case CCValAssign::ZExt:
7529 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7530 break;
7531 case CCValAssign::AExtUpper:
7532 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7533 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7534 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7535 DAG.getConstant(32, DL, VA.getLocVT()));
7536 break;
7539 if (RegsUsed.count(VA.getLocReg())) {
7540 SDValue &Bits =
7541 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
7542 return Elt.first == VA.getLocReg();
7543 })->second;
7544 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7545 } else {
7546 RetVals.emplace_back(VA.getLocReg(), Arg);
7547 RegsUsed.insert(VA.getLocReg());
7551 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7553 // Emit SMSTOP before returning from a locally streaming function
7554 SMEAttrs FuncAttrs(MF.getFunction());
7555 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
7556 Chain = DAG.getNode(
7557 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
7558 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32),
7559 DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64),
7560 DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
7561 Flag = Chain.getValue(1);
7564 SmallVector<SDValue, 4> RetOps(1, Chain);
7565 for (auto &RetVal : RetVals) {
7566 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
7567 Flag = Chain.getValue(1);
7568 RetOps.push_back(
7569 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
7572   // The Windows AArch64 ABIs require that, when returning a struct by value,
7573 // we copy the sret argument into X0 for the return.
7574 // We saved the argument into a virtual register in the entry block,
7575 // so now we copy the value out and into X0.
7576 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
7577 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
7578 getPointerTy(MF.getDataLayout()));
7580 unsigned RetValReg = AArch64::X0;
7581 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
7582 Flag = Chain.getValue(1);
7584 RetOps.push_back(
7585 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
7588 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
7589 if (I) {
7590 for (; *I; ++I) {
7591 if (AArch64::GPR64RegClass.contains(*I))
7592 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
7593 else if (AArch64::FPR64RegClass.contains(*I))
7594 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
7595 else
7596 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
7600 RetOps[0] = Chain; // Update chain.
7602 // Add the flag if we have it.
7603 if (Flag.getNode())
7604 RetOps.push_back(Flag);
7606 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
7609 //===----------------------------------------------------------------------===//
7610 // Other Lowering Code
7611 //===----------------------------------------------------------------------===//
7613 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
7614 SelectionDAG &DAG,
7615 unsigned Flag) const {
7616 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
7617 N->getOffset(), Flag);
7620 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
7621 SelectionDAG &DAG,
7622 unsigned Flag) const {
7623 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
7626 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
7627 SelectionDAG &DAG,
7628 unsigned Flag) const {
7629 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
7630 N->getOffset(), Flag);
7633 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
7634 SelectionDAG &DAG,
7635 unsigned Flag) const {
7636 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
7639 // (loadGOT sym)
7640 template <class NodeTy>
7641 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
7642 unsigned Flags) const {
7643 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
7644 SDLoc DL(N);
7645 EVT Ty = getPointerTy(DAG.getDataLayout());
7646 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
7647 // FIXME: Once remat is capable of dealing with instructions with register
7648 // operands, expand this into two nodes instead of using a wrapper node.
7649 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
7652 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
7653 template <class NodeTy>
7654 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
7655 unsigned Flags) const {
7656 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
7657 SDLoc DL(N);
7658 EVT Ty = getPointerTy(DAG.getDataLayout());
7659 const unsigned char MO_NC = AArch64II::MO_NC;
7660 return DAG.getNode(
7661 AArch64ISD::WrapperLarge, DL, Ty,
7662 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
7663 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
7664 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
7665 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
7668 // (addlow (adrp %hi(sym)) %lo(sym))
7669 template <class NodeTy>
7670 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
7671 unsigned Flags) const {
7672 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
7673 SDLoc DL(N);
7674 EVT Ty = getPointerTy(DAG.getDataLayout());
7675 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
7676 SDValue Lo = getTargetNode(N, Ty, DAG,
7677 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
7678 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
7679 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
7682 // (adr sym)
7683 template <class NodeTy>
7684 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
7685 unsigned Flags) const {
7686 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
7687 SDLoc DL(N);
7688 EVT Ty = getPointerTy(DAG.getDataLayout());
7689 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
7690 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
7693 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
7694 SelectionDAG &DAG) const {
7695 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
7696 const GlobalValue *GV = GN->getGlobal();
7697 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7699 if (OpFlags != AArch64II::MO_NO_FLAG)
7700 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
7701 "unexpected offset in global node");
7703   // This also catches the large code model case for Darwin, and the tiny
7704 // code model with GOT relocations.
7705 if ((OpFlags & AArch64II::MO_GOT) != 0) {
7706 return getGOT(GN, DAG, OpFlags);
7709 SDValue Result;
7710 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7711 Result = getAddrLarge(GN, DAG, OpFlags);
7712 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7713 Result = getAddrTiny(GN, DAG, OpFlags);
7714 } else {
7715 Result = getAddr(GN, DAG, OpFlags);
7717 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7718 SDLoc DL(GN);
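  // dllimport'ed and stub-accessed globals resolve to a pointer slot (such as
  // the __imp_ symbol for dllimport), so an extra load is required to obtain
  // the real address of the global.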
7719 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
7720 AArch64II::MO_COFFSTUB))
7721 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
7722 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
7723 return Result;
7726 /// Convert a TLS address reference into the correct sequence of loads
7727 /// and calls to compute the variable's address (for Darwin, currently) and
7728 /// return an SDValue containing the final node.
7730 /// Darwin only has one TLS scheme which must be capable of dealing with the
7731 /// fully general situation, in the worst case. This means:
7732 /// + "extern __thread" declaration.
7733 /// + Defined in a possibly unknown dynamic library.
7735 /// The general system is that each __thread variable has a [3 x i64] descriptor
7736 /// which contains information used by the runtime to calculate the address. The
7737 /// only part of this the compiler needs to know about is the first xword, which
7738 /// contains a function pointer that must be called with the address of the
7739 /// entire descriptor in "x0".
7741 /// Since this descriptor may be in a different unit, in general even the
7742 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
7743 /// is:
7744 /// adrp x0, _var@TLVPPAGE
7745 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
7746 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
7747 /// ; the function pointer
7748 /// blr x1 ; Uses descriptor address in x0
7749 /// ; Address of _var is now in x0.
7751 /// If the address of _var's descriptor *is* known to the linker, then it can
7752 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
7753 /// a slight efficiency gain.
7754 SDValue
7755 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
7756 SelectionDAG &DAG) const {
7757 assert(Subtarget->isTargetDarwin() &&
7758 "This function expects a Darwin target");
7760 SDLoc DL(Op);
7761 MVT PtrVT = getPointerTy(DAG.getDataLayout());
7762 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7763 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7765 SDValue TLVPAddr =
7766 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7767 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
7769 // The first entry in the descriptor is a function pointer that we must call
7770 // to obtain the address of the variable.
7771 SDValue Chain = DAG.getEntryNode();
7772 SDValue FuncTLVGet = DAG.getLoad(
7773 PtrMemVT, DL, Chain, DescAddr,
7774 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
7775 Align(PtrMemVT.getSizeInBits() / 8),
7776 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
7777 Chain = FuncTLVGet.getValue(1);
7779 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
7780 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
7782 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7783 MFI.setAdjustsStack(true);
7785 // TLS calls preserve all registers except those that absolutely must be
7786 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
7787 // silly).
7788 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7789 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
7790 if (Subtarget->hasCustomCallingConv())
7791 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
7793 // Finally, we can make the call. This is just a degenerate version of a
7794 // normal AArch64 call node: x0 takes the address of the descriptor, and
7795 // returns the address of the variable in this thread.
7796 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
7797 Chain =
7798 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
7799 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
7800 DAG.getRegisterMask(Mask), Chain.getValue(1));
7801 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
7804 /// Convert a thread-local variable reference into a sequence of instructions to
7805 /// compute the variable's address for the local exec TLS model of ELF targets.
7806 /// The sequence depends on the maximum TLS area size.
7807 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
7808 SDValue ThreadBase,
7809 const SDLoc &DL,
7810 SelectionDAG &DAG) const {
7811 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7812 SDValue TPOff, Addr;
7814 switch (DAG.getTarget().Options.TLSSize) {
7815 default:
7816 llvm_unreachable("Unexpected TLS size");
7818 case 12: {
7819 // mrs x0, TPIDR_EL0
7820 // add x0, x0, :tprel_lo12:a
7821 SDValue Var = DAG.getTargetGlobalAddress(
7822 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
7823 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7824 Var,
7825 DAG.getTargetConstant(0, DL, MVT::i32)),
7829 case 24: {
7830 // mrs x0, TPIDR_EL0
7831 // add x0, x0, :tprel_hi12:a
7832 // add x0, x0, :tprel_lo12_nc:a
7833 SDValue HiVar = DAG.getTargetGlobalAddress(
7834 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7835 SDValue LoVar = DAG.getTargetGlobalAddress(
7836 GV, DL, PtrVT, 0,
7837 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7838 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7839 HiVar,
7840 DAG.getTargetConstant(0, DL, MVT::i32)),
7842 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
7843 LoVar,
7844 DAG.getTargetConstant(0, DL, MVT::i32)),
7848 case 32: {
7849 // mrs x1, TPIDR_EL0
7850 // movz x0, #:tprel_g1:a
7851 // movk x0, #:tprel_g0_nc:a
7852 // add x0, x1, x0
7853 SDValue HiVar = DAG.getTargetGlobalAddress(
7854 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
7855 SDValue LoVar = DAG.getTargetGlobalAddress(
7856 GV, DL, PtrVT, 0,
7857 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7858 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7859 DAG.getTargetConstant(16, DL, MVT::i32)),
7861 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7862 DAG.getTargetConstant(0, DL, MVT::i32)),
7864 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7867 case 48: {
7868 // mrs x1, TPIDR_EL0
7869 // movz x0, #:tprel_g2:a
7870 // movk x0, #:tprel_g1_nc:a
7871 // movk x0, #:tprel_g0_nc:a
7872 // add x0, x1, x0
7873 SDValue HiVar = DAG.getTargetGlobalAddress(
7874 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
7875 SDValue MiVar = DAG.getTargetGlobalAddress(
7876 GV, DL, PtrVT, 0,
7877 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
7878 SDValue LoVar = DAG.getTargetGlobalAddress(
7879 GV, DL, PtrVT, 0,
7880 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7881 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7882 DAG.getTargetConstant(32, DL, MVT::i32)),
7884 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
7885 DAG.getTargetConstant(16, DL, MVT::i32)),
7887 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7888 DAG.getTargetConstant(0, DL, MVT::i32)),
7890 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7895 /// When accessing thread-local variables under either the general-dynamic or
7896 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
7897 /// have a descriptor, accessible via a PC-relative ADRP, whose first entry is
7898 /// a function pointer that carries out the resolution.
7900 /// The sequence is:
7901 /// adrp x0, :tlsdesc:var
7902 /// ldr x1, [x0, #:tlsdesc_lo12:var]
7903 /// add x0, x0, #:tlsdesc_lo12:var
7904 /// .tlsdesccall var
7905 /// blr x1
7906 /// (TPIDR_EL0 offset now in x0)
7908 /// The above sequence must be produced unscheduled, to enable the linker to
7909 /// optimize/relax this sequence.
7910 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
7911 /// above sequence, and is expanded very late in the compilation flow, to ensure
7912 /// the sequence is emitted exactly as shown above.
7913 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
7914 const SDLoc &DL,
7915 SelectionDAG &DAG) const {
7916 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7918 SDValue Chain = DAG.getEntryNode();
7919 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7921 Chain =
7922 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
7923 SDValue Glue = Chain.getValue(1);
7925 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
7928 SDValue
7929 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
7930 SelectionDAG &DAG) const {
7931 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
7933 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7935 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
7937 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
7938 if (Model == TLSModel::LocalDynamic)
7939 Model = TLSModel::GeneralDynamic;
7942 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7943 Model != TLSModel::LocalExec)
7944 report_fatal_error("ELF TLS only supported in small memory model or "
7945 "in local exec TLS model");
7946 // Different choices can be made for the maximum size of the TLS area for a
7947 // module. For the small address model, the default TLS size is 16MiB and the
7948 // maximum TLS size is 4GiB.
7949 // FIXME: add tiny and large code model support for TLS access models other
7950 // than local exec. We currently generate the same code as small for tiny,
7951 // which may be larger than needed.
7953 SDValue TPOff;
7954 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7955 SDLoc DL(Op);
7956 const GlobalValue *GV = GA->getGlobal();
7958 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
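  // THREAD_POINTER reads TPIDR_EL0; each access model below produces an offset
  // (TPOff) that is added to this thread base to form the variable's address.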
7960 if (Model == TLSModel::LocalExec) {
7961 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
7962 } else if (Model == TLSModel::InitialExec) {
7963 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7964 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
7965 } else if (Model == TLSModel::LocalDynamic) {
7966     // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
7967 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
7968 // the beginning of the module's TLS region, followed by a DTPREL offset
7969 // calculation.
7971 // These accesses will need deduplicating if there's more than one.
7972 AArch64FunctionInfo *MFI =
7973 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7974 MFI->incNumLocalDynamicTLSAccesses();
7976 // The call needs a relocation too for linker relaxation. It doesn't make
7977 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
7978 // the address.
7979 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
7980 AArch64II::MO_TLS);
7982 // Now we can calculate the offset from TPIDR_EL0 to this module's
7983 // thread-local area.
7984 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
7986 // Now use :dtprel_whatever: operations to calculate this variable's offset
7987 // in its thread-storage area.
7988 SDValue HiVar = DAG.getTargetGlobalAddress(
7989 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7990 SDValue LoVar = DAG.getTargetGlobalAddress(
7991 GV, DL, MVT::i64, 0,
7992 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7994 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
7995 DAG.getTargetConstant(0, DL, MVT::i32)),
7997 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
7998 DAG.getTargetConstant(0, DL, MVT::i32)),
8000 } else if (Model == TLSModel::GeneralDynamic) {
8001 // The call needs a relocation too for linker relaxation. It doesn't make
8002 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
8003 // the address.
8004 SDValue SymAddr =
8005 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8007 // Finally we can make a call to calculate the offset from tpidr_el0.
8008 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
8009 } else
8010 llvm_unreachable("Unsupported ELF TLS access model");
8012 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
8015 SDValue
8016 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
8017 SelectionDAG &DAG) const {
8018 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
8020 SDValue Chain = DAG.getEntryNode();
8021 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8022 SDLoc DL(Op);
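  // On Windows, x18 is reserved as the platform register and holds a pointer
  // to the TEB (Thread Environment Block) of the current thread.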
8024 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
8026 // Load the ThreadLocalStoragePointer from the TEB
8027 // A pointer to the TLS array is located at offset 0x58 from the TEB.
8028 SDValue TLSArray =
8029 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
8030 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
8031 Chain = TLSArray.getValue(1);
8033   // Load the TLS index from the C runtime.
8034 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
8035 // This also does the same as LOADgot, but using a generic i32 load,
8036 // while LOADgot only loads i64.
8037 SDValue TLSIndexHi =
8038 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
8039 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
8040 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8041 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
8042 SDValue TLSIndex =
8043 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
8044 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
8045 Chain = TLSIndex.getValue(1);
8047   // The pointer to the thread's TLS data area is at offset TLSIndex * 8 into
8048 // the TLSArray.
8049 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
8050 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
8051 DAG.getConstant(3, DL, PtrVT));
8052 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
8053 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
8054 MachinePointerInfo());
8055 Chain = TLS.getValue(1);
8057 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8058 const GlobalValue *GV = GA->getGlobal();
8059 SDValue TGAHi = DAG.getTargetGlobalAddress(
8060 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
8061 SDValue TGALo = DAG.getTargetGlobalAddress(
8062 GV, DL, PtrVT, 0,
8063 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
8065 // Add the offset from the start of the .tls section (section base).
8066 SDValue Addr =
8067 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
8068 DAG.getTargetConstant(0, DL, MVT::i32)),
8070 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
8071 return Addr;
8074 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
8075 SelectionDAG &DAG) const {
8076 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
8077 if (DAG.getTarget().useEmulatedTLS())
8078 return LowerToTLSEmulatedModel(GA, DAG);
8080 if (Subtarget->isTargetDarwin())
8081 return LowerDarwinGlobalTLSAddress(Op, DAG);
8082 if (Subtarget->isTargetELF())
8083 return LowerELFGlobalTLSAddress(Op, DAG);
8084 if (Subtarget->isTargetWindows())
8085 return LowerWindowsGlobalTLSAddress(Op, DAG);
8087 llvm_unreachable("Unexpected platform trying to use TLS");
8090 // Looks through \param Val to determine the bit that can be used to
8091 // check the sign of the value. It returns the unextended value and
8092 // the sign bit position.
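// For example, (sign_extend_inreg x, i8) yields {x, 7}, and a plain i32 value
// yields {value, 31}.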
8093 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
8094 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
8095 return {Val.getOperand(0),
8096 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
8099 if (Val.getOpcode() == ISD::SIGN_EXTEND)
8100 return {Val.getOperand(0),
8101 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
8103 return {Val, Val.getValueSizeInBits() - 1};
8106 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
8107 SDValue Chain = Op.getOperand(0);
8108 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
8109 SDValue LHS = Op.getOperand(2);
8110 SDValue RHS = Op.getOperand(3);
8111 SDValue Dest = Op.getOperand(4);
8112 SDLoc dl(Op);
8114 MachineFunction &MF = DAG.getMachineFunction();
8115 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
8116 // will not be produced, as they are conditional branch instructions that do
8117 // not set flags.
8118 bool ProduceNonFlagSettingCondBr =
8119 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
8121 // Handle f128 first, since lowering it will result in comparing the return
8122 // value of a libcall against zero, which is just what the rest of LowerBR_CC
8123 // is expecting to deal with.
8124 if (LHS.getValueType() == MVT::f128) {
8125 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8127 // If softenSetCCOperands returned a scalar, we need to compare the result
8128 // against zero to select between true and false values.
8129 if (!RHS.getNode()) {
8130 RHS = DAG.getConstant(0, dl, LHS.getValueType());
8131 CC = ISD::SETNE;
8135 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
8136 // instruction.
8137 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
8138 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
8139 // Only lower legal XALUO ops.
8140 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
8141 return SDValue();
8143 // The actual operation with overflow check.
8144 AArch64CC::CondCode OFCC;
8145 SDValue Value, Overflow;
8146 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
8148 if (CC == ISD::SETNE)
8149 OFCC = getInvertedCondCode(OFCC);
8150 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
8152 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8153 Overflow);
8156 if (LHS.getValueType().isInteger()) {
8157 assert((LHS.getValueType() == RHS.getValueType()) &&
8158 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8160 // If the RHS of the comparison is zero, we can potentially fold this
8161 // to a specialized branch.
8162 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8163 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
8164 if (CC == ISD::SETEQ) {
8165 // See if we can use a TBZ to fold in an AND as well.
8166 // TBZ has a smaller branch displacement than CBZ. If the offset is
8167 // out of bounds, a late MI-layer pass rewrites branches.
8168 // 403.gcc is an example that hits this case.
8169 if (LHS.getOpcode() == ISD::AND &&
8170 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8171 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8172 SDValue Test = LHS.getOperand(0);
8173 uint64_t Mask = LHS.getConstantOperandVal(1);
8174 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
8175 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8176 Dest);
8179 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
8180 } else if (CC == ISD::SETNE) {
8181 // See if we can use a TBZ to fold in an AND as well.
8182 // TBZ has a smaller branch displacement than CBZ. If the offset is
8183 // out of bounds, a late MI-layer pass rewrites branches.
8184 // 403.gcc is an example that hits this case.
8185 if (LHS.getOpcode() == ISD::AND &&
8186 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8187 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
8188 SDValue Test = LHS.getOperand(0);
8189 uint64_t Mask = LHS.getConstantOperandVal(1);
8190 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
8191 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
8192 Dest);
8195 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
8196 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
8197 // Don't combine AND since emitComparison converts the AND to an ANDS
8198 // (a.k.a. TST) and the test in the test bit and branch instruction
8199 // becomes redundant. This would also increase register pressure.
8200 uint64_t SignBitPos;
8201 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8202 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
8203 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8206 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
8207 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
8208 // Don't combine AND since emitComparison converts the AND to an ANDS
8209 // (a.k.a. TST) and the test in the test bit and branch instruction
8210 // becomes redundant. This would also increase register pressure.
8211 uint64_t SignBitPos;
8212 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
8213 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
8214 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
8217 SDValue CCVal;
8218 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8219 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
8220 Cmp);
8223 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
8224 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
8226 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8227 // clean. Some of them require two branches to implement.
8228 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8229 AArch64CC::CondCode CC1, CC2;
8230 changeFPCCToAArch64CC(CC, CC1, CC2);
8231 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8232 SDValue BR1 =
8233 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
8234 if (CC2 != AArch64CC::AL) {
8235 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8236 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
8237 Cmp);
8240 return BR1;
8243 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
8244 SelectionDAG &DAG) const {
8245 if (!Subtarget->hasNEON())
8246 return SDValue();
8248 EVT VT = Op.getValueType();
8249 EVT IntVT = VT.changeTypeToInteger();
8250 SDLoc DL(Op);
8252 SDValue In1 = Op.getOperand(0);
8253 SDValue In2 = Op.getOperand(1);
8254 EVT SrcVT = In2.getValueType();
8256 if (!SrcVT.bitsEq(VT))
8257 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
8259 if (VT.isScalableVector())
8260 IntVT =
8261 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
8263 if (VT.isFixedLengthVector() && useSVEForFixedLengthVectorVT(VT)) {
8264 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
8266 In1 = convertToScalableVector(DAG, ContainerVT, In1);
8267 In2 = convertToScalableVector(DAG, ContainerVT, In2);
8269 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
8270 return convertFromScalableVector(DAG, VT, Res);
8273 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
8274 if (VT.isScalableVector())
8275 return getSVESafeBitCast(VT, Op, DAG);
8277 return DAG.getBitcast(VT, Op);
8280 SDValue VecVal1, VecVal2;
8281 EVT VecVT;
8282 auto SetVecVal = [&](int Idx = -1) {
8283 if (!VT.isVector()) {
8284 VecVal1 =
8285 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
8286 VecVal2 =
8287 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
8288 } else {
8289 VecVal1 = BitCast(VecVT, In1, DAG);
8290 VecVal2 = BitCast(VecVT, In2, DAG);
8293 if (VT.isVector()) {
8294 VecVT = IntVT;
8295 SetVecVal();
8296 } else if (VT == MVT::f64) {
8297 VecVT = MVT::v2i64;
8298 SetVecVal(AArch64::dsub);
8299 } else if (VT == MVT::f32) {
8300 VecVT = MVT::v4i32;
8301 SetVecVal(AArch64::ssub);
8302 } else if (VT == MVT::f16) {
8303 VecVT = MVT::v8i16;
8304 SetVecVal(AArch64::hsub);
8305 } else {
8306 llvm_unreachable("Invalid type for copysign!");
8309 unsigned BitWidth = In1.getScalarValueSizeInBits();
8310 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
8312 // We want to materialize a mask with every bit but the high bit set, but the
8313 // AdvSIMD immediate moves cannot materialize that in a single instruction for
8314 // 64-bit elements. Instead, materialize all bits set and then negate that.
8315 if (VT == MVT::f64 || VT == MVT::v2f64) {
8316 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
8317 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
8318 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
8319 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
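  // BSP selects bits from VecVal1 where the mask is set and from VecVal2 where
  // it is clear, so with a mask of all-but-the-sign-bit the result keeps the
  // magnitude of In1 and takes the sign of In2, i.e. copysign.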
8322 SDValue BSP =
8323 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
8324 if (VT == MVT::f16)
8325 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
8326 if (VT == MVT::f32)
8327 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
8328 if (VT == MVT::f64)
8329 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
8331 return BitCast(VT, BSP, DAG);
8334 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
8335 SelectionDAG &DAG) const {
8336 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
8337 Attribute::NoImplicitFloat))
8338 return SDValue();
8340 if (!Subtarget->hasNEON())
8341 return SDValue();
8343 bool IsParity = Op.getOpcode() == ISD::PARITY;
8345   // Although there is no dedicated integer popcount instruction, CTPOP can be
8346 // lowered efficiently to the following sequence, which uses AdvSIMD
8347 // registers/instructions, as long as the copies to/from the AdvSIMD
8348 // registers are cheap.
8349 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
8350 // CNT V0.8B, V0.8B // 8xbyte pop-counts
8351 // ADDV B0, V0.8B // sum 8xbyte pop-counts
8352 // UMOV X0, V0.B[0] // copy byte result back to integer reg
8353 SDValue Val = Op.getOperand(0);
8354 SDLoc DL(Op);
8355 EVT VT = Op.getValueType();
8357 if (VT == MVT::i32 || VT == MVT::i64) {
8358 if (VT == MVT::i32)
8359 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
8360 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
8362 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
8363 SDValue UaddLV = DAG.getNode(
8364 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8365 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8367 if (IsParity)
8368 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8369 DAG.getConstant(1, DL, MVT::i32));
8371 if (VT == MVT::i64)
8372 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
8373 return UaddLV;
8374 } else if (VT == MVT::i128) {
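    // For i128, reinterpret the value as sixteen bytes and sum the per-byte
    // counts with UADDLV, just like the i32/i64 case above.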
8375 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
8377 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
8378 SDValue UaddLV = DAG.getNode(
8379 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
8380 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
8382 if (IsParity)
8383 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
8384 DAG.getConstant(1, DL, MVT::i32));
8386 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
8389 assert(!IsParity && "ISD::PARITY of vector types not supported");
8391 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
8392 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
8394 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
8395 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
8396 "Unexpected type for custom ctpop lowering");
8398 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
8399 Val = DAG.getBitcast(VT8Bit, Val);
8400 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
8402 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
8403 unsigned EltSize = 8;
8404 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
8405 while (EltSize != VT.getScalarSizeInBits()) {
8406 EltSize *= 2;
8407 NumElts /= 2;
8408 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
8409 Val = DAG.getNode(
8410 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
8411 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
8414 return Val;
8417 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
8418 EVT VT = Op.getValueType();
8419 assert(VT.isScalableVector() ||
8420 useSVEForFixedLengthVectorVT(
8421 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
8423 SDLoc DL(Op);
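  // There is no dedicated CTTZ instruction for these types; instead use the
  // identity cttz(x) == ctlz(bitreverse(x)).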
8424 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
8425 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
8428 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
8429 SelectionDAG &DAG) const {
8431 EVT VT = Op.getValueType();
8432 SDLoc DL(Op);
8433 unsigned Opcode = Op.getOpcode();
8434 ISD::CondCode CC;
8435 switch (Opcode) {
8436 default:
8437 llvm_unreachable("Wrong instruction");
8438 case ISD::SMAX:
8439 CC = ISD::SETGT;
8440 break;
8441 case ISD::SMIN:
8442 CC = ISD::SETLT;
8443 break;
8444 case ISD::UMAX:
8445 CC = ISD::SETUGT;
8446 break;
8447 case ISD::UMIN:
8448 CC = ISD::SETULT;
8449 break;
8452 if (VT.isScalableVector() ||
8453 useSVEForFixedLengthVectorVT(
8454 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
8455 switch (Opcode) {
8456 default:
8457 llvm_unreachable("Wrong instruction");
8458 case ISD::SMAX:
8459 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
8460 case ISD::SMIN:
8461 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
8462 case ISD::UMAX:
8463 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
8464 case ISD::UMIN:
8465 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
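  // Otherwise expand to an explicit compare followed by a select on the
  // comparison result.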
8469 SDValue Op0 = Op.getOperand(0);
8470 SDValue Op1 = Op.getOperand(1);
8471 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
8472 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
8475 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
8476 SelectionDAG &DAG) const {
8477 EVT VT = Op.getValueType();
8479 if (VT.isScalableVector() ||
8480 useSVEForFixedLengthVectorVT(
8481 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
8482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
8484 SDLoc DL(Op);
8485 SDValue REVB;
8486 MVT VST;
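  // Reverse the bytes within each element with REV32/REV64, then reverse the
  // bits within each byte with a byte-wide BITREVERSE; together this reverses
  // the bits of each element.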
8488 switch (VT.getSimpleVT().SimpleTy) {
8489 default:
8490 llvm_unreachable("Invalid type for bitreverse!");
8492 case MVT::v2i32: {
8493 VST = MVT::v8i8;
8494 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8496 break;
8499 case MVT::v4i32: {
8500 VST = MVT::v16i8;
8501 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8503 break;
8506 case MVT::v1i64: {
8507 VST = MVT::v8i8;
8508 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8510 break;
8513 case MVT::v2i64: {
8514 VST = MVT::v16i8;
8515 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8517 break;
8521 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
8522 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
8525 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8527 if (Op.getValueType().isVector())
8528 return LowerVSETCC(Op, DAG);
8530 bool IsStrict = Op->isStrictFPOpcode();
8531 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
8532 unsigned OpNo = IsStrict ? 1 : 0;
8533 SDValue Chain;
8534 if (IsStrict)
8535 Chain = Op.getOperand(0);
8536 SDValue LHS = Op.getOperand(OpNo + 0);
8537 SDValue RHS = Op.getOperand(OpNo + 1);
8538 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
8539 SDLoc dl(Op);
8541 // We chose ZeroOrOneBooleanContents, so use zero and one.
8542 EVT VT = Op.getValueType();
8543 SDValue TVal = DAG.getConstant(1, dl, VT);
8544 SDValue FVal = DAG.getConstant(0, dl, VT);
8546 // Handle f128 first, since one possible outcome is a normal integer
8547 // comparison which gets picked up by the next if statement.
8548 if (LHS.getValueType() == MVT::f128) {
8549 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
8550 IsSignaling);
8552 // If softenSetCCOperands returned a scalar, use it.
8553 if (!RHS.getNode()) {
8554 assert(LHS.getValueType() == Op.getValueType() &&
8555 "Unexpected setcc expansion!");
8556 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
8560 if (LHS.getValueType().isInteger()) {
8561 SDValue CCVal;
8562 SDValue Cmp = getAArch64Cmp(
8563 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
8565 // Note that we inverted the condition above, so we reverse the order of
8566 // the true and false operands here. This will allow the setcc to be
8567 // matched to a single CSINC instruction.
8568 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
8569 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
8572 // Now we know we're dealing with FP values.
8573 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8574 LHS.getValueType() == MVT::f64);
8576   // For FP values we need to perform an FCMP + CSEL sequence. Go ahead
8577 // and do the comparison.
8578 SDValue Cmp;
8579 if (IsStrict)
8580 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
8581 else
8582 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8584 AArch64CC::CondCode CC1, CC2;
8585 changeFPCCToAArch64CC(CC, CC1, CC2);
8586 SDValue Res;
8587 if (CC2 == AArch64CC::AL) {
8588 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
8589 CC2);
8590 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8592 // Note that we inverted the condition above, so we reverse the order of
8593 // the true and false operands here. This will allow the setcc to be
8594 // matched to a single CSINC instruction.
8595 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
8596 } else {
8597 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
8598     // totally clean. Some of them require two CSELs to implement. As in this
8599 // case, we emit the first CSEL and then emit a second using the output
8600 // of the first as the RHS. We're effectively OR'ing the two CC's together.
8602 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
8603 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8604 SDValue CS1 =
8605 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8607 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8608 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8610 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
8613 SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
8614 SelectionDAG &DAG) const {
8616 SDValue LHS = Op.getOperand(0);
8617 SDValue RHS = Op.getOperand(1);
8618 EVT VT = LHS.getValueType();
8619 if (VT != MVT::i32 && VT != MVT::i64)
8620 return SDValue();
8622 SDLoc DL(Op);
8623 SDValue Carry = Op.getOperand(2);
8624 // SBCS uses a carry, not a borrow, so the carry flag should be inverted first.
8625 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
8626 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
8627 LHS, RHS, InvCarry);
8629 EVT OpVT = Op.getValueType();
8630 SDValue TVal = DAG.getConstant(1, DL, OpVT);
8631 SDValue FVal = DAG.getConstant(0, DL, OpVT);
8633 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
8634 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
8635 SDValue CCVal =
8636 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
8637 // Inputs are swapped because the condition is inverted. This will allow
8638 // matching with a single CSINC instruction.
8639 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
8640 Cmp.getValue(1));
8643 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
8644 SDValue RHS, SDValue TVal,
8645 SDValue FVal, const SDLoc &dl,
8646 SelectionDAG &DAG) const {
8647 // Handle f128 first, because it will result in a comparison of some RTLIB
8648 // call result against zero.
8649 if (LHS.getValueType() == MVT::f128) {
8650 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8652 // If softenSetCCOperands returned a scalar, we need to compare the result
8653 // against zero to select between true and false values.
8654 if (!RHS.getNode()) {
8655 RHS = DAG.getConstant(0, dl, LHS.getValueType());
8656 CC = ISD::SETNE;
8660 // Also handle f16, for which we need to do a f32 comparison.
8661 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
8662 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
8663 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
8666 // Next, handle integers.
8667 if (LHS.getValueType().isInteger()) {
8668 assert((LHS.getValueType() == RHS.getValueType()) &&
8669 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8671 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
8672 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
8673 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8674 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
8675 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
8676 // supported types.
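// For example, with i64 operands, (select_cc setgt, x, -1, 1, -1) becomes
//   (or (sra x, 63), 1)
// which evaluates to 1 when x >= 0 and to -1 when x < 0.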
8677 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
8678 CTVal->isOne() && CFVal->isAllOnes() &&
8679 LHS.getValueType() == TVal.getValueType()) {
8680 EVT VT = LHS.getValueType();
8681 SDValue Shift =
8682 DAG.getNode(ISD::SRA, dl, VT, LHS,
8683 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
8684 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
8687 unsigned Opcode = AArch64ISD::CSEL;
8689 // If both the TVal and the FVal are constants, see if we can swap them in
8690 // order to form a CSINV or CSINC out of them.
8691 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
8692 std::swap(TVal, FVal);
8693 std::swap(CTVal, CFVal);
8694 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8695 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
8696 std::swap(TVal, FVal);
8697 std::swap(CTVal, CFVal);
8698 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8699 } else if (TVal.getOpcode() == ISD::XOR) {
8700 // If TVal is a NOT we want to swap TVal and FVal so that we can match
8701 // with a CSINV rather than a CSEL.
8702 if (isAllOnesConstant(TVal.getOperand(1))) {
8703 std::swap(TVal, FVal);
8704 std::swap(CTVal, CFVal);
8705 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8707 } else if (TVal.getOpcode() == ISD::SUB) {
8708 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
8709 // that we can match with a CSNEG rather than a CSEL.
8710 if (isNullConstant(TVal.getOperand(0))) {
8711 std::swap(TVal, FVal);
8712 std::swap(CTVal, CFVal);
8713 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8715 } else if (CTVal && CFVal) {
8716 const int64_t TrueVal = CTVal->getSExtValue();
8717 const int64_t FalseVal = CFVal->getSExtValue();
8718 bool Swap = false;
8720 // If both TVal and FVal are constants, see if FVal is the
8721 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
8722 // instead of a CSEL in that case.
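// Illustrative constant pairs (TVal, FVal): (C, ~C) allows CSINV, (C, -C)
// allows CSNEG, and values that differ by one, e.g. (5, 4), allow CSINC;
// in each case FVal is dropped below (after a possible swap) and recomputed
// from TVal by the instruction itself.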
8723 if (TrueVal == ~FalseVal) {
8724 Opcode = AArch64ISD::CSINV;
8725 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
8726 TrueVal == -FalseVal) {
8727 Opcode = AArch64ISD::CSNEG;
8728 } else if (TVal.getValueType() == MVT::i32) {
8729 // If our operands are only 32-bit wide, make sure we use 32-bit
8730 // arithmetic for the check whether we can use CSINC. This ensures that
8731 // the addition in the check will wrap around properly in case there is
8732 // an overflow (which would not be the case if we do the check with
8733 // 64-bit arithmetic).
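// For example, with i32 constants INT32_MAX and INT32_MIN the two values
// differ by one only under 32-bit wraparound (0x7fffffff + 1 == 0x80000000),
// a CSINC opportunity that a 64-bit check would miss.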
8734 const uint32_t TrueVal32 = CTVal->getZExtValue();
8735 const uint32_t FalseVal32 = CFVal->getZExtValue();
8737 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
8738 Opcode = AArch64ISD::CSINC;
8740 if (TrueVal32 > FalseVal32) {
8741 Swap = true;
8744 } else {
8745 // 64-bit check whether we can use CSINC.
8746 const uint64_t TrueVal64 = TrueVal;
8747 const uint64_t FalseVal64 = FalseVal;
8749 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
8750 Opcode = AArch64ISD::CSINC;
8752 if (TrueVal > FalseVal) {
8753 Swap = true;
8758 // Swap TVal and FVal if necessary.
8759 if (Swap) {
8760 std::swap(TVal, FVal);
8761 std::swap(CTVal, CFVal);
8762 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8765 if (Opcode != AArch64ISD::CSEL) {
8766 // Drop FVal since we can get its value by simply inverting/negating
8767 // TVal.
8768 FVal = TVal;
8772 // Avoid materializing a constant when possible by reusing a known value in
8773 // a register. However, don't perform this optimization if the known value
8774 // is one, zero or negative one in the case of a CSEL. We can always
8775 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
8776 // FVal, respectively.
8777 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
8778 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
8779 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
8780 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8781 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
8782 // "a != C ? x : a" to avoid materializing C.
8783 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
8784 TVal = LHS;
8785 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
8786 FVal = LHS;
8787 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
8788 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
8789 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
8790 // avoid materializing C.
8791 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8792 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
8793 Opcode = AArch64ISD::CSINV;
8794 TVal = LHS;
8795 FVal = DAG.getConstant(0, dl, FVal.getValueType());
8799 SDValue CCVal;
8800 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8801 EVT VT = TVal.getValueType();
8802 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
8805 // Now we know we're dealing with FP values.
8806 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8807 LHS.getValueType() == MVT::f64);
8808 assert(LHS.getValueType() == RHS.getValueType());
8809 EVT VT = TVal.getValueType();
8810 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8812 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8813 // clean. Some of them require two CSELs to implement.
8814 AArch64CC::CondCode CC1, CC2;
8815 changeFPCCToAArch64CC(CC, CC1, CC2);
8817 if (DAG.getTarget().Options.UnsafeFPMath) {
8818 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
8819 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
8820 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
8821 if (RHSVal && RHSVal->isZero()) {
8822 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
8823 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
8825 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
8826 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
8827 TVal = LHS;
8828 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
8829 CFVal && CFVal->isZero() &&
8830 FVal.getValueType() == LHS.getValueType())
8831 FVal = LHS;
8835 // Emit first, and possibly only, CSEL.
8836 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8837 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8839 // If we need a second CSEL, emit it, using the output of the first as the
8840 // RHS. We're effectively OR'ing the two CC's together.
8841 if (CC2 != AArch64CC::AL) {
8842 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8843 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8846 // Otherwise, return the output of the first CSEL.
8847 return CS1;
8850 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
8851 SelectionDAG &DAG) const {
8852 EVT Ty = Op.getValueType();
8853 auto Idx = Op.getConstantOperandAPInt(2);
8854 int64_t IdxVal = Idx.getSExtValue();
8855 assert(Ty.isScalableVector() &&
8856 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
8858 // We can use the splice instruction for certain index values where we are
8859 // able to efficiently generate the correct predicate. The index will be
8860 // inverted and used directly as the input to the ptrue instruction, i.e.
8861 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
8862 // splice predicate. However, we can only do this if we can guarantee that
8863 // there are enough elements in the vector, hence we check the index <= min
8864 // number of elements.
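// For example (illustrative), for Idx == -2 we emit PTRUE with pattern VL2,
// reverse it so that only the last two lanes are active, and the SPLICE then
// produces the last two elements of Op0 followed by the leading elements of
// Op1.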
8865 Optional<unsigned> PredPattern;
8866 if (Ty.isScalableVector() && IdxVal < 0 &&
8867 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
8868 None) {
8869 SDLoc DL(Op);
8871 // Create a predicate where all but the last -IdxVal elements are false.
8872 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
8873 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
8874 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
8876 // Now splice the two inputs together using the predicate.
8877 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
8878 Op.getOperand(1));
8881 // This will select to an EXT instruction, which has a maximum immediate
8882 // value of 255, hence 2048 bits is the maximum value we can lower.
8883 if (IdxVal >= 0 &&
8884 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
8885 return Op;
8887 return SDValue();
8890 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
8891 SelectionDAG &DAG) const {
8892 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
8893 SDValue LHS = Op.getOperand(0);
8894 SDValue RHS = Op.getOperand(1);
8895 SDValue TVal = Op.getOperand(2);
8896 SDValue FVal = Op.getOperand(3);
8897 SDLoc DL(Op);
8898 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8901 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
8902 SelectionDAG &DAG) const {
8903 SDValue CCVal = Op->getOperand(0);
8904 SDValue TVal = Op->getOperand(1);
8905 SDValue FVal = Op->getOperand(2);
8906 SDLoc DL(Op);
8908 EVT Ty = Op.getValueType();
8909 if (Ty.isScalableVector()) {
8910 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
8911 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
8912 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
8913 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8916 if (useSVEForFixedLengthVectorVT(Ty)) {
8917 // FIXME: Ideally this would be the same as above using i1 types, however
8918 // for the moment we can't deal with fixed i1 vector types properly, so
8919 // instead extend the predicate to a result type sized integer vector.
8920 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
8921 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
8922 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
8923 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
8924 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8927 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
8928 // instruction.
8929 if (ISD::isOverflowIntrOpRes(CCVal)) {
8930 // Only lower legal XALUO ops.
8931 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
8932 return SDValue();
8934 AArch64CC::CondCode OFCC;
8935 SDValue Value, Overflow;
8936 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
8937 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
8939 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
8940 CCVal, Overflow);
8943 // Lower it the same way as we would lower a SELECT_CC node.
8944 ISD::CondCode CC;
8945 SDValue LHS, RHS;
8946 if (CCVal.getOpcode() == ISD::SETCC) {
8947 LHS = CCVal.getOperand(0);
8948 RHS = CCVal.getOperand(1);
8949 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
8950 } else {
8951 LHS = CCVal;
8952 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
8953 CC = ISD::SETNE;
8956 // If we are lowering an f16 and we do not have full fp16, convert to an f32
8957 // in order to use FCSELSrrr.
8958 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
8959 TVal = SDValue(
8960 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
8961 DAG.getUNDEF(MVT::f32), TVal,
8962 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8964 FVal = SDValue(
8965 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
8966 DAG.getUNDEF(MVT::f32), FVal,
8967 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8971 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8973 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
8974 Res = SDValue(
8975 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, Ty, Res,
8976 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
8980 return Res;
8983 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
8984 SelectionDAG &DAG) const {
8985 // Jump table entries are PC-relative offsets. No additional tweaking
8986 // is necessary here. Just get the address of the jump table.
8987 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
8989 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8990 !Subtarget->isTargetMachO()) {
8991 return getAddrLarge(JT, DAG);
8992 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8993 return getAddrTiny(JT, DAG);
8995 return getAddr(JT, DAG);
8998 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
8999 SelectionDAG &DAG) const {
9000 // Jump table entries are PC-relative offsets. No additional tweaking
9001 // is necessary here. Just get the address of the jump table.
9002 SDLoc DL(Op);
9003 SDValue JT = Op.getOperand(1);
9004 SDValue Entry = Op.getOperand(2);
9005 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
9007 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9008 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
9010 SDNode *Dest =
9011 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
9012 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
9013 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
9014 SDValue(Dest, 0));
9017 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
9018 SelectionDAG &DAG) const {
9019 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
9021 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
9022 // Use the GOT for the large code model on iOS.
9023 if (Subtarget->isTargetMachO()) {
9024 return getGOT(CP, DAG);
9026 return getAddrLarge(CP, DAG);
9027 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9028 return getAddrTiny(CP, DAG);
9029 } else {
9030 return getAddr(CP, DAG);
9034 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
9035 SelectionDAG &DAG) const {
9036 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
9037 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9038 !Subtarget->isTargetMachO()) {
9039 return getAddrLarge(BA, DAG);
9040 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9041 return getAddrTiny(BA, DAG);
9043 return getAddr(BA, DAG);
9046 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
9047 SelectionDAG &DAG) const {
9048 AArch64FunctionInfo *FuncInfo =
9049 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9051 SDLoc DL(Op);
9052 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
9053 getPointerTy(DAG.getDataLayout()));
9054 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
9055 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9056 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9057 MachinePointerInfo(SV));
9060 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
9061 SelectionDAG &DAG) const {
9062 MachineFunction &MF = DAG.getMachineFunction();
9063 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9065 SDLoc DL(Op);
9066 SDValue FR;
9067 if (Subtarget->isWindowsArm64EC()) {
9068 // With the Arm64EC ABI, we compute the address of the varargs save area
9069 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
9070 // but calls from an entry thunk can pass in a different address.
9071 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9072 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
9073 uint64_t StackOffset;
9074 if (FuncInfo->getVarArgsGPRSize() > 0)
9075 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
9076 else
9077 StackOffset = FuncInfo->getVarArgsStackOffset();
9078 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
9079 DAG.getConstant(StackOffset, DL, MVT::i64));
9080 } else {
9081 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
9082 ? FuncInfo->getVarArgsGPRIndex()
9083 : FuncInfo->getVarArgsStackIndex(),
9084 getPointerTy(DAG.getDataLayout()));
9086 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9087 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
9088 MachinePointerInfo(SV));
9091 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
9092 SelectionDAG &DAG) const {
9093 // The layout of the va_list struct is specified in the AArch64 Procedure Call
9094 // Standard, section B.3.
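// Roughly, the structure being initialized below is:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };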
9095 MachineFunction &MF = DAG.getMachineFunction();
9096 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9097 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9098 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9099 auto PtrVT = getPointerTy(DAG.getDataLayout());
9100 SDLoc DL(Op);
9102 SDValue Chain = Op.getOperand(0);
9103 SDValue VAList = Op.getOperand(1);
9104 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9105 SmallVector<SDValue, 4> MemOps;
9107 // void *__stack at offset 0
9108 unsigned Offset = 0;
9109 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
9110 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
9111 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
9112 MachinePointerInfo(SV), Align(PtrSize)));
9114 // void *__gr_top at offset 8 (4 on ILP32)
9115 Offset += PtrSize;
9116 int GPRSize = FuncInfo->getVarArgsGPRSize();
9117 if (GPRSize > 0) {
9118 SDValue GRTop, GRTopAddr;
9120 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9121 DAG.getConstant(Offset, DL, PtrVT));
9123 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
9124 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
9125 DAG.getConstant(GPRSize, DL, PtrVT));
9126 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
9128 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
9129 MachinePointerInfo(SV, Offset),
9130 Align(PtrSize)));
9133 // void *__vr_top at offset 16 (8 on ILP32)
9134 Offset += PtrSize;
9135 int FPRSize = FuncInfo->getVarArgsFPRSize();
9136 if (FPRSize > 0) {
9137 SDValue VRTop, VRTopAddr;
9138 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9139 DAG.getConstant(Offset, DL, PtrVT));
9141 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
9142 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
9143 DAG.getConstant(FPRSize, DL, PtrVT));
9144 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
9146 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
9147 MachinePointerInfo(SV, Offset),
9148 Align(PtrSize)));
9151 // int __gr_offs at offset 24 (12 on ILP32)
9152 Offset += PtrSize;
9153 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9154 DAG.getConstant(Offset, DL, PtrVT));
9155 MemOps.push_back(
9156 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
9157 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9159 // int __vr_offs at offset 28 (16 on ILP32)
9160 Offset += 4;
9161 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9162 DAG.getConstant(Offset, DL, PtrVT));
9163 MemOps.push_back(
9164 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
9165 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
9167 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9170 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
9171 SelectionDAG &DAG) const {
9172 MachineFunction &MF = DAG.getMachineFunction();
9174 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
9175 return LowerWin64_VASTART(Op, DAG);
9176 else if (Subtarget->isTargetDarwin())
9177 return LowerDarwin_VASTART(Op, DAG);
9178 else
9179 return LowerAAPCS_VASTART(Op, DAG);
9182 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
9183 SelectionDAG &DAG) const {
9184 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
9185 // pointer.
9186 SDLoc DL(Op);
9187 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
9188 unsigned VaListSize =
9189 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
9190 ? PtrSize
9191 : Subtarget->isTargetILP32() ? 20 : 32;
9192 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
9193 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
9195 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
9196 DAG.getConstant(VaListSize, DL, MVT::i32),
9197 Align(PtrSize), false, false, false,
9198 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
9201 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
9202 assert(Subtarget->isTargetDarwin() &&
9203 "automatic va_arg instruction only works on Darwin");
9205 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
9206 EVT VT = Op.getValueType();
9207 SDLoc DL(Op);
9208 SDValue Chain = Op.getOperand(0);
9209 SDValue Addr = Op.getOperand(1);
9210 MaybeAlign Align(Op.getConstantOperandVal(3));
9211 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
9212 auto PtrVT = getPointerTy(DAG.getDataLayout());
9213 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9214 SDValue VAList =
9215 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
9216 Chain = VAList.getValue(1);
9217 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
9219 if (VT.isScalableVector())
9220 report_fatal_error("Passing SVE types to variadic functions is "
9221 "currently not supported");
9223 if (Align && *Align > MinSlotSize) {
9224 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9225 DAG.getConstant(Align->value() - 1, DL, PtrVT));
9226 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
9227 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
9230 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
9231 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
9233 // Scalar integer and FP values smaller than 64 bits are implicitly extended
9234 // up to 64 bits. At the very least, we have to increase the striding of the
9235 // vaargs list to match this, and for FP values we need to introduce
9236 // FP_ROUND nodes as well.
9237 if (VT.isInteger() && !VT.isVector())
9238 ArgSize = std::max(ArgSize, MinSlotSize);
9239 bool NeedFPTrunc = false;
9240 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
9241 ArgSize = 8;
9242 NeedFPTrunc = true;
9245 // Increment the pointer, VAList, to the next vaarg
9246 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
9247 DAG.getConstant(ArgSize, DL, PtrVT));
9248 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
9250 // Store the incremented VAList to the legalized pointer
9251 SDValue APStore =
9252 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
9254 // Load the actual argument out of the pointer VAList
9255 if (NeedFPTrunc) {
9256 // Load the value as an f64.
9257 SDValue WideFP =
9258 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
9259 // Round the value down to an f32.
9260 SDValue NarrowFP =
9261 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
9262 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
9263 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
9264 // Merge the rounded value with the chain output of the load.
9265 return DAG.getMergeValues(Ops, DL);
9268 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
9271 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
9272 SelectionDAG &DAG) const {
9273 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9274 MFI.setFrameAddressIsTaken(true);
9276 EVT VT = Op.getValueType();
9277 SDLoc DL(Op);
9278 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9279 SDValue FrameAddr =
9280 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
9281 while (Depth--)
9282 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
9283 MachinePointerInfo());
9285 if (Subtarget->isTargetILP32())
9286 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
9287 DAG.getValueType(VT));
9289 return FrameAddr;
9292 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
9293 SelectionDAG &DAG) const {
9294 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9296 EVT VT = getPointerTy(DAG.getDataLayout());
9297 SDLoc DL(Op);
9298 int FI = MFI.CreateFixedObject(4, 0, false);
9299 return DAG.getFrameIndex(FI, VT);
9302 #define GET_REGISTER_MATCHER
9303 #include "AArch64GenAsmMatcher.inc"
9305 // FIXME? Maybe this could be a TableGen attribute on some registers and
9306 // this table could be generated automatically from RegInfo.
9307 Register AArch64TargetLowering::
9308 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
9309 Register Reg = MatchRegisterName(RegName);
9310 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
9311 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
9312 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
9313 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
9314 Reg = 0;
9316 if (Reg)
9317 return Reg;
9318 report_fatal_error(Twine("Invalid register name \""
9319 + StringRef(RegName) + "\"."));
9322 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
9323 SelectionDAG &DAG) const {
9324 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
9326 EVT VT = Op.getValueType();
9327 SDLoc DL(Op);
9329 SDValue FrameAddr =
9330 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
9331 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9333 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
9336 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
9337 SelectionDAG &DAG) const {
9338 MachineFunction &MF = DAG.getMachineFunction();
9339 MachineFrameInfo &MFI = MF.getFrameInfo();
9340 MFI.setReturnAddressIsTaken(true);
9342 EVT VT = Op.getValueType();
9343 SDLoc DL(Op);
9344 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9345 SDValue ReturnAddress;
9346 if (Depth) {
9347 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
9348 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
9349 ReturnAddress = DAG.getLoad(
9350 VT, DL, DAG.getEntryNode(),
9351 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
9352 } else {
9353 // Return LR, which contains the return address. Mark it an implicit
9354 // live-in.
9355 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
9356 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
9359 // The XPACLRI instruction assembles to a hint-space instruction before
9360 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A
9361 // architecture. On Armv8.3-A and onwards XPACI is available, so use
9362 // that instead.
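// Illustratively, the pre-PAuth path below ends up emitting "hint #7" (the
// XPACLRI encoding) operating on LR, whereas the PAuth path emits an
// "xpaci" on the register holding the return address.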
9363 SDNode *St;
9364 if (Subtarget->hasPAuth()) {
9365 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
9366 } else {
9367 // XPACLRI operates on LR therefore we must move the operand accordingly.
9368 SDValue Chain =
9369 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
9370 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
9372 return SDValue(St, 0);
9375 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
9376 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
9377 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
9378 SelectionDAG &DAG) const {
9379 SDValue Lo, Hi;
9380 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
9381 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
9384 bool AArch64TargetLowering::isOffsetFoldingLegal(
9385 const GlobalAddressSDNode *GA) const {
9386 // Offsets are folded in the DAG combine rather than here so that we can
9387 // intelligently choose an offset based on the uses.
9388 return false;
9391 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
9392 bool OptForSize) const {
9393 bool IsLegal = false;
9394 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
9395 // and for the 16-bit case when the target has full fp16 support.
9396 // FIXME: We should be able to handle f128 as well with a clever lowering.
9397 const APInt ImmInt = Imm.bitcastToAPInt();
9398 if (VT == MVT::f64)
9399 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
9400 else if (VT == MVT::f32)
9401 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
9402 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
9403 IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
9404 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
9405 // generate that fmov.
9407 // If we cannot materialize the value in the fmov immediate field, check if
9408 // it can be encoded as the immediate operand of a logical instruction.
9409 // The immediate value will be created with either MOVZ, MOVN, or ORR.
9410 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
9411 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
9412 // however the mov+fmov sequence is always better because of the reduced
9413 // cache pressure. The timings are still the same if you consider
9414 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
9415 // movw+movk is fused). So we limit the sequence to at most 2 instructions.
9416 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
9417 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
9418 Insn);
9419 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
9420 IsLegal = Insn.size() <= Limit;
9423 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
9424 << " imm value: "; Imm.dump(););
9425 return IsLegal;
9428 //===----------------------------------------------------------------------===//
9429 // AArch64 Optimization Hooks
9430 //===----------------------------------------------------------------------===//
9432 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
9433 SDValue Operand, SelectionDAG &DAG,
9434 int &ExtraSteps) {
9435 EVT VT = Operand.getValueType();
9436 if ((ST->hasNEON() &&
9437 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
9438 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
9439 VT == MVT::v4f32)) ||
9440 (ST->hasSVE() &&
9441 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
9442 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
9443 // For the reciprocal estimates, convergence is quadratic, so the number
9444 // of digits is doubled after each iteration. In ARMv8, the accuracy of
9445 // the initial estimate is 2^-8. Thus the number of extra steps to refine
9446 // the result for float (23 mantissa bits) is 2 and for double (52
9447 // mantissa bits) is 3.
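// e.g. starting from ~8 correct bits: 8 -> 16 -> 32 covers float's 23
// mantissa bits after 2 steps, and 8 -> 16 -> 32 -> 64 covers double's 52
// mantissa bits after 3 steps.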
9448 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
9450 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
9453 return SDValue();
9456 SDValue
9457 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
9458 const DenormalMode &Mode) const {
9459 SDLoc DL(Op);
9460 EVT VT = Op.getValueType();
9461 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
9462 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
9463 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
9466 SDValue
9467 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
9468 SelectionDAG &DAG) const {
9469 return Op;
9472 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
9473 SelectionDAG &DAG, int Enabled,
9474 int &ExtraSteps,
9475 bool &UseOneConst,
9476 bool Reciprocal) const {
9477 if (Enabled == ReciprocalEstimate::Enabled ||
9478 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
9479 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
9480 DAG, ExtraSteps)) {
9481 SDLoc DL(Operand);
9482 EVT VT = Operand.getValueType();
9484 SDNodeFlags Flags;
9485 Flags.setAllowReassociation(true);
9487 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
9488 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
9489 for (int i = ExtraSteps; i > 0; --i) {
9490 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
9491 Flags);
9492 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
9493 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9495 if (!Reciprocal)
9496 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
9498 ExtraSteps = 0;
9499 return Estimate;
9502 return SDValue();
9505 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
9506 SelectionDAG &DAG, int Enabled,
9507 int &ExtraSteps) const {
9508 if (Enabled == ReciprocalEstimate::Enabled)
9509 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
9510 DAG, ExtraSteps)) {
9511 SDLoc DL(Operand);
9512 EVT VT = Operand.getValueType();
9514 SDNodeFlags Flags;
9515 Flags.setAllowReassociation(true);
9517 // Newton reciprocal iteration: E * (2 - X * E)
9518 // AArch64 reciprocal iteration instruction: (2 - M * N)
9519 for (int i = ExtraSteps; i > 0; --i) {
9520 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
9521 Estimate, Flags);
9522 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
9525 ExtraSteps = 0;
9526 return Estimate;
9529 return SDValue();
9532 //===----------------------------------------------------------------------===//
9533 // AArch64 Inline Assembly Support
9534 //===----------------------------------------------------------------------===//
9536 // Table of Constraints
9537 // TODO: This is the current set of constraints supported by ARM for the
9538 // compiler; not all of them may make sense.
9540 // r - A general register
9541 // w - An FP/SIMD register of some size in the range v0-v31
9542 // x - An FP/SIMD register of some size in the range v0-v15
9543 // I - Constant that can be used with an ADD instruction
9544 // J - Constant that can be used with a SUB instruction
9545 // K - Constant that can be used with a 32-bit logical instruction
9546 // L - Constant that can be used with a 64-bit logical instruction
9547 // M - Constant that can be used as a 32-bit MOV immediate
9548 // N - Constant that can be used as a 64-bit MOV immediate
9549 // Q - A memory reference with base register and no offset
9550 // S - A symbolic address
9551 // Y - Floating point constant zero
9552 // Z - Integer constant zero
9554 // Note that general register operands will be output using their 64-bit x
9555 // register name, whatever the size of the variable, unless the asm operand
9556 // is prefixed by the %w modifier. Floating-point and SIMD register operands
9557 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
9558 // %q modifier.
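// A few illustrative uses of these constraints and modifiers (variable and
// register names are arbitrary):
//   asm("add %0, %1, %2"     : "=r"(res64) : "r"(a), "I"(4095)); // x registers
//   asm("add %w0, %w1, %2"   : "=r"(res32) : "r"(b), "I"(1));    // w registers
//   asm("fadd %s0, %s1, %s2" : "=w"(fres)  : "w"(x), "w"(y));    // s registers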
9559 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
9560 // At this point, we have to lower this constraint to something else, so we
9561 // lower it to an "r" or "w". However, by doing this we will force the result
9562 // to be in a register, while the X constraint is much more permissive.
9564 // Although we are correct (we are free to emit anything, without
9565 // constraints), we might break use cases that would expect us to be more
9566 // efficient and emit something else.
9567 if (!Subtarget->hasFPARMv8())
9568 return "r";
9570 if (ConstraintVT.isFloatingPoint())
9571 return "w";
9573 if (ConstraintVT.isVector() &&
9574 (ConstraintVT.getSizeInBits() == 64 ||
9575 ConstraintVT.getSizeInBits() == 128))
9576 return "w";
9578 return "r";
9581 enum PredicateConstraint {
9582 Upl,
9583 Upa,
9584 Invalid
9587 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
9588 PredicateConstraint P = PredicateConstraint::Invalid;
9589 if (Constraint == "Upa")
9590 P = PredicateConstraint::Upa;
9591 if (Constraint == "Upl")
9592 P = PredicateConstraint::Upl;
9593 return P;
9596 /// getConstraintType - Given a constraint letter, return the type of
9597 /// constraint it is for this target.
9598 AArch64TargetLowering::ConstraintType
9599 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
9600 if (Constraint.size() == 1) {
9601 switch (Constraint[0]) {
9602 default:
9603 break;
9604 case 'x':
9605 case 'w':
9606 case 'y':
9607 return C_RegisterClass;
9608 // An address with a single base register. Due to the way we
9609 // currently handle addresses it is the same as 'r'.
9610 case 'Q':
9611 return C_Memory;
9612 case 'I':
9613 case 'J':
9614 case 'K':
9615 case 'L':
9616 case 'M':
9617 case 'N':
9618 case 'Y':
9619 case 'Z':
9620 return C_Immediate;
9621 case 'z':
9622 case 'S': // A symbolic address
9623 return C_Other;
9625 } else if (parsePredicateConstraint(Constraint) !=
9626 PredicateConstraint::Invalid)
9627 return C_RegisterClass;
9628 return TargetLowering::getConstraintType(Constraint);
9631 /// Examine constraint type and operand type and determine a weight value.
9632 /// This object must already have been set up with the operand type
9633 /// and the current alternative constraint selected.
9634 TargetLowering::ConstraintWeight
9635 AArch64TargetLowering::getSingleConstraintMatchWeight(
9636 AsmOperandInfo &info, const char *constraint) const {
9637 ConstraintWeight weight = CW_Invalid;
9638 Value *CallOperandVal = info.CallOperandVal;
9639 // If we don't have a value, we can't do a match,
9640 // but allow it at the lowest weight.
9641 if (!CallOperandVal)
9642 return CW_Default;
9643 Type *type = CallOperandVal->getType();
9644 // Look at the constraint type.
9645 switch (*constraint) {
9646 default:
9647 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
9648 break;
9649 case 'x':
9650 case 'w':
9651 case 'y':
9652 if (type->isFloatingPointTy() || type->isVectorTy())
9653 weight = CW_Register;
9654 break;
9655 case 'z':
9656 weight = CW_Constant;
9657 break;
9658 case 'U':
9659 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
9660 weight = CW_Register;
9661 break;
9663 return weight;
9666 std::pair<unsigned, const TargetRegisterClass *>
9667 AArch64TargetLowering::getRegForInlineAsmConstraint(
9668 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
9669 if (Constraint.size() == 1) {
9670 switch (Constraint[0]) {
9671 case 'r':
9672 if (VT.isScalableVector())
9673 return std::make_pair(0U, nullptr);
9674 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
9675 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
9676 if (VT.getFixedSizeInBits() == 64)
9677 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
9678 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
9679 case 'w': {
9680 if (!Subtarget->hasFPARMv8())
9681 break;
9682 if (VT.isScalableVector()) {
9683 if (VT.getVectorElementType() != MVT::i1)
9684 return std::make_pair(0U, &AArch64::ZPRRegClass);
9685 return std::make_pair(0U, nullptr);
9687 uint64_t VTSize = VT.getFixedSizeInBits();
9688 if (VTSize == 16)
9689 return std::make_pair(0U, &AArch64::FPR16RegClass);
9690 if (VTSize == 32)
9691 return std::make_pair(0U, &AArch64::FPR32RegClass);
9692 if (VTSize == 64)
9693 return std::make_pair(0U, &AArch64::FPR64RegClass);
9694 if (VTSize == 128)
9695 return std::make_pair(0U, &AArch64::FPR128RegClass);
9696 break;
9698 // The instructions that this constraint is designed for can
9699 // only take 128-bit registers so just use that regclass.
9700 case 'x':
9701 if (!Subtarget->hasFPARMv8())
9702 break;
9703 if (VT.isScalableVector())
9704 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
9705 if (VT.getSizeInBits() == 128)
9706 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
9707 break;
9708 case 'y':
9709 if (!Subtarget->hasFPARMv8())
9710 break;
9711 if (VT.isScalableVector())
9712 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
9713 break;
9715 } else {
9716 PredicateConstraint PC = parsePredicateConstraint(Constraint);
9717 if (PC != PredicateConstraint::Invalid) {
9718 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
9719 return std::make_pair(0U, nullptr);
9720 bool restricted = (PC == PredicateConstraint::Upl);
9721 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
9722 : std::make_pair(0U, &AArch64::PPRRegClass);
9725 if (StringRef("{cc}").equals_insensitive(Constraint))
9726 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
9728 // Use the default implementation in TargetLowering to convert the register
9729 // constraint into a member of a register class.
9730 std::pair<unsigned, const TargetRegisterClass *> Res;
9731 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9733 // Not found as a standard register?
9734 if (!Res.second) {
9735 unsigned Size = Constraint.size();
9736 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
9737 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
9738 int RegNo;
9739 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
9740 if (!Failed && RegNo >= 0 && RegNo <= 31) {
9741 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
9742 // By default we'll emit v0-v31 for this unless there's a modifier where
9743 // we'll emit the correct register as well.
9744 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
9745 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
9746 Res.second = &AArch64::FPR64RegClass;
9747 } else {
9748 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
9749 Res.second = &AArch64::FPR128RegClass;
9755 if (Res.second && !Subtarget->hasFPARMv8() &&
9756 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
9757 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
9758 return std::make_pair(0U, nullptr);
9760 return Res;
9763 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
9764 llvm::Type *Ty,
9765 bool AllowUnknown) const {
9766 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
9767 return EVT(MVT::i64x8);
9769 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
9772 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9773 /// vector. If it is invalid, don't add anything to Ops.
9774 void AArch64TargetLowering::LowerAsmOperandForConstraint(
9775 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
9776 SelectionDAG &DAG) const {
9777 SDValue Result;
9779 // Currently only support length 1 constraints.
9780 if (Constraint.length() != 1)
9781 return;
9783 char ConstraintLetter = Constraint[0];
9784 switch (ConstraintLetter) {
9785 default:
9786 break;
9788 // This set of constraints deals with valid constants for various instructions.
9789 // Validate and return a target constant for them if we can.
9790 case 'z': {
9791 // 'z' maps to xzr or wzr so it needs an input of 0.
9792 if (!isNullConstant(Op))
9793 return;
9795 if (Op.getValueType() == MVT::i64)
9796 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
9797 else
9798 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
9799 break;
9801 case 'S': {
9802 // An absolute symbolic address or label reference.
9803 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
9804 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
9805 GA->getValueType(0));
9806 } else if (const BlockAddressSDNode *BA =
9807 dyn_cast<BlockAddressSDNode>(Op)) {
9808 Result =
9809 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
9810 } else
9811 return;
9812 break;
9815 case 'I':
9816 case 'J':
9817 case 'K':
9818 case 'L':
9819 case 'M':
9820 case 'N':
9821 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
9822 if (!C)
9823 return;
9825 // Grab the value and do some validation.
9826 uint64_t CVal = C->getZExtValue();
9827 switch (ConstraintLetter) {
9828 // The I constraint applies only to simple ADD or SUB immediate operands:
9829 // i.e. 0 to 4095 with optional shift by 12
9830 // The J constraint applies only to ADD or SUB immediates that would be
9831 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
9832 // instruction [or vice versa], in other words -1 to -4095 with optional
9833 // left shift by 12.
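// For example, "add x0, x1, #4095" and "add x0, x1, #15, lsl #12" both use
// I-legal immediates, while an addition of -3 is matched by J and can be
// emitted as "sub x0, x1, #3".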
9834 case 'I':
9835 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
9836 break;
9837 return;
9838 case 'J': {
9839 uint64_t NVal = -C->getSExtValue();
9840 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
9841 CVal = C->getSExtValue();
9842 break;
9844 return;
9846 // The K and L constraints apply *only* to logical immediates, including
9847 // what used to be the MOVI alias for ORR (though the MOVI alias has now
9848 // been removed and MOV should be used). So these constraints have to
9849 // distinguish between bit patterns that are valid 32-bit or 64-bit
9850 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
9851 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
9852 // versa.
9853 case 'K':
9854 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9855 break;
9856 return;
9857 case 'L':
9858 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9859 break;
9860 return;
9861 // The M and N constraints are a superset of K and L respectively, for use
9862 // with the MOV (immediate) alias. As well as the logical immediates they
9863 // also match 32 or 64-bit immediates that can be loaded either using a
9864 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
9865 // (M) or 64-bit 0x1234000000000000 (N) etc.
9866 // As a note, some of this code is liberally stolen from the asm parser.
9867 case 'M': {
9868 if (!isUInt<32>(CVal))
9869 return;
9870 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9871 break;
9872 if ((CVal & 0xFFFF) == CVal)
9873 break;
9874 if ((CVal & 0xFFFF0000ULL) == CVal)
9875 break;
9876 uint64_t NCVal = ~(uint32_t)CVal;
9877 if ((NCVal & 0xFFFFULL) == NCVal)
9878 break;
9879 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9880 break;
9881 return;
9883 case 'N': {
9884 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9885 break;
9886 if ((CVal & 0xFFFFULL) == CVal)
9887 break;
9888 if ((CVal & 0xFFFF0000ULL) == CVal)
9889 break;
9890 if ((CVal & 0xFFFF00000000ULL) == CVal)
9891 break;
9892 if ((CVal & 0xFFFF000000000000ULL) == CVal)
9893 break;
9894 uint64_t NCVal = ~CVal;
9895 if ((NCVal & 0xFFFFULL) == NCVal)
9896 break;
9897 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9898 break;
9899 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
9900 break;
9901 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
9902 break;
9903 return;
9905 default:
9906 return;
9909 // All assembler immediates are 64-bit integers.
9910 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
9911 break;
9914 if (Result.getNode()) {
9915 Ops.push_back(Result);
9916 return;
9919 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
9922 //===----------------------------------------------------------------------===//
9923 // AArch64 Advanced SIMD Support
9924 //===----------------------------------------------------------------------===//
9926 /// WidenVector - Given a value in the V64 register class, produce the
9927 /// equivalent value in the V128 register class.
9928 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
9929 EVT VT = V64Reg.getValueType();
9930 unsigned NarrowSize = VT.getVectorNumElements();
9931 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9932 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
9933 SDLoc DL(V64Reg);
9935 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
9936 V64Reg, DAG.getConstant(0, DL, MVT::i64));
9939 /// getExtFactor - Determine the adjustment factor for the position when
9940 /// generating an "extract from vector registers" instruction.
9941 static unsigned getExtFactor(SDValue &V) {
9942 EVT EltType = V.getValueType().getVectorElementType();
9943 return EltType.getSizeInBits() / 8;
9946 /// NarrowVector - Given a value in the V128 register class, produce the
9947 /// equivalent value in the V64 register class.
9948 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
9949 EVT VT = V128Reg.getValueType();
9950 unsigned WideSize = VT.getVectorNumElements();
9951 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9952 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
9953 SDLoc DL(V128Reg);
9955 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
9958 // Gather data to see if the operation can be modelled as a
9959 // shuffle in combination with VEXTs.
9960 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
9961 SelectionDAG &DAG) const {
9962 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9963 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
9964 SDLoc dl(Op);
9965 EVT VT = Op.getValueType();
9966 assert(!VT.isScalableVector() &&
9967 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
9968 unsigned NumElts = VT.getVectorNumElements();
9970 struct ShuffleSourceInfo {
9971 SDValue Vec;
9972 unsigned MinElt;
9973 unsigned MaxElt;
9975 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
9976 // be compatible with the shuffle we intend to construct. As a result
9977 // ShuffleVec will be some sliding window into the original Vec.
9978 SDValue ShuffleVec;
9980 // Code should guarantee that element i in Vec starts at element "WindowBase
9981 // + i * WindowScale" in ShuffleVec.
9982 int WindowBase;
9983 int WindowScale;
9985 ShuffleSourceInfo(SDValue Vec)
9986 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
9987 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
9989 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
9992 // First gather all vectors used as an immediate source for this BUILD_VECTOR
9993 // node.
9994 SmallVector<ShuffleSourceInfo, 2> Sources;
9995 for (unsigned i = 0; i < NumElts; ++i) {
9996 SDValue V = Op.getOperand(i);
9997 if (V.isUndef())
9998 continue;
9999 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10000 !isa<ConstantSDNode>(V.getOperand(1)) ||
10001 V.getOperand(0).getValueType().isScalableVector()) {
10002 LLVM_DEBUG(
10003 dbgs() << "Reshuffle failed: "
10004 "a shuffle can only come from building a vector from "
10005 "various elements of other fixed-width vectors, provided "
10006 "their indices are constant\n");
10007 return SDValue();
10010 // Add this element source to the list if it's not already there.
10011 SDValue SourceVec = V.getOperand(0);
10012 auto Source = find(Sources, SourceVec);
10013 if (Source == Sources.end())
10014 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
10016 // Update the minimum and maximum lane number seen.
10017 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
10018 Source->MinElt = std::min(Source->MinElt, EltNo);
10019 Source->MaxElt = std::max(Source->MaxElt, EltNo);
10022 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
10023 // better than moving to/from gpr registers for larger vectors.
10024 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
10025 // Construct a mask for the tbl. We may need to adjust the index for types
10026 // larger than i8.
10027 SmallVector<unsigned, 16> Mask;
10028 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
10029 for (unsigned I = 0; I < NumElts; ++I) {
10030 SDValue V = Op.getOperand(I);
10031 if (V.isUndef()) {
10032 for (unsigned OF = 0; OF < OutputFactor; OF++)
10033 Mask.push_back(-1);
10034 continue;
10036 // Set the Mask lanes adjusted for the size of the input and output
10037 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
10038 // output element, adjusted in their positions per input and output types.
10039 unsigned Lane = V.getConstantOperandVal(1);
10040 for (unsigned S = 0; S < Sources.size(); S++) {
10041 if (V.getOperand(0) == Sources[S].Vec) {
10042 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
10043 unsigned InputBase = 16 * S + Lane * InputSize / 8;
10044 for (unsigned OF = 0; OF < OutputFactor; OF++)
10045 Mask.push_back(InputBase + OF);
10046 break;
10051 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
10052 // v16i8, and the TBLMask
10053 SmallVector<SDValue, 16> TBLOperands;
10054 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
10055 ? Intrinsic::aarch64_neon_tbl3
10056 : Intrinsic::aarch64_neon_tbl4,
10057 dl, MVT::i32));
10058 for (unsigned i = 0; i < Sources.size(); i++) {
10059 SDValue Src = Sources[i].Vec;
10060 EVT SrcVT = Src.getValueType();
10061 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
10062 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
10063 "Expected a legally typed vector");
10064 if (SrcVT.is64BitVector())
10065 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
10066 DAG.getUNDEF(MVT::v8i8));
10067 TBLOperands.push_back(Src);
10070 SmallVector<SDValue, 16> TBLMask;
10071 for (unsigned i = 0; i < Mask.size(); i++)
10072 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
10073 assert((Mask.size() == 8 || Mask.size() == 16) &&
10074 "Expected a v8i8 or v16i8 Mask");
10075 TBLOperands.push_back(
10076 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
10078 SDValue Shuffle =
10079 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
10080 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
10081 return DAG.getBitcast(VT, Shuffle);
10084 if (Sources.size() > 2) {
10085 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
10086 << "sensible when at most two source vectors are "
10087 << "involved\n");
10088 return SDValue();
10091 // Find out the smallest element size among result and two sources, and use
10092 // it as element size to build the shuffle_vector.
10093 EVT SmallestEltTy = VT.getVectorElementType();
10094 for (auto &Source : Sources) {
10095 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
10096 if (SrcEltTy.bitsLT(SmallestEltTy)) {
10097 SmallestEltTy = SrcEltTy;
10100 unsigned ResMultiplier =
10101 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10102 uint64_t VTSize = VT.getFixedSizeInBits();
10103 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
10104 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
10106 // If the source vector is too wide or too narrow, we may nevertheless be able
10107 // to construct a compatible shuffle either by concatenating it with UNDEF or
10108 // extracting a suitable range of elements.
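// For example (illustrative): a v2i32 source feeding a v4i32 result is padded
// with UNDEF via CONCAT_VECTORS, while a v8i16 source feeding a v4i16 result
// is narrowed to one of its halves (or to an EXT of the two halves) below.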
10109 for (auto &Src : Sources) {
10110 EVT SrcVT = Src.ShuffleVec.getValueType();
10112 TypeSize SrcVTSize = SrcVT.getSizeInBits();
10113 if (SrcVTSize == TypeSize::Fixed(VTSize))
10114 continue;
10116 // This stage of the search produces a source with the same element type as
10117 // the original, but with a total width matching the BUILD_VECTOR output.
10118 EVT EltVT = SrcVT.getVectorElementType();
10119 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
10120 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
10122 if (SrcVTSize.getFixedValue() < VTSize) {
10123 assert(2 * SrcVTSize == VTSize);
10124 // We can pad out the smaller vector for free, so if it's part of a
10125 // shuffle...
10126 Src.ShuffleVec =
10127 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
10128 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
10129 continue;
10132 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
10133 LLVM_DEBUG(
10134 dbgs() << "Reshuffle failed: result vector too small to extract\n");
10135 return SDValue();
10138 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
10139 LLVM_DEBUG(
10140 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
10141 return SDValue();
10144 if (Src.MinElt >= NumSrcElts) {
10145 // The extraction can just take the second half
10146 Src.ShuffleVec =
10147 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10148 DAG.getConstant(NumSrcElts, dl, MVT::i64));
10149 Src.WindowBase = -NumSrcElts;
10150 } else if (Src.MaxElt < NumSrcElts) {
10151 // The extraction can just take the first half
10152 Src.ShuffleVec =
10153 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10154 DAG.getConstant(0, dl, MVT::i64));
10155 } else {
10156 // An actual VEXT is needed
10157 SDValue VEXTSrc1 =
10158 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10159 DAG.getConstant(0, dl, MVT::i64));
10160 SDValue VEXTSrc2 =
10161 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
10162 DAG.getConstant(NumSrcElts, dl, MVT::i64));
10163 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
10165 if (!SrcVT.is64BitVector()) {
10166 LLVM_DEBUG(
10167 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
10168 "for SVE vectors.");
10169 return SDValue();
10172 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
10173 VEXTSrc2,
10174 DAG.getConstant(Imm, dl, MVT::i32));
10175 Src.WindowBase = -Src.MinElt;
10179 // Another possible incompatibility occurs from the vector element types. We
10180 // can fix this by bitcasting the source vectors to the same type we intend
10181 // for the shuffle.
10182 for (auto &Src : Sources) {
10183 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
10184 if (SrcEltTy == SmallestEltTy)
10185 continue;
10186 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
10187 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
10188 Src.WindowScale =
10189 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
10190 Src.WindowBase *= Src.WindowScale;
10193 // Final check before we try to actually produce a shuffle.
10194 LLVM_DEBUG(for (auto Src
10195 : Sources)
10196 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
10198 // The stars all align, our next step is to produce the mask for the shuffle.
10199 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
10200 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
10201 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
10202 SDValue Entry = Op.getOperand(i);
10203 if (Entry.isUndef())
10204 continue;
10206 auto Src = find(Sources, Entry.getOperand(0));
10207 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
10209 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
10210 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
10211 // segment.
10212 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
10213 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
10214 VT.getScalarSizeInBits());
10215 int LanesDefined = BitsDefined / BitsPerShuffleLane;
10217 // This source is expected to fill ResMultiplier lanes of the final shuffle,
10218 // starting at the appropriate offset.
10219 int *LaneMask = &Mask[i * ResMultiplier];
10221 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
10222 ExtractBase += NumElts * (Src - Sources.begin());
10223 for (int j = 0; j < LanesDefined; ++j)
10224 LaneMask[j] = ExtractBase + j;
10227 // Final check before we try to produce nonsense...
10228 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
10229 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
10230 return SDValue();
10233 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
10234 for (unsigned i = 0; i < Sources.size(); ++i)
10235 ShuffleOps[i] = Sources[i].ShuffleVec;
10237 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
10238 ShuffleOps[1], Mask);
10239 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
10241 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
10242 dbgs() << "Reshuffle, creating node: "; V.dump(););
10244 return V;
10247 // Check if an EXT instruction can handle the shuffle mask when the
10248 // vector sources of the shuffle are the same.
10249 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
10250 unsigned NumElts = VT.getVectorNumElements();
10252 // Assume that the first shuffle index is not UNDEF. Fail if it is.
10253 if (M[0] < 0)
10254 return false;
10256 Imm = M[0];
10258 // If this is a VEXT shuffle, the immediate value is the index of the first
10259 // element. The other shuffle indices must be the successive elements after
10260 // the first one.
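// For example (illustrative): on a v8i8 single-source shuffle, the mask
// <6, 7, 0, 1, 2, 3, 4, 5> yields Imm == 6, i.e. "ext v, v, #6" in bytes.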
10261 unsigned ExpectedElt = Imm;
10262 for (unsigned i = 1; i < NumElts; ++i) {
10263 // Increment the expected index. If it wraps around, just follow it
10264 // back to index zero and keep going.
10265 ++ExpectedElt;
10266 if (ExpectedElt == NumElts)
10267 ExpectedElt = 0;
10269 if (M[i] < 0)
10270 continue; // ignore UNDEF indices
10271 if (ExpectedElt != static_cast<unsigned>(M[i]))
10272 return false;
10275 return true;
10278 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
10279 // v4i32s. This is really a truncate, which we can construct out of (legal)
10280 // concats and truncate nodes.
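// An illustrative IR shape this matches (assuming %a..%d are v4i32 or v4i16):
//   buildvector (extractelement %a, 0) ... (extractelement %a, 3),
//               (extractelement %b, 0) ... (extractelement %d, 3) : v16i8
// i.e. each input contributes four consecutive bytes of the result.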
10281 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
10282 if (V.getValueType() != MVT::v16i8)
10283 return SDValue();
10284 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
10286 for (unsigned X = 0; X < 4; X++) {
10287 // Check the first item in each group is an extract from lane 0 of a v4i32
10288 // or v4i16.
10289 SDValue BaseExt = V.getOperand(X * 4);
10290 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10291 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
10292 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
10293 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
10294 BaseExt.getConstantOperandVal(1) != 0)
10295 return SDValue();
10296 SDValue Base = BaseExt.getOperand(0);
10297 // And check the other items are extracts from the same vector.
10298 for (unsigned Y = 1; Y < 4; Y++) {
10299 SDValue Ext = V.getOperand(X * 4 + Y);
10300 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10301 Ext.getOperand(0) != Base ||
10302 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
10303 Ext.getConstantOperandVal(1) != Y)
10304 return SDValue();
10308 // Turn the buildvector into a series of truncates and concats, which will
10309 // become uzp1's. Any v4i32s we found get truncated to v4i16, which are
10310 // concatenated together to produce 2 v8i16. These are both truncated and
10311 // concatenated together.
10312 SDLoc DL(V);
10313 SDValue Trunc[4] = {
10314 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
10315 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
10316 for (SDValue &V : Trunc)
10317 if (V.getValueType() == MVT::v4i32)
10318 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
10319 SDValue Concat0 =
10320 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
10321 SDValue Concat1 =
10322 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
10323 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
10324 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
10325 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
10328 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
10329 /// element width than the vector lane type. If that is the case, the function
10330 /// returns true and writes the value of the DUP instruction lane operand into
10331 /// DupLaneOp.
10332 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
10333 unsigned &DupLaneOp) {
10334 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10335 "Only possible block sizes for wide DUP are: 16, 32, 64");
10337 if (BlockSize <= VT.getScalarSizeInBits())
10338 return false;
10339 if (BlockSize % VT.getScalarSizeInBits() != 0)
10340 return false;
10341 if (VT.getSizeInBits() % BlockSize != 0)
10342 return false;
10344 size_t SingleVecNumElements = VT.getVectorNumElements();
10345 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
10346 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
10348 // We are looking for masks like
10349 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
10350 // might be replaced by 'undefined'. BlockElts will eventually contain the
10351 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
10352 // for the above examples).
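// For example (illustrative): for v4i32 with BlockSize == 64, the mask
// [2, 3, 2, 3] is accepted with DupLaneOp == 1, so the caller can emit a
// 64-bit DUPLANE of lane 1.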
10353 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
10354 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
10355 for (size_t I = 0; I < NumEltsPerBlock; I++) {
10356 int Elt = M[BlockIndex * NumEltsPerBlock + I];
10357 if (Elt < 0)
10358 continue;
10359 // For now we don't support shuffles that use the second operand
10360 if ((unsigned)Elt >= SingleVecNumElements)
10361 return false;
10362 if (BlockElts[I] < 0)
10363 BlockElts[I] = Elt;
10364 else if (BlockElts[I] != Elt)
10365 return false;
10368 // We found a candidate block (possibly with some undefs). It must be a
10369 // sequence of consecutive integers starting with a value divisible by
10370 // NumEltsPerBlock, with some values possibly replaced by undefs.
10372 // Find first non-undef element
10373 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
10374 assert(FirstRealEltIter != BlockElts.end() &&
10375 "Shuffle with all-undefs must have been caught by previous cases, "
10376 "e.g. isSplat()");
10377 if (FirstRealEltIter == BlockElts.end()) {
10378 DupLaneOp = 0;
10379 return true;
10382 // Index of FirstRealElt in BlockElts
10383 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
10385 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
10386 return false;
10387 // BlockElts[0] must have the following value if it isn't undef:
10388 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
10390 // Check the first element
10391 if (Elt0 % NumEltsPerBlock != 0)
10392 return false;
10393 // Check that the sequence indeed consists of consecutive integers (modulo
10394 // undefs)
10395 for (size_t I = 0; I < NumEltsPerBlock; I++)
10396 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
10397 return false;
10399 DupLaneOp = Elt0 / NumEltsPerBlock;
10400 return true;
10403 // Check if an EXT instruction can handle the shuffle mask when the
10404 // vector sources of the shuffle are different.
10405 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
10406 unsigned &Imm) {
10407 // Look for the first non-undef element.
10408 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
10410 // Use APInt to handle overflow when calculating the expected element.
10411 unsigned NumElts = VT.getVectorNumElements();
10412 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
10413 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
10414 // The following shuffle indices must be the successive elements after the
10415 // first real element.
10416 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
10417 return Elt != ExpectedElt++ && Elt != -1;
10419 if (FoundWrongElt)
10420 return false;
10422 // The index of an EXT is the first element if it is not UNDEF.
10423 // Watch out for the beginning UNDEFs. The EXT index should be the expected
10424 // value of the first element. E.g.
10425 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
10426 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
10427 // ExpectedElt is the last mask index plus 1.
10428 Imm = ExpectedElt.getZExtValue();
10430 // There are two different cases that require reversing the input vectors.
10431 // For example, for vector <4 x i32> we have the following cases,
10432 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
10433 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
10434 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
10435 // to reverse two input vectors.
10436 if (Imm < NumElts)
10437 ReverseEXT = true;
10438 else
10439 Imm -= NumElts;
10441 return true;
10444 /// isREVMask - Check if a vector shuffle corresponds to a REV
10445 /// instruction with the specified blocksize. (The order of the elements
10446 /// within each block of the vector is reversed.)
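// For example (illustrative): on v8i8, BlockSize 16 matches the mask
// <1, 0, 3, 2, 5, 4, 7, 6> (REV16), and BlockSize 32 matches
// <3, 2, 1, 0, 7, 6, 5, 4> (REV32).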
10447 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
10448 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
10449 "Only possible block sizes for REV are: 16, 32, 64");
10451 unsigned EltSz = VT.getScalarSizeInBits();
10452 if (EltSz == 64)
10453 return false;
10455 unsigned NumElts = VT.getVectorNumElements();
10456 unsigned BlockElts = M[0] + 1;
10457 // If the first shuffle index is UNDEF, be optimistic.
10458 if (M[0] < 0)
10459 BlockElts = BlockSize / EltSz;
10461 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
10462 return false;
10464 for (unsigned i = 0; i < NumElts; ++i) {
10465 if (M[i] < 0)
10466 continue; // ignore UNDEF indices
10467 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
10468 return false;
10471 return true;
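// Check for a ZIP1/ZIP2 mask. For example (illustrative), on v4i32 the mask
// <0, 4, 1, 5> selects ZIP1 (WhichResult == 0) and <2, 6, 3, 7> selects ZIP2
// (WhichResult == 1); undef lanes are accepted anywhere.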
10474 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10475 unsigned NumElts = VT.getVectorNumElements();
10476 if (NumElts % 2 != 0)
10477 return false;
10478 WhichResult = (M[0] == 0 ? 0 : 1);
10479 unsigned Idx = WhichResult * NumElts / 2;
10480 for (unsigned i = 0; i != NumElts; i += 2) {
10481 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10482 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
10483 return false;
10484 Idx += 1;
10487 return true;
10490 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10491 unsigned NumElts = VT.getVectorNumElements();
10492 WhichResult = (M[0] == 0 ? 0 : 1);
10493 for (unsigned i = 0; i != NumElts; ++i) {
10494 if (M[i] < 0)
10495 continue; // ignore UNDEF indices
10496 if ((unsigned)M[i] != 2 * i + WhichResult)
10497 return false;
10500 return true;
10503 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10504 unsigned NumElts = VT.getVectorNumElements();
10505 if (NumElts % 2 != 0)
10506 return false;
10507 WhichResult = (M[0] == 0 ? 0 : 1);
10508 for (unsigned i = 0; i < NumElts; i += 2) {
10509 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10510 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
10511 return false;
10513 return true;
10516 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
10517 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10518 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
10519 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10520 unsigned NumElts = VT.getVectorNumElements();
10521 if (NumElts % 2 != 0)
10522 return false;
10523 WhichResult = (M[0] == 0 ? 0 : 1);
10524 unsigned Idx = WhichResult * NumElts / 2;
10525 for (unsigned i = 0; i != NumElts; i += 2) {
10526 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
10527 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
10528 return false;
10529 Idx += 1;
10532 return true;
10535 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
10536 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10537 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
10538 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10539 unsigned Half = VT.getVectorNumElements() / 2;
10540 WhichResult = (M[0] == 0 ? 0 : 1);
10541 for (unsigned j = 0; j != 2; ++j) {
10542 unsigned Idx = WhichResult;
10543 for (unsigned i = 0; i != Half; ++i) {
10544 int MIdx = M[i + j * Half];
10545 if (MIdx >= 0 && (unsigned)MIdx != Idx)
10546 return false;
10547 Idx += 2;
10551 return true;
10554 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
10555 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
10556 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
10557 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
10558 unsigned NumElts = VT.getVectorNumElements();
10559 if (NumElts % 2 != 0)
10560 return false;
10561 WhichResult = (M[0] == 0 ? 0 : 1);
10562 for (unsigned i = 0; i < NumElts; i += 2) {
10563 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10564 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
10565 return false;
10567 return true;
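// Check for an INS-style mask: an identity copy of one input with exactly one
// lane (the "Anomaly") replaced. For example (illustrative), on 4 elements the
// mask <0, 1, 6, 3> gives DstIsLeft == true and Anomaly == 2, i.e. insert
// element 2 of the RHS into lane 2 of the LHS.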
10570 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
10571 bool &DstIsLeft, int &Anomaly) {
10572 if (M.size() != static_cast<size_t>(NumInputElements))
10573 return false;
10575 int NumLHSMatch = 0, NumRHSMatch = 0;
10576 int LastLHSMismatch = -1, LastRHSMismatch = -1;
10578 for (int i = 0; i < NumInputElements; ++i) {
10579 if (M[i] == -1) {
10580 ++NumLHSMatch;
10581 ++NumRHSMatch;
10582 continue;
10585 if (M[i] == i)
10586 ++NumLHSMatch;
10587 else
10588 LastLHSMismatch = i;
10590 if (M[i] == i + NumInputElements)
10591 ++NumRHSMatch;
10592 else
10593 LastRHSMismatch = i;
10596 if (NumLHSMatch == NumInputElements - 1) {
10597 DstIsLeft = true;
10598 Anomaly = LastLHSMismatch;
10599 return true;
10600 } else if (NumRHSMatch == NumInputElements - 1) {
10601 DstIsLeft = false;
10602 Anomaly = LastRHSMismatch;
10603 return true;
10606 return false;
10609 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
10610 if (VT.getSizeInBits() != 128)
10611 return false;
10613 unsigned NumElts = VT.getVectorNumElements();
10615 for (int I = 0, E = NumElts / 2; I != E; I++) {
10616 if (Mask[I] != I)
10617 return false;
10620 int Offset = NumElts / 2;
10621 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
10622 if (Mask[I] != I + SplitLHS * Offset)
10623 return false;
10626 return true;
10629 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
10630 SDLoc DL(Op);
10631 EVT VT = Op.getValueType();
10632 SDValue V0 = Op.getOperand(0);
10633 SDValue V1 = Op.getOperand(1);
10634 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10636 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
10637 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
10638 return SDValue();
10640 bool SplitV0 = V0.getValueSizeInBits() == 128;
10642 if (!isConcatMask(Mask, VT, SplitV0))
10643 return SDValue();
10645 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
10646 if (SplitV0) {
10647 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
10648 DAG.getConstant(0, DL, MVT::i64));
10650 if (V1.getValueSizeInBits() == 128) {
10651 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
10652 DAG.getConstant(0, DL, MVT::i64));
10654 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
10657 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10658 /// the specified operations to build the shuffle. ID is the perfect-shuffle
10659 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
10660 /// table entry and LHS/RHS are the immediate inputs for this stage of the
10661 /// shuffle.
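// Each perfect-shuffle ID packs four result lanes as base-9 digits (most
// significant digit first, 8 meaning undef). For example (illustrative),
// ID 102 == (1*9+2)*9+3 encodes <0, 1, 2, 3> (copy LHS) and ID 3382 encodes
// <4, 5, 6, 7> (copy RHS), which is what the OP_COPY handling below checks.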
10662 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
10663 SDValue V2, unsigned PFEntry, SDValue LHS,
10664 SDValue RHS, SelectionDAG &DAG,
10665 const SDLoc &dl) {
10666 unsigned OpNum = (PFEntry >> 26) & 0x0F;
10667 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
10668 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
10670 enum {
10671 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
10672 OP_VREV,
10673 OP_VDUP0,
10674 OP_VDUP1,
10675 OP_VDUP2,
10676 OP_VDUP3,
10677 OP_VEXT1,
10678 OP_VEXT2,
10679 OP_VEXT3,
10680 OP_VUZPL, // VUZP, left result
10681 OP_VUZPR, // VUZP, right result
10682 OP_VZIPL, // VZIP, left result
10683 OP_VZIPR, // VZIP, right result
10684 OP_VTRNL, // VTRN, left result
10685 OP_VTRNR, // VTRN, right result
10686 OP_MOVLANE // Move lane. RHSID is the lane to move into
10689 if (OpNum == OP_COPY) {
10690 if (LHSID == (1 * 9 + 2) * 9 + 3)
10691 return LHS;
10692 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
10693 return RHS;
10696 if (OpNum == OP_MOVLANE) {
10697 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
10698 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
10699 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
10700 Elt = 3 - Elt;
10701 while (Elt > 0) {
10702 ID /= 9;
10703 Elt--;
10705 return (ID % 9 == 8) ? -1 : ID % 9;
10708 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
10709 // lane to move from is taken from the PFID, which is always from the
10710 // original vectors (V1 or V2).
10711 SDValue OpLHS = GeneratePerfectShuffle(
10712 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
10713 EVT VT = OpLHS.getValueType();
10714 assert(RHSID < 8 && "Expected a lane index for RHSID!");
10715 unsigned ExtLane = 0;
10716 SDValue Input;
10718 // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs. D
10719 // movs convert into a higher type.
10720 if (RHSID & 0x4) {
10721 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
10722 if (MaskElt == -1)
10723 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
10724 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10725 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
10726 Input = MaskElt < 2 ? V1 : V2;
10727 if (VT.getScalarSizeInBits() == 16) {
10728 Input = DAG.getBitcast(MVT::v2f32, Input);
10729 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
10730 } else {
10731 assert(VT.getScalarSizeInBits() == 32 &&
10732 "Expected 16 or 32 bit shuffle elemements");
10733 Input = DAG.getBitcast(MVT::v2f64, Input);
10734 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
10736 } else {
10737 int MaskElt = getPFIDLane(ID, RHSID);
10738 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10739 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
10740 Input = MaskElt < 4 ? V1 : V2;
10741 // Be careful about creating illegal types. Use f16 instead of i16.
10742 if (VT == MVT::v4i16) {
10743 Input = DAG.getBitcast(MVT::v4f16, Input);
10744 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
10747 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
10748 Input.getValueType().getVectorElementType(),
10749 Input, DAG.getVectorIdxConstant(ExtLane, dl));
10750 SDValue Ins =
10751 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
10752 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
10753 return DAG.getBitcast(VT, Ins);
10756 SDValue OpLHS, OpRHS;
10757 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
10758 RHS, DAG, dl);
10759 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
10760 RHS, DAG, dl);
10761 EVT VT = OpLHS.getValueType();
10763 switch (OpNum) {
10764 default:
10765 llvm_unreachable("Unknown shuffle opcode!");
10766 case OP_VREV:
10767 // VREV divides the vector in half and swaps within the half.
10768 if (VT.getVectorElementType() == MVT::i32 ||
10769 VT.getVectorElementType() == MVT::f32)
10770 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
10771 // vrev <4 x i16> -> REV32
10772 if (VT.getVectorElementType() == MVT::i16 ||
10773 VT.getVectorElementType() == MVT::f16 ||
10774 VT.getVectorElementType() == MVT::bf16)
10775 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
10776 // vrev <4 x i8> -> REV16
10777 assert(VT.getVectorElementType() == MVT::i8);
10778 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
10779 case OP_VDUP0:
10780 case OP_VDUP1:
10781 case OP_VDUP2:
10782 case OP_VDUP3: {
10783 EVT EltTy = VT.getVectorElementType();
10784 unsigned Opcode;
10785 if (EltTy == MVT::i8)
10786 Opcode = AArch64ISD::DUPLANE8;
10787 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
10788 Opcode = AArch64ISD::DUPLANE16;
10789 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
10790 Opcode = AArch64ISD::DUPLANE32;
10791 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
10792 Opcode = AArch64ISD::DUPLANE64;
10793 else
10794 llvm_unreachable("Invalid vector element type?");
10796 if (VT.getSizeInBits() == 64)
10797 OpLHS = WidenVector(OpLHS, DAG);
10798 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
10799 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
10801 case OP_VEXT1:
10802 case OP_VEXT2:
10803 case OP_VEXT3: {
10804 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
10805 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
10806 DAG.getConstant(Imm, dl, MVT::i32));
10808 case OP_VUZPL:
10809 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
10810 OpRHS);
10811 case OP_VUZPR:
10812 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
10813 OpRHS);
10814 case OP_VZIPL:
10815 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
10816 OpRHS);
10817 case OP_VZIPR:
10818 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
10819 OpRHS);
10820 case OP_VTRNL:
10821 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
10822 OpRHS);
10823 case OP_VTRNR:
10824 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
10825 OpRHS);
10829 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
10830 SelectionDAG &DAG) {
10831 // Check to see if we can use the TBL instruction.
10832 SDValue V1 = Op.getOperand(0);
10833 SDValue V2 = Op.getOperand(1);
10834 SDLoc DL(Op);
10836 EVT EltVT = Op.getValueType().getVectorElementType();
10837 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
10839 bool Swap = false;
10840 if (V1.isUndef() || isZerosVector(V1.getNode())) {
10841 std::swap(V1, V2);
10842 Swap = true;
10845 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
10846 // out of range values with 0s. We do need to make sure that any out-of-range
10847 // values are really out-of-range for a v16i8 vector.
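// For example (illustrative): a v8i8 shuffle with mask <0, 1, 2, 3, 8, 8, 8, 8>
// and a zero/undef V2 becomes a tbl1 with byte indices
// { 0, 1, 2, 3, 255, 255, 255, 255 }, so the out-of-range lanes read as zero.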
10848 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
10849 MVT IndexVT = MVT::v8i8;
10850 unsigned IndexLen = 8;
10851 if (Op.getValueSizeInBits() == 128) {
10852 IndexVT = MVT::v16i8;
10853 IndexLen = 16;
10856 SmallVector<SDValue, 8> TBLMask;
10857 for (int Val : ShuffleMask) {
10858 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
10859 unsigned Offset = Byte + Val * BytesPerElt;
10860 if (Swap)
10861 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
10862 if (IsUndefOrZero && Offset >= IndexLen)
10863 Offset = 255;
10864 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
10868 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
10869 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
10871 SDValue Shuffle;
10872 if (IsUndefOrZero) {
10873 if (IndexLen == 8)
10874 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
10875 Shuffle = DAG.getNode(
10876 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10877 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10878 DAG.getBuildVector(IndexVT, DL,
10879 makeArrayRef(TBLMask.data(), IndexLen)));
10880 } else {
10881 if (IndexLen == 8) {
10882 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
10883 Shuffle = DAG.getNode(
10884 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10885 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10886 DAG.getBuildVector(IndexVT, DL,
10887 makeArrayRef(TBLMask.data(), IndexLen)));
10888 } else {
10889 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
10890 // cannot currently represent the register constraints on the input
10891 // table registers.
10892 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
10893 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
10894 // IndexLen));
10895 Shuffle = DAG.getNode(
10896 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10897 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
10898 V2Cst, DAG.getBuildVector(IndexVT, DL,
10899 makeArrayRef(TBLMask.data(), IndexLen)));
10902 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
10905 static unsigned getDUPLANEOp(EVT EltType) {
10906 if (EltType == MVT::i8)
10907 return AArch64ISD::DUPLANE8;
10908 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
10909 return AArch64ISD::DUPLANE16;
10910 if (EltType == MVT::i32 || EltType == MVT::f32)
10911 return AArch64ISD::DUPLANE32;
10912 if (EltType == MVT::i64 || EltType == MVT::f64)
10913 return AArch64ISD::DUPLANE64;
10915 llvm_unreachable("Invalid vector element type?");
10918 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
10919 unsigned Opcode, SelectionDAG &DAG) {
10920 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
10921 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
10922 // Match: dup (bitcast (extract_subv X, C)), LaneC
10923 if (BitCast.getOpcode() != ISD::BITCAST ||
10924 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
10925 return false;
10927 // The extract index must align in the destination type. That may not
10928 // happen if the bitcast is from a narrow to a wide type.
10929 SDValue Extract = BitCast.getOperand(0);
10930 unsigned ExtIdx = Extract.getConstantOperandVal(1);
10931 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
10932 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
10933 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
10934 if (ExtIdxInBits % CastedEltBitWidth != 0)
10935 return false;
10937 // Can't handle cases where vector size is not 128-bit
10938 if (!Extract.getOperand(0).getValueType().is128BitVector())
10939 return false;
10941 // Update the lane value by offsetting with the scaled extract index.
10942 LaneC += ExtIdxInBits / CastedEltBitWidth;
10944 // Determine the casted vector type of the wide vector input.
10945 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
10946 // Examples:
10947 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
10948 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
10949 unsigned SrcVecNumElts =
10950 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
10951 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
10952 SrcVecNumElts);
10953 return true;
10955 MVT CastVT;
10956 if (getScaledOffsetDup(V, Lane, CastVT)) {
10957 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
10958 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10959 V.getOperand(0).getValueType().is128BitVector()) {
10960 // The lane is incremented by the index of the extract.
10961 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
10962 Lane += V.getConstantOperandVal(1);
10963 V = V.getOperand(0);
10964 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
10965 // The lane is decremented if we are splatting from the 2nd operand.
10966 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
10967 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
10968 Lane -= Idx * VT.getVectorNumElements() / 2;
10969 V = WidenVector(V.getOperand(Idx), DAG);
10970 } else if (VT.getSizeInBits() == 64) {
10971 // Widen the operand to 128-bit register with undef.
10972 V = WidenVector(V, DAG);
10974 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
10977 // Return true if we can get a new shuffle mask by checking the parameter mask
10978 // array to test whether every two adjacent mask values are consecutive and
10979 // start from an even number.
10980 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
10981 SmallVectorImpl<int> &NewMask) {
10982 unsigned NumElts = VT.getVectorNumElements();
10983 if (NumElts % 2 != 0)
10984 return false;
10986 NewMask.clear();
10987 for (unsigned i = 0; i < NumElts; i += 2) {
10988 int M0 = M[i];
10989 int M1 = M[i + 1];
10991 // If both elements are undef, new mask is undef too.
10992 if (M0 == -1 && M1 == -1) {
10993 NewMask.push_back(-1);
10994 continue;
10997 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
10998 NewMask.push_back(M1 / 2);
10999 continue;
11002 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
11003 NewMask.push_back(M0 / 2);
11004 continue;
11007 NewMask.clear();
11008 return false;
11011 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
11012 return true;
11015 // Try to widen the element type to get a new mask value for a better
11016 // permutation sequence, so that we can use NEON shuffle instructions, such as
11017 // ZIP1/2, UZP1/2, TRN1/2, REV, INS, etc.
11018 // For example:
11019 // shufflevector <4 x i32> %a, <4 x i32> %b,
11020 // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
11021 // is equivalent to:
11022 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
11023 // Finally, we can get:
11024 // mov v0.d[0], v1.d[1]
11025 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
11026 SDLoc DL(Op);
11027 EVT VT = Op.getValueType();
11028 EVT ScalarVT = VT.getVectorElementType();
11029 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
11030 SDValue V0 = Op.getOperand(0);
11031 SDValue V1 = Op.getOperand(1);
11032 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
11034 // When combining adjacent elements, like two i16's -> i32, two i32's -> i64,
11035 // we need to make sure the wider element type is legal. Thus, ElementSize
11036 // should not be larger than 32 bits, and the i1 type should also be excluded.
11037 if (ElementSize > 32 || ElementSize == 1)
11038 return SDValue();
11040 SmallVector<int, 8> NewMask;
11041 if (isWideTypeMask(Mask, VT, NewMask)) {
11042 MVT NewEltVT = VT.isFloatingPoint()
11043 ? MVT::getFloatingPointVT(ElementSize * 2)
11044 : MVT::getIntegerVT(ElementSize * 2);
11045 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
11046 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
11047 V0 = DAG.getBitcast(NewVT, V0);
11048 V1 = DAG.getBitcast(NewVT, V1);
11049 return DAG.getBitcast(VT,
11050 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
11054 return SDValue();
11057 // Try to fold shuffle (tbl2, tbl2) into a single tbl4.
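// Roughly: shuffle(tbl2(A, B, M1), tbl2(C, D, M2), mask) becomes
// tbl4(A, B, C, D, M), where result lanes taken from the first tbl2 copy the
// corresponding byte of M1 and lanes taken from the second use M2's byte
// plus 32, since C and D are the third and fourth tbl4 table registers.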
11058 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
11059 ArrayRef<int> ShuffleMask,
11060 SelectionDAG &DAG) {
11061 SDValue Tbl1 = Op->getOperand(0);
11062 SDValue Tbl2 = Op->getOperand(1);
11063 SDLoc dl(Op);
11064 SDValue Tbl2ID =
11065 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
11067 EVT VT = Op.getValueType();
11068 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11069 Tbl1->getOperand(0) != Tbl2ID ||
11070 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
11071 Tbl2->getOperand(0) != Tbl2ID)
11072 return SDValue();
11074 if (Tbl1->getValueType(0) != MVT::v16i8 ||
11075 Tbl2->getValueType(0) != MVT::v16i8)
11076 return SDValue();
11078 SDValue Mask1 = Tbl1->getOperand(3);
11079 SDValue Mask2 = Tbl2->getOperand(3);
11080 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
11081 for (unsigned I = 0; I < 16; I++) {
11082 if (ShuffleMask[I] < 16)
11083 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
11084 else {
11085 auto *C =
11086 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
11087 if (!C)
11088 return SDValue();
11089 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
11093 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
11094 SDValue ID =
11095 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
11097 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
11098 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
11099 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
11102 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
11103 SelectionDAG &DAG) const {
11104 SDLoc dl(Op);
11105 EVT VT = Op.getValueType();
11107 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
11109 if (useSVEForFixedLengthVectorVT(VT,
11110 Subtarget->forceStreamingCompatibleSVE()))
11111 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
11113 // Convert shuffles that are directly supported on NEON to target-specific
11114 // DAG nodes, instead of keeping them as shuffles and matching them again
11115 // during code selection. This is more efficient and avoids the possibility
11116 // of inconsistencies between legalization and selection.
11117 ArrayRef<int> ShuffleMask = SVN->getMask();
11119 SDValue V1 = Op.getOperand(0);
11120 SDValue V2 = Op.getOperand(1);
11122 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
11123 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
11124 "Unexpected VECTOR_SHUFFLE mask size!");
11126 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
11127 return Res;
11129 if (SVN->isSplat()) {
11130 int Lane = SVN->getSplatIndex();
11131 // If this is an undef splat, generate it via "just" vdup, if possible.
11132 if (Lane == -1)
11133 Lane = 0;
11135 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
11136 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
11137 V1.getOperand(0));
11138 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
11139 // constant. If so, we can just reference the lane's definition directly.
11140 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
11141 !isa<ConstantSDNode>(V1.getOperand(Lane)))
11142 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
11144 // Otherwise, duplicate from the lane of the input vector.
11145 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
11146 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
11149 // Check if the mask matches a DUP for a wider element
11150 for (unsigned LaneSize : {64U, 32U, 16U}) {
11151 unsigned Lane = 0;
11152 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
11153 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
11154 : LaneSize == 32 ? AArch64ISD::DUPLANE32
11155 : AArch64ISD::DUPLANE16;
11156 // Cast V1 to an integer vector with the required lane size.
11157 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
11158 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
11159 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
11160 V1 = DAG.getBitcast(NewVecTy, V1);
11161 // Construct the DUP instruction.
11162 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
11163 // Cast back to the original type
11164 return DAG.getBitcast(VT, V1);
11168 if (isREVMask(ShuffleMask, VT, 64))
11169 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
11170 if (isREVMask(ShuffleMask, VT, 32))
11171 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
11172 if (isREVMask(ShuffleMask, VT, 16))
11173 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
11175 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
11176 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
11177 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
11178 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
11179 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
11180 DAG.getConstant(8, dl, MVT::i32));
11183 bool ReverseEXT = false;
11184 unsigned Imm;
11185 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
11186 if (ReverseEXT)
11187 std::swap(V1, V2);
11188 Imm *= getExtFactor(V1);
11189 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
11190 DAG.getConstant(Imm, dl, MVT::i32));
11191 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
11192 Imm *= getExtFactor(V1);
11193 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
11194 DAG.getConstant(Imm, dl, MVT::i32));
11197 unsigned WhichResult;
11198 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
11199 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11200 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11202 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
11203 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11204 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11206 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
11207 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11208 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
11211 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11212 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
11213 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11215 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11216 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
11217 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11219 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
11220 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
11221 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
11224 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
11225 return Concat;
11227 bool DstIsLeft;
11228 int Anomaly;
11229 int NumInputElements = V1.getValueType().getVectorNumElements();
11230 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
11231 SDValue DstVec = DstIsLeft ? V1 : V2;
11232 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
11234 SDValue SrcVec = V1;
11235 int SrcLane = ShuffleMask[Anomaly];
11236 if (SrcLane >= NumInputElements) {
11237 SrcVec = V2;
11238 SrcLane -= VT.getVectorNumElements();
11240 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
11242 EVT ScalarVT = VT.getVectorElementType();
11244 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
11245 ScalarVT = MVT::i32;
11247 return DAG.getNode(
11248 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
11249 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
11250 DstLaneV);
11253 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
11254 return NewSD;
11256 // If the shuffle is not directly supported and it has 4 elements, use
11257 // the PerfectShuffle-generated table to synthesize it from other shuffles.
11258 unsigned NumElts = VT.getVectorNumElements();
11259 if (NumElts == 4) {
11260 unsigned PFIndexes[4];
11261 for (unsigned i = 0; i != 4; ++i) {
11262 if (ShuffleMask[i] < 0)
11263 PFIndexes[i] = 8;
11264 else
11265 PFIndexes[i] = ShuffleMask[i];
11268 // Compute the index in the perfect shuffle table.
11269 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
11270 PFIndexes[2] * 9 + PFIndexes[3];
11271 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
11272 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
11273 dl);
11276 return GenerateTBL(Op, ShuffleMask, DAG);
11279 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
11280 SelectionDAG &DAG) const {
11281 EVT VT = Op.getValueType();
11283 if (useSVEForFixedLengthVectorVT(VT))
11284 return LowerToScalableOp(Op, DAG);
11286 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
11287 "Unexpected vector type!");
11289 // We can handle the constant cases during isel.
11290 if (isa<ConstantSDNode>(Op.getOperand(0)))
11291 return Op;
11293 // There isn't a natural way to handle the general i1 case, so we use some
11294 // trickery with whilelo.
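// Roughly: the i1 is sign-extended to i64 (0 or -1) and used as the upper
// bound of an unsigned whilelo starting at 0, which yields an all-false
// predicate for 0 and an all-true predicate for -1 (treated as a huge
// unsigned value).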
11295 SDLoc DL(Op);
11296 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
11297 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
11298 DAG.getValueType(MVT::i1));
11299 SDValue ID =
11300 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
11301 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11302 if (VT == MVT::nxv1i1)
11303 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
11304 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
11305 Zero, SplatVal),
11306 Zero);
11307 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
11310 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
11311 SelectionDAG &DAG) const {
11312 SDLoc DL(Op);
11314 EVT VT = Op.getValueType();
11315 if (!isTypeLegal(VT) || !VT.isScalableVector())
11316 return SDValue();
11318 // Current lowering only supports the SVE-ACLE types.
11319 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
11320 return SDValue();
11322 // The DUPQ operation is independent of the element type, so normalise to i64s.
11323 SDValue Idx128 = Op.getOperand(2);
11325 // DUPQ can be used when idx is in range.
11326 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
11327 if (CIdx && (CIdx->getZExtValue() <= 3)) {
11328 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
11329 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
11332 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
11334 // The ACLE says this must produce the same result as:
11335 // svtbl(data, svadd_x(svptrue_b64(),
11336 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
11337 // index * 2))
11338 SDValue One = DAG.getConstant(1, DL, MVT::i64);
11339 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
11341 // create the vector 0,1,0,1,...
11342 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
11343 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
11345 // create the vector idx64,idx64+1,idx64,idx64+1,...
11346 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
11347 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
11348 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
11350 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
11351 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
11352 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
11356 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
11357 APInt &UndefBits) {
11358 EVT VT = BVN->getValueType(0);
11359 APInt SplatBits, SplatUndef;
11360 unsigned SplatBitSize;
11361 bool HasAnyUndefs;
11362 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11363 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
11365 for (unsigned i = 0; i < NumSplats; ++i) {
11366 CnstBits <<= SplatBitSize;
11367 UndefBits <<= SplatBitSize;
11368 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
11369 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
11372 return true;
11375 return false;
11378 // Try 64-bit splatted SIMD immediate.
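// For example (illustrative): a splat of 0x00FF00FF00FF00FF, where every byte
// is 0x00 or 0xFF, is representable as modified-immediate type 10 and becomes
// a single MOVI.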
11379 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11380 const APInt &Bits) {
11381 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11382 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11383 EVT VT = Op.getValueType();
11384 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
11386 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
11387 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
11389 SDLoc dl(Op);
11390 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11391 DAG.getConstant(Value, dl, MVT::i32));
11392 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11396 return SDValue();
11399 // Try 32-bit splatted SIMD immediate.
11400 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11401 const APInt &Bits,
11402 const SDValue *LHS = nullptr) {
11403 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11404 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11405 EVT VT = Op.getValueType();
11406 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11407 bool isAdvSIMDModImm = false;
11408 uint64_t Shift;
11410 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
11411 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
11412 Shift = 0;
11414 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
11415 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
11416 Shift = 8;
11418 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
11419 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
11420 Shift = 16;
11422 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
11423 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
11424 Shift = 24;
11427 if (isAdvSIMDModImm) {
11428 SDLoc dl(Op);
11429 SDValue Mov;
11431 if (LHS)
11432 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11433 DAG.getConstant(Value, dl, MVT::i32),
11434 DAG.getConstant(Shift, dl, MVT::i32));
11435 else
11436 Mov = DAG.getNode(NewOp, dl, MovTy,
11437 DAG.getConstant(Value, dl, MVT::i32),
11438 DAG.getConstant(Shift, dl, MVT::i32));
11440 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11444 return SDValue();
11447 // Try 16-bit splatted SIMD immediate.
11448 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11449 const APInt &Bits,
11450 const SDValue *LHS = nullptr) {
11451 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11452 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11453 EVT VT = Op.getValueType();
11454 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
11455 bool isAdvSIMDModImm = false;
11456 uint64_t Shift;
11458 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
11459 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
11460 Shift = 0;
11462 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
11463 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
11464 Shift = 8;
11467 if (isAdvSIMDModImm) {
11468 SDLoc dl(Op);
11469 SDValue Mov;
11471 if (LHS)
11472 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
11473 DAG.getConstant(Value, dl, MVT::i32),
11474 DAG.getConstant(Shift, dl, MVT::i32));
11475 else
11476 Mov = DAG.getNode(NewOp, dl, MovTy,
11477 DAG.getConstant(Value, dl, MVT::i32),
11478 DAG.getConstant(Shift, dl, MVT::i32));
11480 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11484 return SDValue();
11487 // Try 32-bit splatted SIMD immediate with shifted ones.
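// Note: the Shift values 264 and 272 used below appear to be the encoded forms
// of MSL #8 and MSL #16 (shift type MSL == 4, so (4 << 6) | 8 and
// (4 << 6) | 16), matching the "shifted ones" MOVI/MVNI variants.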
11488 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
11489 SelectionDAG &DAG, const APInt &Bits) {
11490 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11491 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11492 EVT VT = Op.getValueType();
11493 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
11494 bool isAdvSIMDModImm = false;
11495 uint64_t Shift;
11497 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
11498 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
11499 Shift = 264;
11501 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
11502 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
11503 Shift = 272;
11506 if (isAdvSIMDModImm) {
11507 SDLoc dl(Op);
11508 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11509 DAG.getConstant(Value, dl, MVT::i32),
11510 DAG.getConstant(Shift, dl, MVT::i32));
11511 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11515 return SDValue();
11518 // Try 8-bit splatted SIMD immediate.
11519 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11520 const APInt &Bits) {
11521 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11522 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11523 EVT VT = Op.getValueType();
11524 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
11526 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
11527 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
11529 SDLoc dl(Op);
11530 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11531 DAG.getConstant(Value, dl, MVT::i32));
11532 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11536 return SDValue();
11539 // Try FP splatted SIMD immediate.
11540 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
11541 const APInt &Bits) {
11542 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
11543 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
11544 EVT VT = Op.getValueType();
11545 bool isWide = (VT.getSizeInBits() == 128);
11546 MVT MovTy;
11547 bool isAdvSIMDModImm = false;
11549 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
11550 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
11551 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
11553 else if (isWide &&
11554 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
11555 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
11556 MovTy = MVT::v2f64;
11559 if (isAdvSIMDModImm) {
11560 SDLoc dl(Op);
11561 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
11562 DAG.getConstant(Value, dl, MVT::i32));
11563 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
11567 return SDValue();
11570 // Specialized code to quickly find if PotentialBVec is a BuildVector that
11571 // consists of only the same constant int value, returned in the reference arg
11572 // ConstVal.
11573 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
11574 uint64_t &ConstVal) {
11575 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
11576 if (!Bvec)
11577 return false;
11578 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
11579 if (!FirstElt)
11580 return false;
11581 EVT VT = Bvec->getValueType(0);
11582 unsigned NumElts = VT.getVectorNumElements();
11583 for (unsigned i = 1; i < NumElts; ++i)
11584 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
11585 return false;
11586 ConstVal = FirstElt->getZExtValue();
11587 return true;
11590 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
11591 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
11592 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
11593 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
11594 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
11595 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
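// For example (illustrative): with v4i16 elements,
//   (or (and X, splat(0x00ff)), (AArch64ISD::VSHL Y, 8))
// satisfies C1 == ~(0xffff << 8) and is rewritten to (AArch64ISD::VSLI X, Y, 8).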
11596 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
11597 EVT VT = N->getValueType(0);
11599 if (!VT.isVector())
11600 return SDValue();
11602 SDLoc DL(N);
11604 SDValue And;
11605 SDValue Shift;
11607 SDValue FirstOp = N->getOperand(0);
11608 unsigned FirstOpc = FirstOp.getOpcode();
11609 SDValue SecondOp = N->getOperand(1);
11610 unsigned SecondOpc = SecondOp.getOpcode();
11612 // Is one of the operands an AND or a BICi? The AND may have been optimised to
11613 // a BICi in order to use an immediate instead of a register.
11614 // Is the other operand a shl or lshr? This will have been turned into:
11615 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
11616 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
11617 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
11618 And = FirstOp;
11619 Shift = SecondOp;
11621 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
11622 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
11623 And = SecondOp;
11624 Shift = FirstOp;
11625 } else
11626 return SDValue();
11628 bool IsAnd = And.getOpcode() == ISD::AND;
11629 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
11631 // Is the shift amount constant?
11632 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
11633 if (!C2node)
11634 return SDValue();
11636 uint64_t C1;
11637 if (IsAnd) {
11638 // Is the and mask vector all constant?
11639 if (!isAllConstantBuildVector(And.getOperand(1), C1))
11640 return SDValue();
11641 } else {
11642 // Reconstruct the corresponding AND immediate from the two BICi immediates.
11643 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
11644 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
11645 assert(C1nodeImm && C1nodeShift);
11646 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
11649 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
11650 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
11651 // how much one can shift elements of a particular size?
11652 uint64_t C2 = C2node->getZExtValue();
11653 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
11654 if (C2 > ElemSizeInBits)
11655 return SDValue();
11657 APInt C1AsAPInt(ElemSizeInBits, C1);
11658 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
11659 : APInt::getLowBitsSet(ElemSizeInBits, C2);
11660 if (C1AsAPInt != RequiredC1)
11661 return SDValue();
11663 SDValue X = And.getOperand(0);
11664 SDValue Y = Shift.getOperand(0);
11666 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
11667 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
11669 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
11670 LLVM_DEBUG(N->dump(&DAG));
11671 LLVM_DEBUG(dbgs() << "into: \n");
11672 LLVM_DEBUG(ResultSLI->dump(&DAG));
11674 ++NumShiftInserts;
11675 return ResultSLI;
11678 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
11679 SelectionDAG &DAG) const {
11680 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
11681 return LowerToScalableOp(Op, DAG);
11683 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
11684 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
11685 return Res;
11687 EVT VT = Op.getValueType();
11689 SDValue LHS = Op.getOperand(0);
11690 BuildVectorSDNode *BVN =
11691 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
11692 if (!BVN) {
11693 // OR commutes, so try swapping the operands.
11694 LHS = Op.getOperand(1);
11695 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
11697 if (!BVN)
11698 return Op;
11700 APInt DefBits(VT.getSizeInBits(), 0);
11701 APInt UndefBits(VT.getSizeInBits(), 0);
11702 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11703 SDValue NewOp;
11705 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11706 DefBits, &LHS)) ||
11707 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11708 DefBits, &LHS)))
11709 return NewOp;
11711 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11712 UndefBits, &LHS)) ||
11713 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11714 UndefBits, &LHS)))
11715 return NewOp;
11718 // We can always fall back to a non-immediate OR.
11719 return Op;
11722 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
11723 // be truncated to fit element width.
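// For example, a v4i16 BUILD_VECTOR lane holding the i32 constant 0x12345 is
// rewritten as the i32 constant 0x2345, i.e. the value truncated to the
// 16-bit element width.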
11724 static SDValue NormalizeBuildVector(SDValue Op,
11725 SelectionDAG &DAG) {
11726 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11727 SDLoc dl(Op);
11728 EVT VT = Op.getValueType();
11729 EVT EltTy = VT.getVectorElementType();
11731 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
11732 return Op;
11734 SmallVector<SDValue, 16> Ops;
11735 for (SDValue Lane : Op->ops()) {
11736 // For integer vectors, type legalization would have promoted the
11737 // operands already. Otherwise, if Op is a floating-point splat
11738 // (with operands cast to integers), then the only possibilities
11739 // are constants and UNDEFs.
11740 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
11741 APInt LowBits(EltTy.getSizeInBits(),
11742 CstLane->getZExtValue());
11743 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
11744 } else if (Lane.getNode()->isUndef()) {
11745 Lane = DAG.getUNDEF(MVT::i32);
11746 } else {
11747 assert(Lane.getValueType() == MVT::i32 &&
11748 "Unexpected BUILD_VECTOR operand type");
11750 Ops.push_back(Lane);
11752 return DAG.getBuildVector(VT, dl, Ops);
11755 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
11756 EVT VT = Op.getValueType();
11758 APInt DefBits(VT.getSizeInBits(), 0);
11759 APInt UndefBits(VT.getSizeInBits(), 0);
11760 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11761 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11762 SDValue NewOp;
11763 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11764 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11765 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11766 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11767 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11768 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11769 return NewOp;
11771 DefBits = ~DefBits;
11772 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11773 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11774 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11775 return NewOp;
11777 DefBits = UndefBits;
11778 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11779 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11780 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11781 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11782 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11783 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11784 return NewOp;
11786 DefBits = ~UndefBits;
11787 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11788 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11789 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11790 return NewOp;
11793 return SDValue();
11796 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
11797 SelectionDAG &DAG) const {
11798 EVT VT = Op.getValueType();
11800 if (useSVEForFixedLengthVectorVT(VT,
11801 Subtarget->forceStreamingCompatibleSVE())) {
11802 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
11803 SDLoc DL(Op);
11804 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11805 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
11806 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
11807 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
11808 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
11811 // Revert to common legalisation for all other variants.
11812 return SDValue();
11815 // Try to build a simple constant vector.
11816 Op = NormalizeBuildVector(Op, DAG);
11817 if (VT.isInteger()) {
11818 // Certain vector constants, used to express things like logical NOT and
11819 // arithmetic NEG, are passed through unmodified. This allows special
11820 // patterns for these operations to match, which will lower these constants
11821 // to whatever is proven necessary.
11822 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11823 if (BVN->isConstant())
11824 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
11825 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
11826 APInt Val(BitSize,
11827 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
11828 if (Val.isZero() || Val.isAllOnes())
11829 return Op;
11833 if (SDValue V = ConstantBuildVector(Op, DAG))
11834 return V;
11836 // Scan through the operands to find some interesting properties we can
11837 // exploit:
11838 // 1) If only one value is used, we can use a DUP, or
11839 // 2) if only the low element is not undef, we can just insert that, or
11840 // 3) if only one constant value is used (w/ some non-constant lanes),
11841 // we can splat the constant value into the whole vector then fill
11842 // in the non-constant lanes.
11843 // 4) FIXME: If different constant values are used, but we can intelligently
11844 // select the values we'll be overwriting for the non-constant
11845 // lanes such that we can directly materialize the vector
11846 // some other way (MOVI, e.g.), we can be sneaky.
11847 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
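// For example, a splat (X, X, X, X) of a single non-constant value is case 1
// and is lowered to one DUP below, while (X, undef, undef, undef) is case 2
// and becomes a single SCALAR_TO_VECTOR node.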
11848 SDLoc dl(Op);
11849 unsigned NumElts = VT.getVectorNumElements();
11850 bool isOnlyLowElement = true;
11851 bool usesOnlyOneValue = true;
11852 bool usesOnlyOneConstantValue = true;
11853 bool isConstant = true;
11854 bool AllLanesExtractElt = true;
11855 unsigned NumConstantLanes = 0;
11856 unsigned NumDifferentLanes = 0;
11857 unsigned NumUndefLanes = 0;
11858 SDValue Value;
11859 SDValue ConstantValue;
11860 for (unsigned i = 0; i < NumElts; ++i) {
11861 SDValue V = Op.getOperand(i);
11862 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11863 AllLanesExtractElt = false;
11864 if (V.isUndef()) {
11865 ++NumUndefLanes;
11866 continue;
11868 if (i > 0)
11869 isOnlyLowElement = false;
11870 if (!isIntOrFPConstant(V))
11871 isConstant = false;
11873 if (isIntOrFPConstant(V)) {
11874 ++NumConstantLanes;
11875 if (!ConstantValue.getNode())
11876 ConstantValue = V;
11877 else if (ConstantValue != V)
11878 usesOnlyOneConstantValue = false;
11881 if (!Value.getNode())
11882 Value = V;
11883 else if (V != Value) {
11884 usesOnlyOneValue = false;
11885 ++NumDifferentLanes;
11889 if (!Value.getNode()) {
11890 LLVM_DEBUG(
11891 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
11892 return DAG.getUNDEF(VT);
11895 // Convert BUILD_VECTOR where all elements but the lowest are undef into
11896 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
11897 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
11898 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
11899 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
11900 "SCALAR_TO_VECTOR node\n");
11901 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
11904 if (AllLanesExtractElt) {
11905 SDNode *Vector = nullptr;
11906 bool Even = false;
11907 bool Odd = false;
11908 // Check whether the extract elements match the Even pattern <0,2,4,...> or
11909 // the Odd pattern <1,3,5,...>.
11910 for (unsigned i = 0; i < NumElts; ++i) {
11911 SDValue V = Op.getOperand(i);
11912 const SDNode *N = V.getNode();
11913 if (!isa<ConstantSDNode>(N->getOperand(1)))
11914 break;
11915 SDValue N0 = N->getOperand(0);
11917 // All elements are extracted from the same vector.
11918 if (!Vector) {
11919 Vector = N0.getNode();
11920 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
11921 // BUILD_VECTOR.
11922 if (VT.getVectorElementType() !=
11923 N0.getValueType().getVectorElementType())
11924 break;
11925 } else if (Vector != N0.getNode()) {
11926 Odd = false;
11927 Even = false;
11928 break;
11931 // Extracted values are either at Even indices <0,2,4,...> or at Odd
11932 // indices <1,3,5,...>.
11933 uint64_t Val = N->getConstantOperandVal(1);
11934 if (Val == 2 * i) {
11935 Even = true;
11936 continue;
11938 if (Val - 1 == 2 * i) {
11939 Odd = true;
11940 continue;
11943 // Something does not match: abort.
11944 Odd = false;
11945 Even = false;
11946 break;
11948 if (Even || Odd) {
11949 SDValue LHS =
11950 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11951 DAG.getConstant(0, dl, MVT::i64));
11952 SDValue RHS =
11953 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11954 DAG.getConstant(NumElts, dl, MVT::i64));
11956 if (Even && !Odd)
11957 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
11958 RHS);
11959 if (Odd && !Even)
11960 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
11961 RHS);
11965 // Use DUP for non-constant splats. For f32 constant splats, reduce to
11966 // i32 and try again.
11967 if (usesOnlyOneValue) {
11968 if (!isConstant) {
11969 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11970 Value.getValueType() != VT) {
11971 LLVM_DEBUG(
11972 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
11973 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
11976 // This is actually a DUPLANExx operation, which keeps everything vectory.
11978 SDValue Lane = Value.getOperand(1);
11979 Value = Value.getOperand(0);
11980 if (Value.getValueSizeInBits() == 64) {
11981 LLVM_DEBUG(
11982 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
11983 "widening it\n");
11984 Value = WidenVector(Value, DAG);
11987 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
11988 return DAG.getNode(Opcode, dl, VT, Value, Lane);
11991 if (VT.getVectorElementType().isFloatingPoint()) {
11992 SmallVector<SDValue, 8> Ops;
11993 EVT EltTy = VT.getVectorElementType();
11994 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
11995 EltTy == MVT::f64) && "Unsupported floating-point vector type");
11996 LLVM_DEBUG(
11997 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
11998 "BITCASTS, and try again\n");
11999 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
12000 for (unsigned i = 0; i < NumElts; ++i)
12001 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
12002 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
12003 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
12004 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
12005 Val.dump(););
12006 Val = LowerBUILD_VECTOR(Val, DAG);
12007 if (Val.getNode())
12008 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
12012 // If we need to insert a small number of different non-constant elements and
12013 // the vector width is sufficiently large, prefer using DUP with the common
12014 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
12015 // skip the constant lane handling below.
12016 bool PreferDUPAndInsert =
12017 !isConstant && NumDifferentLanes >= 1 &&
12018 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
12019 NumDifferentLanes >= NumConstantLanes;
12021 // If only one constant value was used, across more than one lane, start by
12022 // splatting that value, then replace the non-constant lanes. This is better
12023 // than the default, which will perform a separate initialization for each
12024 // lane.
12025 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
12026 // Firstly, try to materialize the splat constant.
12027 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
12028 Val = ConstantBuildVector(Vec, DAG);
12029 if (!Val) {
12030 // Otherwise, materialize the constant and splat it.
12031 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
12032 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
12035 // Now insert the non-constant lanes.
12036 for (unsigned i = 0; i < NumElts; ++i) {
12037 SDValue V = Op.getOperand(i);
12038 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12039 if (!isIntOrFPConstant(V))
12040 // Note that type legalization likely mucked about with the VT of the
12041 // source operand, so we may have to convert it here before inserting.
12042 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
12044 return Val;
12047 // This will generate a load from the constant pool.
12048 if (isConstant) {
12049 LLVM_DEBUG(
12050 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
12051 "expansion\n");
12052 return SDValue();
12055 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12056 // v4i32s. This is really a truncate, which we can construct out of (legal)
12057 // concats and truncate nodes.
12058 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
12059 return M;
12061 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
12062 if (NumElts >= 4) {
12063 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
12064 return shuffle;
12067 if (PreferDUPAndInsert) {
12068 // First, build a constant vector with the common element.
12069 SmallVector<SDValue, 8> Ops(NumElts, Value);
12070 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
12071 // Next, insert the elements that do not match the common value.
12072 for (unsigned I = 0; I < NumElts; ++I)
12073 if (Op.getOperand(I) != Value)
12074 NewVector =
12075 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
12076 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
12078 return NewVector;
12081 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
12082 // know the default expansion would otherwise fall back on something even
12083 // worse. For a vector with one or two non-undef values, that would be
12084 // scalar_to_vector for the elements followed by a shuffle (provided the
12085 // shuffle is valid for the target); for everything else, it would be
12086 // materialization element by element on the stack followed by a load.
12087 if (!isConstant && !usesOnlyOneValue) {
12088 LLVM_DEBUG(
12089 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
12090 "of INSERT_VECTOR_ELT\n");
12092 SDValue Vec = DAG.getUNDEF(VT);
12093 SDValue Op0 = Op.getOperand(0);
12094 unsigned i = 0;
12096 // Use SCALAR_TO_VECTOR for lane zero to
12097 // a) Avoid a RMW dependency on the full vector register, and
12098 // b) Allow the register coalescer to fold away the copy if the
12099 // value is already in an S or D register, and we're forced to emit an
12100 // INSERT_SUBREG that we can't fold anywhere.
12102 // We also allow types like i8 and i16 which are illegal scalar but legal
12103 // vector element types. After type-legalization the inserted value is
12104 // extended (i32) and it is safe to cast them to the vector type by ignoring
12105 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
12106 if (!Op0.isUndef()) {
12107 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
12108 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
12109 ++i;
12111 LLVM_DEBUG(if (i < NumElts) dbgs()
12112 << "Creating nodes for the other vector elements:\n";);
12113 for (; i < NumElts; ++i) {
12114 SDValue V = Op.getOperand(i);
12115 if (V.isUndef())
12116 continue;
12117 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
12118 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
12120 return Vec;
12123 LLVM_DEBUG(
12124 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
12125 "better alternative\n");
12126 return SDValue();
12129 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
12130 SelectionDAG &DAG) const {
12131 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
12132 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
12134 assert(Op.getValueType().isScalableVector() &&
12135 isTypeLegal(Op.getValueType()) &&
12136 "Expected legal scalable vector type!");
12138 if (isTypeLegal(Op.getOperand(0).getValueType())) {
12139 unsigned NumOperands = Op->getNumOperands();
12140 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
12141 "Unexpected number of operands in CONCAT_VECTORS");
12143 if (NumOperands == 2)
12144 return Op;
12146 // Concat each pair of subvectors and pack into the lower half of the array.
12147 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
12148 while (ConcatOps.size() > 1) {
12149 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
12150 SDValue V1 = ConcatOps[I];
12151 SDValue V2 = ConcatOps[I + 1];
12152 EVT SubVT = V1.getValueType();
12153 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
12154 ConcatOps[I / 2] =
12155 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
12157 ConcatOps.resize(ConcatOps.size() / 2);
12159 return ConcatOps[0];
12162 return SDValue();
12165 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12166 SelectionDAG &DAG) const {
12167 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
12169 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
12170 return LowerFixedLengthInsertVectorElt(Op, DAG);
12172 // Check for non-constant or out of range lane.
12173 EVT VT = Op.getOperand(0).getValueType();
12175 if (VT.getScalarType() == MVT::i1) {
12176 EVT VectorVT = getPromotedVTForPredicate(VT);
12177 SDLoc DL(Op);
12178 SDValue ExtendedVector =
12179 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
12180 SDValue ExtendedValue =
12181 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
12182 VectorVT.getScalarType().getSizeInBits() < 32
12183 ? MVT::i32
12184 : VectorVT.getScalarType());
12185 ExtendedVector =
12186 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
12187 ExtendedValue, Op.getOperand(2));
12188 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
12191 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
12192 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12193 return SDValue();
12195 // Insertion/extraction are legal for V128 types.
12196 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12197 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12198 VT == MVT::v8f16 || VT == MVT::v8bf16)
12199 return Op;
12201 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12202 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12203 VT != MVT::v4bf16)
12204 return SDValue();
12206 // For V64 types, we perform insertion by expanding the value
12207 // to a V128 type and performing the insertion on that.
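// For example, an insert into a v4i16 vector is widened to v8i16, the element
// is inserted there, and the result is then narrowed back to v4i16.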
12208 SDLoc DL(Op);
12209 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12210 EVT WideTy = WideVec.getValueType();
12212 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
12213 Op.getOperand(1), Op.getOperand(2));
12214 // Re-narrow the resultant vector.
12215 return NarrowVector(Node, DAG);
12218 SDValue
12219 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12220 SelectionDAG &DAG) const {
12221 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
12222 EVT VT = Op.getOperand(0).getValueType();
12224 if (VT.getScalarType() == MVT::i1) {
12225 // We can't directly extract from an SVE predicate; extend it first.
12226 // (This isn't the only possible lowering, but it's straightforward.)
12227 EVT VectorVT = getPromotedVTForPredicate(VT);
12228 SDLoc DL(Op);
12229 SDValue Extend =
12230 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
12231 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
12232 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
12233 Extend, Op.getOperand(1));
12234 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
12237 if (useSVEForFixedLengthVectorVT(VT))
12238 return LowerFixedLengthExtractVectorElt(Op, DAG);
12240 // Check for non-constant or out of range lane.
12241 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
12242 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
12243 return SDValue();
12245 // Insertion/extraction are legal for V128 types.
12246 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
12247 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
12248 VT == MVT::v8f16 || VT == MVT::v8bf16)
12249 return Op;
12251 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
12252 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
12253 VT != MVT::v4bf16)
12254 return SDValue();
12256 // For V64 types, we perform extraction by expanding the value
12257 // to a V128 type and performing the extraction on that.
12258 SDLoc DL(Op);
12259 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
12260 EVT WideTy = WideVec.getValueType();
12262 EVT ExtrTy = WideTy.getVectorElementType();
12263 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
12264 ExtrTy = MVT::i32;
12266 // For extractions, we just return the result directly.
12267 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
12268 Op.getOperand(1));
12271 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
12272 SelectionDAG &DAG) const {
12273 assert(Op.getValueType().isFixedLengthVector() &&
12274 "Only cases that extract a fixed length vector are supported!");
12276 EVT InVT = Op.getOperand(0).getValueType();
12277 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12278 unsigned Size = Op.getValueSizeInBits();
12280 // If we don't have legal types yet, do nothing
12281 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
12282 return SDValue();
12284 if (InVT.isScalableVector()) {
12285 // This will be matched by custom code during ISelDAGToDAG.
12286 if (Idx == 0 && isPackedVectorType(InVT, DAG))
12287 return Op;
12289 return SDValue();
12292 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
12293 if (Idx == 0 && InVT.getSizeInBits() <= 128)
12294 return Op;
12296 // If this is extracting the upper 64-bits of a 128-bit vector, we match
12297 // that directly.
12298 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
12299 InVT.getSizeInBits() == 128)
12300 return Op;
12302 if (useSVEForFixedLengthVectorVT(InVT)) {
12303 SDLoc DL(Op);
12305 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
12306 SDValue NewInVec =
12307 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
12309 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
12310 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
12311 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
12314 return SDValue();
12317 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
12318 SelectionDAG &DAG) const {
12319 assert(Op.getValueType().isScalableVector() &&
12320 "Only expect to lower inserts into scalable vectors!");
12322 EVT InVT = Op.getOperand(1).getValueType();
12323 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
12325 SDValue Vec0 = Op.getOperand(0);
12326 SDValue Vec1 = Op.getOperand(1);
12327 SDLoc DL(Op);
12328 EVT VT = Op.getValueType();
12330 if (InVT.isScalableVector()) {
12331 if (!isTypeLegal(VT))
12332 return SDValue();
12334 // Break down insert_subvector into simpler parts.
12335 if (VT.getVectorElementType() == MVT::i1) {
12336 unsigned NumElts = VT.getVectorMinNumElements();
12337 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12339 SDValue Lo, Hi;
12340 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12341 DAG.getVectorIdxConstant(0, DL));
12342 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
12343 DAG.getVectorIdxConstant(NumElts / 2, DL));
12344 if (Idx < (NumElts / 2)) {
12345 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
12346 DAG.getVectorIdxConstant(Idx, DL));
12347 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
12348 } else {
12349 SDValue NewHi =
12350 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
12351 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
12352 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
12356 // Ensure the subvector is half the size of the main vector.
12357 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
12358 return SDValue();
12360 // Here narrow and wide refer to the vector element types. After "casting",
12361 // both vectors must have the same bit length, so because the subvector has
12362 // fewer elements, those elements need to be bigger.
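// For example, when inserting an nxv2f32 subvector into an nxv4f32 vector,
// NarrowVT is nxv4i32 and WideVT is nxv2i64 (assuming getPackedSVEVectorVT
// yields the packed type with the requested element count).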
12363 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
12364 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
12366 // NOP cast operands to the largest legal vector of the same element count.
12367 if (VT.isFloatingPoint()) {
12368 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
12369 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
12370 } else {
12371 // Legal integer vectors are already their largest so Vec0 is fine as is.
12372 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
12375 // To replace the top/bottom half of vector V with vector SubV we widen the
12376 // preserved half of V, concatenate this to SubV (the order depending on the
12377 // half being replaced) and then narrow the result.
12378 SDValue Narrow;
12379 if (Idx == 0) {
12380 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
12381 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
12382 } else {
12383 assert(Idx == InVT.getVectorMinNumElements() &&
12384 "Invalid subvector index!");
12385 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
12386 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
12389 return getSVESafeBitCast(VT, Narrow, DAG);
12392 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
12393 // This will be matched by custom code during ISelDAGToDAG.
12394 if (Vec0.isUndef())
12395 return Op;
12397 Optional<unsigned> PredPattern =
12398 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
12399 auto PredTy = VT.changeVectorElementType(MVT::i1);
12400 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
12401 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
12402 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
12405 return SDValue();
12408 static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
12409 if (Op.getOpcode() != AArch64ISD::DUP &&
12410 Op.getOpcode() != ISD::SPLAT_VECTOR &&
12411 Op.getOpcode() != ISD::BUILD_VECTOR)
12412 return false;
12414 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
12415 !isAllConstantBuildVector(Op, SplatVal))
12416 return false;
12418 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
12419 !isa<ConstantSDNode>(Op->getOperand(0)))
12420 return false;
12422 SplatVal = Op->getConstantOperandVal(0);
12423 if (Op.getValueType().getVectorElementType() != MVT::i64)
12424 SplatVal = (int32_t)SplatVal;
12426 Negated = false;
12427 if (isPowerOf2_64(SplatVal))
12428 return true;
12430 Negated = true;
12431 if (isPowerOf2_64(-SplatVal)) {
12432 SplatVal = -SplatVal;
12433 return true;
12436 return false;
12439 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
12440 EVT VT = Op.getValueType();
12441 SDLoc dl(Op);
12443 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
12444 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
12446 assert(VT.isScalableVector() && "Expected a scalable vector.");
12448 bool Signed = Op.getOpcode() == ISD::SDIV;
12449 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
12451 bool Negated;
12452 uint64_t SplatVal;
12453 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
12454 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
12455 SDValue Res =
12456 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
12457 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
12458 if (Negated)
12459 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
12461 return Res;
12464 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
12465 return LowerToPredicatedOp(Op, DAG, PredOpcode);
12467 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
12468 // operations, and truncate the result.
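// For example, an nxv8i16 division is performed as two nxv4i32 divisions on
// the unpacked halves of the operands; UZP1 then packs the low 16 bits of
// each 32-bit result back into an nxv8i16.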
12469 EVT WidenedVT;
12470 if (VT == MVT::nxv16i8)
12471 WidenedVT = MVT::nxv8i16;
12472 else if (VT == MVT::nxv8i16)
12473 WidenedVT = MVT::nxv4i32;
12474 else
12475 llvm_unreachable("Unexpected Custom DIV operation");
12477 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
12478 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
12479 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
12480 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
12481 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
12482 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
12483 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
12484 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
12485 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
12488 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
12489 // Currently no fixed length shuffles that require SVE are legal.
12490 if (useSVEForFixedLengthVectorVT(VT))
12491 return false;
12493 if (VT.getVectorNumElements() == 4 &&
12494 (VT.is128BitVector() || VT.is64BitVector())) {
12495 unsigned Cost = getPerfectShuffleCost(M);
12496 if (Cost <= 1)
12497 return true;
12500 bool DummyBool;
12501 int DummyInt;
12502 unsigned DummyUnsigned;
12504 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
12505 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
12506 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
12507 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
12508 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
12509 isZIPMask(M, VT, DummyUnsigned) ||
12510 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
12511 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
12512 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
12513 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
12514 isConcatMask(M, VT, VT.getSizeInBits() == 128));
12517 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
12518 EVT VT) const {
12519 // Just delegate to the generic legality, clear masks aren't special.
12520 return isShuffleMaskLegal(M, VT);
12523 /// getVShiftImm - Check if this is a valid build_vector for the immediate
12524 /// operand of a vector shift operation, where all the elements of the
12525 /// build_vector must have the same constant integer value.
12526 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
12527 // Ignore bit_converts.
12528 while (Op.getOpcode() == ISD::BITCAST)
12529 Op = Op.getOperand(0);
12530 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
12531 APInt SplatBits, SplatUndef;
12532 unsigned SplatBitSize;
12533 bool HasAnyUndefs;
12534 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
12535 HasAnyUndefs, ElementBits) ||
12536 SplatBitSize > ElementBits)
12537 return false;
12538 Cnt = SplatBits.getSExtValue();
12539 return true;
12542 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
12543 /// operand of a vector shift left operation. That value must be in the range:
12544 /// 0 <= Value < ElementBits for a left shift; or
12545 /// 0 <= Value <= ElementBits for a long left shift.
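// For example, for v8i16 (ElementBits == 16) a splat shift amount of 15 is
// valid for a regular left shift, whereas 16 is only valid for a long
// (widening) left shift.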
12546 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
12547 assert(VT.isVector() && "vector shift count is not a vector type");
12548 int64_t ElementBits = VT.getScalarSizeInBits();
12549 if (!getVShiftImm(Op, ElementBits, Cnt))
12550 return false;
12551 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
12554 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
12555 /// operand of a vector shift right operation. The value must be in the range:
12556 /// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrowing right shift.
12557 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
12558 assert(VT.isVector() && "vector shift count is not a vector type");
12559 int64_t ElementBits = VT.getScalarSizeInBits();
12560 if (!getVShiftImm(Op, ElementBits, Cnt))
12561 return false;
12562 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
12565 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
12566 SelectionDAG &DAG) const {
12567 EVT VT = Op.getValueType();
12569 if (VT.getScalarType() == MVT::i1) {
12570 // Lower i1 truncate to `(x & 1) != 0`.
12571 SDLoc dl(Op);
12572 EVT OpVT = Op.getOperand(0).getValueType();
12573 SDValue Zero = DAG.getConstant(0, dl, OpVT);
12574 SDValue One = DAG.getConstant(1, dl, OpVT);
12575 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
12576 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
12579 if (!VT.isVector() || VT.isScalableVector())
12580 return SDValue();
12582 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12583 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
12585 return SDValue();
12588 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
12589 SelectionDAG &DAG) const {
12590 EVT VT = Op.getValueType();
12591 SDLoc DL(Op);
12592 int64_t Cnt;
12594 if (!Op.getOperand(1).getValueType().isVector())
12595 return Op;
12596 unsigned EltSize = VT.getScalarSizeInBits();
12598 switch (Op.getOpcode()) {
12599 case ISD::SHL:
12600 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
12601 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
12603 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
12604 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
12605 DAG.getConstant(Cnt, DL, MVT::i32));
12606 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12607 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
12608 MVT::i32),
12609 Op.getOperand(0), Op.getOperand(1));
12610 case ISD::SRA:
12611 case ISD::SRL:
12612 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
12613 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
12614 : AArch64ISD::SRL_PRED;
12615 return LowerToPredicatedOp(Op, DAG, Opc);
12618 // Right shift immediate
12619 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
12620 unsigned Opc =
12621 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
12622 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
12623 DAG.getConstant(Cnt, DL, MVT::i32));
12626 // Right shift register. Note that there is no shift-right-register
12627 // instruction, but the shift-left-register instruction takes a signed
12628 // value, where negative amounts specify a right shift.
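// For example, (srl X, Y) with a non-immediate Y is emitted as
// ushl(X, 0 - Y) using the aarch64_neon_ushl intrinsic below.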
12629 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
12630 : Intrinsic::aarch64_neon_ushl;
12631 // negate the shift amount
12632 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
12633 Op.getOperand(1));
12634 SDValue NegShiftLeft =
12635 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12636 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
12637 NegShift);
12638 return NegShiftLeft;
12641 llvm_unreachable("unexpected shift opcode");
12644 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
12645 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12646 const SDLoc &dl, SelectionDAG &DAG) {
12647 EVT SrcVT = LHS.getValueType();
12648 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
12649 "function only supposed to emit natural comparisons");
12651 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
12652 APInt CnstBits(VT.getSizeInBits(), 0);
12653 APInt UndefBits(VT.getSizeInBits(), 0);
12654 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
12655 bool IsZero = IsCnst && (CnstBits == 0);
12657 if (SrcVT.getVectorElementType().isFloatingPoint()) {
12658 switch (CC) {
12659 default:
12660 return SDValue();
12661 case AArch64CC::NE: {
12662 SDValue Fcmeq;
12663 if (IsZero)
12664 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12665 else
12666 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12667 return DAG.getNOT(dl, Fcmeq, VT);
12669 case AArch64CC::EQ:
12670 if (IsZero)
12671 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12672 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12673 case AArch64CC::GE:
12674 if (IsZero)
12675 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
12676 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
12677 case AArch64CC::GT:
12678 if (IsZero)
12679 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
12680 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
12681 case AArch64CC::LE:
12682 if (!NoNans)
12683 return SDValue();
12684 // If we ignore NaNs then we can use the LS implementation.
12685 [[fallthrough]];
12686 case AArch64CC::LS:
12687 if (IsZero)
12688 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
12689 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
12690 case AArch64CC::LT:
12691 if (!NoNans)
12692 return SDValue();
12693 // If we ignore NaNs then we can use the MI implementation.
12694 [[fallthrough]];
12695 case AArch64CC::MI:
12696 if (IsZero)
12697 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
12698 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
12702 switch (CC) {
12703 default:
12704 return SDValue();
12705 case AArch64CC::NE: {
12706 SDValue Cmeq;
12707 if (IsZero)
12708 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12709 else
12710 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12711 return DAG.getNOT(dl, Cmeq, VT);
12713 case AArch64CC::EQ:
12714 if (IsZero)
12715 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12716 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12717 case AArch64CC::GE:
12718 if (IsZero)
12719 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
12720 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
12721 case AArch64CC::GT:
12722 if (IsZero)
12723 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
12724 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
12725 case AArch64CC::LE:
12726 if (IsZero)
12727 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
12728 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
12729 case AArch64CC::LS:
12730 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
12731 case AArch64CC::LO:
12732 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
12733 case AArch64CC::LT:
12734 if (IsZero)
12735 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
12736 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
12737 case AArch64CC::HI:
12738 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
12739 case AArch64CC::HS:
12740 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
12744 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
12745 SelectionDAG &DAG) const {
12746 if (Op.getValueType().isScalableVector())
12747 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
12749 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12750 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
12752 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
12753 SDValue LHS = Op.getOperand(0);
12754 SDValue RHS = Op.getOperand(1);
12755 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
12756 SDLoc dl(Op);
12758 if (LHS.getValueType().getVectorElementType().isInteger()) {
12759 assert(LHS.getValueType() == RHS.getValueType());
12760 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12761 SDValue Cmp =
12762 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
12763 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12766 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
12768 // Make v4f16 (only) fcmp operations utilise vector instructions;
12769 // v8f16 support will be a little more complicated.
12770 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
12771 if (LHS.getValueType().getVectorNumElements() == 4) {
12772 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
12773 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
12774 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
12775 DAG.ReplaceAllUsesWith(Op, NewSetcc);
12776 CmpVT = MVT::v4i32;
12777 } else
12778 return SDValue();
12781 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
12782 LHS.getValueType().getVectorElementType() != MVT::f128);
12784 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12785 // clean. Some of them require two branches to implement.
12786 AArch64CC::CondCode CC1, CC2;
12787 bool ShouldInvert;
12788 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12790 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
12791 SDValue Cmp =
12792 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
12793 if (!Cmp.getNode())
12794 return SDValue();
12796 if (CC2 != AArch64CC::AL) {
12797 SDValue Cmp2 =
12798 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
12799 if (!Cmp2.getNode())
12800 return SDValue();
12802 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
12805 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12807 if (ShouldInvert)
12808 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
12810 return Cmp;
12813 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
12814 SelectionDAG &DAG) {
12815 SDValue VecOp = ScalarOp.getOperand(0);
12816 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
12817 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
12818 DAG.getConstant(0, DL, MVT::i64));
12821 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
12822 SelectionDAG &DAG) const {
12823 SDValue Src = Op.getOperand(0);
12825 // Try to lower fixed length reductions to SVE.
12826 EVT SrcVT = Src.getValueType();
12827 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
12828 Op.getOpcode() == ISD::VECREDUCE_OR ||
12829 Op.getOpcode() == ISD::VECREDUCE_XOR ||
12830 Op.getOpcode() == ISD::VECREDUCE_FADD ||
12831 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
12832 SrcVT.getVectorElementType() == MVT::i64);
12833 if (SrcVT.isScalableVector() ||
12834 useSVEForFixedLengthVectorVT(
12835 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
12837 if (SrcVT.getVectorElementType() == MVT::i1)
12838 return LowerPredReductionToSVE(Op, DAG);
12840 switch (Op.getOpcode()) {
12841 case ISD::VECREDUCE_ADD:
12842 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
12843 case ISD::VECREDUCE_AND:
12844 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
12845 case ISD::VECREDUCE_OR:
12846 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
12847 case ISD::VECREDUCE_SMAX:
12848 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
12849 case ISD::VECREDUCE_SMIN:
12850 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
12851 case ISD::VECREDUCE_UMAX:
12852 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
12853 case ISD::VECREDUCE_UMIN:
12854 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
12855 case ISD::VECREDUCE_XOR:
12856 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
12857 case ISD::VECREDUCE_FADD:
12858 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
12859 case ISD::VECREDUCE_FMAX:
12860 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
12861 case ISD::VECREDUCE_FMIN:
12862 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
12863 default:
12864 llvm_unreachable("Unhandled fixed length reduction");
12868 // Lower NEON reductions.
12869 SDLoc dl(Op);
12870 switch (Op.getOpcode()) {
12871 case ISD::VECREDUCE_ADD:
12872 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
12873 case ISD::VECREDUCE_SMAX:
12874 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
12875 case ISD::VECREDUCE_SMIN:
12876 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
12877 case ISD::VECREDUCE_UMAX:
12878 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
12879 case ISD::VECREDUCE_UMIN:
12880 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
12881 case ISD::VECREDUCE_FMAX: {
12882 return DAG.getNode(
12883 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12884 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
12885 Src);
12887 case ISD::VECREDUCE_FMIN: {
12888 return DAG.getNode(
12889 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12890 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
12891 Src);
12893 default:
12894 llvm_unreachable("Unhandled reduction");
12898 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
12899 SelectionDAG &DAG) const {
12900 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12901 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12902 return SDValue();
12904 // LSE has an atomic load-add instruction, but not a load-sub.
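// That is, (atomic_load_sub addr, x) is rewritten below as
// (atomic_load_add addr, 0 - x).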
12905 SDLoc dl(Op);
12906 MVT VT = Op.getSimpleValueType();
12907 SDValue RHS = Op.getOperand(2);
12908 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12909 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
12910 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
12911 Op.getOperand(0), Op.getOperand(1), RHS,
12912 AN->getMemOperand());
12915 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
12916 SelectionDAG &DAG) const {
12917 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12918 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12919 return SDValue();
12921 // LSE has an atomic load-clear instruction, but not a load-and.
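// That is, (atomic_load_and addr, x) is rewritten below as
// (atomic_load_clr addr, xor(x, -1)).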
12922 SDLoc dl(Op);
12923 MVT VT = Op.getSimpleValueType();
12924 SDValue RHS = Op.getOperand(2);
12925 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12926 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
12927 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
12928 Op.getOperand(0), Op.getOperand(1), RHS,
12929 AN->getMemOperand());
12932 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
12933 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
12934 SDLoc dl(Op);
12935 EVT PtrVT = getPointerTy(DAG.getDataLayout());
12936 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
12937 PtrVT, 0);
12939 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
12940 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
12941 if (Subtarget->hasCustomCallingConv())
12942 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
12944 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
12945 DAG.getConstant(4, dl, MVT::i64));
12946 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
12947 Chain =
12948 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
12949 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
12950 DAG.getRegisterMask(Mask), Chain.getValue(1));
12951 // To match the actual intent better, we should read the output from X15 here
12952 // again (instead of potentially spilling it to the stack), but rereading Size
12953 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
12954 // here.
12956 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
12957 DAG.getConstant(4, dl, MVT::i64));
12958 return Chain;
12961 SDValue
12962 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
12963 SelectionDAG &DAG) const {
12964 assert(Subtarget->isTargetWindows() &&
12965 "Only Windows alloca probing supported");
12966 SDLoc dl(Op);
12967 // Get the inputs.
12968 SDNode *Node = Op.getNode();
12969 SDValue Chain = Op.getOperand(0);
12970 SDValue Size = Op.getOperand(1);
12971 MaybeAlign Align =
12972 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
12973 EVT VT = Node->getValueType(0);
12975 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12976 "no-stack-arg-probe")) {
12977 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12978 Chain = SP.getValue(1);
12979 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12980 if (Align)
12981 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12982 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12983 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
12984 SDValue Ops[2] = {SP, Chain};
12985 return DAG.getMergeValues(Ops, dl);
12988 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
12990 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
12992 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12993 Chain = SP.getValue(1);
12994 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12995 if (Align)
12996 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12997 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12998 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
13000 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
13002 SDValue Ops[2] = {SP, Chain};
13003 return DAG.getMergeValues(Ops, dl);
13006 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
13007 SelectionDAG &DAG) const {
13008 EVT VT = Op.getValueType();
13009 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
13011 SDLoc DL(Op);
13012 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
13013 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
13014 VT);
13017 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
13018 template <unsigned NumVecs>
13019 static bool
13020 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
13021 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
13022 Info.opc = ISD::INTRINSIC_VOID;
13023 // Retrieve EC from first vector argument.
13024 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
13025 ElementCount EC = VT.getVectorElementCount();
13026 #ifndef NDEBUG
13027 // Check the assumption that all input vectors are the same type.
13028 for (unsigned I = 0; I < NumVecs; ++I)
13029 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
13030 "Invalid type.");
13031 #endif
13032 // memVT is `NumVecs * VT`.
13033 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
13034 EC * NumVecs);
13035 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
13036 Info.offset = 0;
13037 Info.align.reset();
13038 Info.flags = MachineMemOperand::MOStore;
13039 return true;
13042 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
13043 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
13044 /// specified in the intrinsic calls.
13045 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13046 const CallInst &I,
13047 MachineFunction &MF,
13048 unsigned Intrinsic) const {
13049 auto &DL = I.getModule()->getDataLayout();
13050 switch (Intrinsic) {
13051 case Intrinsic::aarch64_sve_st2:
13052 return setInfoSVEStN<2>(*this, DL, Info, I);
13053 case Intrinsic::aarch64_sve_st3:
13054 return setInfoSVEStN<3>(*this, DL, Info, I);
13055 case Intrinsic::aarch64_sve_st4:
13056 return setInfoSVEStN<4>(*this, DL, Info, I);
13057 case Intrinsic::aarch64_neon_ld2:
13058 case Intrinsic::aarch64_neon_ld3:
13059 case Intrinsic::aarch64_neon_ld4:
13060 case Intrinsic::aarch64_neon_ld1x2:
13061 case Intrinsic::aarch64_neon_ld1x3:
13062 case Intrinsic::aarch64_neon_ld1x4:
13063 case Intrinsic::aarch64_neon_ld2lane:
13064 case Intrinsic::aarch64_neon_ld3lane:
13065 case Intrinsic::aarch64_neon_ld4lane:
13066 case Intrinsic::aarch64_neon_ld2r:
13067 case Intrinsic::aarch64_neon_ld3r:
13068 case Intrinsic::aarch64_neon_ld4r: {
13069 Info.opc = ISD::INTRINSIC_W_CHAIN;
13070 // Conservatively set memVT to the entire set of vectors loaded.
13071 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
13072 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13073 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13074 Info.offset = 0;
13075 Info.align.reset();
13076 // volatile loads with NEON intrinsics not supported
13077 Info.flags = MachineMemOperand::MOLoad;
13078 return true;
13080 case Intrinsic::aarch64_neon_st2:
13081 case Intrinsic::aarch64_neon_st3:
13082 case Intrinsic::aarch64_neon_st4:
13083 case Intrinsic::aarch64_neon_st1x2:
13084 case Intrinsic::aarch64_neon_st1x3:
13085 case Intrinsic::aarch64_neon_st1x4:
13086 case Intrinsic::aarch64_neon_st2lane:
13087 case Intrinsic::aarch64_neon_st3lane:
13088 case Intrinsic::aarch64_neon_st4lane: {
13089 Info.opc = ISD::INTRINSIC_VOID;
13090 // Conservatively set memVT to the entire set of vectors stored.
13091 unsigned NumElts = 0;
13092 for (const Value *Arg : I.args()) {
13093 Type *ArgTy = Arg->getType();
13094 if (!ArgTy->isVectorTy())
13095 break;
13096 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
13098 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
13099 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
13100 Info.offset = 0;
13101 Info.align.reset();
13102 // volatile stores with NEON intrinsics not supported
13103 Info.flags = MachineMemOperand::MOStore;
13104 return true;
13106 case Intrinsic::aarch64_ldaxr:
13107 case Intrinsic::aarch64_ldxr: {
13108 Type *ValTy = I.getParamElementType(0);
13109 Info.opc = ISD::INTRINSIC_W_CHAIN;
13110 Info.memVT = MVT::getVT(ValTy);
13111 Info.ptrVal = I.getArgOperand(0);
13112 Info.offset = 0;
13113 Info.align = DL.getABITypeAlign(ValTy);
13114 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13115 return true;
13117 case Intrinsic::aarch64_stlxr:
13118 case Intrinsic::aarch64_stxr: {
13119 Type *ValTy = I.getParamElementType(1);
13120 Info.opc = ISD::INTRINSIC_W_CHAIN;
13121 Info.memVT = MVT::getVT(ValTy);
13122 Info.ptrVal = I.getArgOperand(1);
13123 Info.offset = 0;
13124 Info.align = DL.getABITypeAlign(ValTy);
13125 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13126 return true;
13128 case Intrinsic::aarch64_ldaxp:
13129 case Intrinsic::aarch64_ldxp:
13130 Info.opc = ISD::INTRINSIC_W_CHAIN;
13131 Info.memVT = MVT::i128;
13132 Info.ptrVal = I.getArgOperand(0);
13133 Info.offset = 0;
13134 Info.align = Align(16);
13135 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
13136 return true;
13137 case Intrinsic::aarch64_stlxp:
13138 case Intrinsic::aarch64_stxp:
13139 Info.opc = ISD::INTRINSIC_W_CHAIN;
13140 Info.memVT = MVT::i128;
13141 Info.ptrVal = I.getArgOperand(2);
13142 Info.offset = 0;
13143 Info.align = Align(16);
13144 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
13145 return true;
13146 case Intrinsic::aarch64_sve_ldnt1: {
13147 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
13148 Info.opc = ISD::INTRINSIC_W_CHAIN;
13149 Info.memVT = MVT::getVT(I.getType());
13150 Info.ptrVal = I.getArgOperand(1);
13151 Info.offset = 0;
13152 Info.align = DL.getABITypeAlign(ElTy);
13153 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
13154 return true;
13156 case Intrinsic::aarch64_sve_stnt1: {
13157 Type *ElTy =
13158 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
13159 Info.opc = ISD::INTRINSIC_W_CHAIN;
13160 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
13161 Info.ptrVal = I.getArgOperand(2);
13162 Info.offset = 0;
13163 Info.align = DL.getABITypeAlign(ElTy);
13164 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
13165 return true;
13167 case Intrinsic::aarch64_mops_memset_tag: {
13168 Value *Dst = I.getArgOperand(0);
13169 Value *Val = I.getArgOperand(1);
13170 Info.opc = ISD::INTRINSIC_W_CHAIN;
13171 Info.memVT = MVT::getVT(Val->getType());
13172 Info.ptrVal = Dst;
13173 Info.offset = 0;
13174 Info.align = I.getParamAlign(0).valueOrOne();
13175 Info.flags = MachineMemOperand::MOStore;
13176 // The size of the memory being operated on is unknown at this point
13177 Info.size = MemoryLocation::UnknownSize;
13178 return true;
13180 default:
13181 break;
13184 return false;
13187 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
13188 ISD::LoadExtType ExtTy,
13189 EVT NewVT) const {
13190 // TODO: This may be worth removing. Check regression tests for diffs.
13191 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
13192 return false;
13194 // If we're reducing the load width in order to avoid having to use an extra
13195 // instruction to do extension then it's probably a good idea.
13196 if (ExtTy != ISD::NON_EXTLOAD)
13197 return true;
13198 // Don't reduce load width if it would prevent us from combining a shift into
13199 // the offset.
13200 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
13201 assert(Mem);
13202 const SDValue &Base = Mem->getBasePtr();
13203 if (Base.getOpcode() == ISD::ADD &&
13204 Base.getOperand(1).getOpcode() == ISD::SHL &&
13205 Base.getOperand(1).hasOneUse() &&
13206 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
13207 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
13208 if (Mem->getMemoryVT().isScalableVector())
13209 return false;
13210 // The shift can be combined if it matches the size of the value being
13211 // loaded (and so reducing the width would make it not match).
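    // Illustrative example (a sketch): for an i64 load from (add x, (shl y, 3))
    // the shift folds into the scaled addressing form [x, y, lsl #3]; narrowing
    // the load to i32 would need lsl #2, and the fold would be lost.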
13212 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
13213 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
13214 if (ShiftAmount == Log2_32(LoadBytes))
13215 return false;
13217 // We have no reason to disallow reducing the load width, so allow it.
13218 return true;
13221 // Truncations from a 64-bit GPR to a 32-bit GPR are free.
13222 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
13223 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13224 return false;
13225 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
13226 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
13227 return NumBits1 > NumBits2;
13229 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
13230 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13231 return false;
13232 uint64_t NumBits1 = VT1.getFixedSizeInBits();
13233 uint64_t NumBits2 = VT2.getFixedSizeInBits();
13234 return NumBits1 > NumBits2;
13237 /// Check if it is profitable to hoist instruction in then/else to if.
13238 /// Not profitable if I and its user can form an FMA instruction
13239 /// because we prefer FMSUB/FMADD.
13240 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
13241 if (I->getOpcode() != Instruction::FMul)
13242 return true;
13244 if (!I->hasOneUse())
13245 return true;
13247 Instruction *User = I->user_back();
13249 if (!(User->getOpcode() == Instruction::FSub ||
13250 User->getOpcode() == Instruction::FAdd))
13251 return true;
13253 const TargetOptions &Options = getTargetMachine().Options;
13254 const Function *F = I->getFunction();
13255 const DataLayout &DL = F->getParent()->getDataLayout();
13256 Type *Ty = User->getOperand(0)->getType();
13258 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
13259 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
13260 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
13261 Options.UnsafeFPMath));
13264 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
13265 // 64-bit GPR.
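// For example, 'add w8, w9, w10' also clears bits [63:32] of x8, so a
// subsequent zext from i32 to i64 needs no extra instruction.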
13266 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
13267 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
13268 return false;
13269 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
13270 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
13271 return NumBits1 == 32 && NumBits2 == 64;
13273 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
13274 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
13275 return false;
13276 unsigned NumBits1 = VT1.getSizeInBits();
13277 unsigned NumBits2 = VT2.getSizeInBits();
13278 return NumBits1 == 32 && NumBits2 == 64;
13281 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
13282 EVT VT1 = Val.getValueType();
13283 if (isZExtFree(VT1, VT2)) {
13284 return true;
13287 if (Val.getOpcode() != ISD::LOAD)
13288 return false;
13290 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
13291 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
13292 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
13293 VT1.getSizeInBits() <= 32);
13296 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
13297 if (isa<FPExtInst>(Ext))
13298 return false;
13300 // Vector types are not free.
13301 if (Ext->getType()->isVectorTy())
13302 return false;
13304 for (const Use &U : Ext->uses()) {
13305 // The extension is free if we can fold it with a left shift in an
13306 // addressing mode or an arithmetic operation: add, sub, and cmp.
13308 // Is there a shift?
13309 const Instruction *Instr = cast<Instruction>(U.getUser());
13311 // Is this a constant shift?
13312 switch (Instr->getOpcode()) {
13313 case Instruction::Shl:
13314 if (!isa<ConstantInt>(Instr->getOperand(1)))
13315 return false;
13316 break;
13317 case Instruction::GetElementPtr: {
13318 gep_type_iterator GTI = gep_type_begin(Instr);
13319 auto &DL = Ext->getModule()->getDataLayout();
13320 std::advance(GTI, U.getOperandNo()-1);
13321 Type *IdxTy = GTI.getIndexedType();
13322 // This extension will end up with a shift because of the scaling factor.
13323 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
13324 // Get the shift amount based on the scaling factor:
13325 // log2(sizeof(IdxTy)) - log2(8).
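      // Illustrative example (a sketch): indexing over i32 elements gives a
      // store size of 32 bits, so ShiftAmt == ctz(32) - 3 == 2, which the
      // extended-register addressing form (e.g. sxtw #2) can absorb.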
13326 uint64_t ShiftAmt =
13327 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
13328 // Is the constant foldable in the shift of the addressing mode?
13329 // I.e., shift amount is between 1 and 4 inclusive.
13330 if (ShiftAmt == 0 || ShiftAmt > 4)
13331 return false;
13332 break;
13334 case Instruction::Trunc:
13335 // Check if this is a noop.
13336 // trunc(sext ty1 to ty2) to ty1.
13337 if (Instr->getType() == Ext->getOperand(0)->getType())
13338 continue;
13339 [[fallthrough]];
13340 default:
13341 return false;
13344 // At this point we can use the bfm family, so this extension is free
13345 // for that use.
13347 return true;
13350 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
13351 /// or upper half of the vector elements.
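/// For example (a sketch), two operands of the form
///   shufflevector <16 x i8> %x, <16 x i8> undef, <8, 9, 10, 11, 12, 13, 14, 15>
/// qualify; per the mask check below, both must extract the same half (both
/// lower or both upper).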
13352 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
13353 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
13354 auto *FullTy = FullV->getType();
13355 auto *HalfTy = HalfV->getType();
13356 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
13357 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
13360 auto extractHalf = [](Value *FullV, Value *HalfV) {
13361 auto *FullVT = cast<FixedVectorType>(FullV->getType());
13362 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
13363 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
13366 ArrayRef<int> M1, M2;
13367 Value *S1Op1, *S2Op1;
13368 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
13369 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
13370 return false;
13372 // Check that the operands are half as wide as the result and we extract
13373 // half of the elements of the input vectors.
13374 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
13375 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
13376 return false;
13378 // Check the mask extracts either the lower or upper half of vector
13379 // elements.
13380 int M1Start = -1;
13381 int M2Start = -1;
13382 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
13383 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
13384 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
13385 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
13386 return false;
13388 return true;
13391 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
13392 /// of the vector elements.
13393 static bool areExtractExts(Value *Ext1, Value *Ext2) {
13394 auto areExtDoubled = [](Instruction *Ext) {
13395 return Ext->getType()->getScalarSizeInBits() ==
13396 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
13399 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
13400 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
13401 !areExtDoubled(cast<Instruction>(Ext1)) ||
13402 !areExtDoubled(cast<Instruction>(Ext2)))
13403 return false;
13405 return true;
13408 /// Check if Op could be used with vmull_high_p64 intrinsic.
13409 static bool isOperandOfVmullHighP64(Value *Op) {
13410 Value *VectorOperand = nullptr;
13411 ConstantInt *ElementIndex = nullptr;
13412 return match(Op, m_ExtractElt(m_Value(VectorOperand),
13413 m_ConstantInt(ElementIndex))) &&
13414 ElementIndex->getValue() == 1 &&
13415 isa<FixedVectorType>(VectorOperand->getType()) &&
13416 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
13419 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
13420 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
13421 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
13424 static bool isSplatShuffle(Value *V) {
13425 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
13426 return all_equal(Shuf->getShuffleMask());
13427 return false;
13430 /// Check if sinking \p I's operands to I's basic block is profitable, because
13431 /// the operands can be folded into a target instruction, e.g.
13432 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
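/// For example (a sketch), if both operands of a vector sub are zexts of
/// upper-half extract shuffles defined in another block, sinking the shuffles
/// and extends next to the sub lets a single usubl2 be selected.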
13433 bool AArch64TargetLowering::shouldSinkOperands(
13434 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
13435 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
13436 switch (II->getIntrinsicID()) {
13437 case Intrinsic::aarch64_neon_smull:
13438 case Intrinsic::aarch64_neon_umull:
13439 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
13440 Ops.push_back(&II->getOperandUse(0));
13441 Ops.push_back(&II->getOperandUse(1));
13442 return true;
13444 [[fallthrough]];
13446 case Intrinsic::fma:
13447 if (isa<VectorType>(I->getType()) &&
13448 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
13449 !Subtarget->hasFullFP16())
13450 return false;
13451 [[fallthrough]];
13452 case Intrinsic::aarch64_neon_sqdmull:
13453 case Intrinsic::aarch64_neon_sqdmulh:
13454 case Intrinsic::aarch64_neon_sqrdmulh:
13455 // Sink splats for index lane variants
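      // A sketch of the intent: for sqdmulh(%x, splat of %y's lane 0), sinking
      // the splat shuffle next to the call lets instruction selection pick the
      // by-element form, e.g. 'sqdmulh v0.4s, v1.4s, v2.s[0]'.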
13456 if (isSplatShuffle(II->getOperand(0)))
13457 Ops.push_back(&II->getOperandUse(0));
13458 if (isSplatShuffle(II->getOperand(1)))
13459 Ops.push_back(&II->getOperandUse(1));
13460 return !Ops.empty();
13461 case Intrinsic::aarch64_sve_ptest_first:
13462 case Intrinsic::aarch64_sve_ptest_last:
13463 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
13464 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
13465 Ops.push_back(&II->getOperandUse(0));
13466 return !Ops.empty();
13467 case Intrinsic::aarch64_sme_write_horiz:
13468 case Intrinsic::aarch64_sme_write_vert:
13469 case Intrinsic::aarch64_sme_writeq_horiz:
13470 case Intrinsic::aarch64_sme_writeq_vert: {
13471 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
13472 if (!Idx || Idx->getOpcode() != Instruction::Add)
13473 return false;
13474 Ops.push_back(&II->getOperandUse(1));
13475 return true;
13477 case Intrinsic::aarch64_sme_read_horiz:
13478 case Intrinsic::aarch64_sme_read_vert:
13479 case Intrinsic::aarch64_sme_readq_horiz:
13480 case Intrinsic::aarch64_sme_readq_vert:
13481 case Intrinsic::aarch64_sme_ld1b_vert:
13482 case Intrinsic::aarch64_sme_ld1h_vert:
13483 case Intrinsic::aarch64_sme_ld1w_vert:
13484 case Intrinsic::aarch64_sme_ld1d_vert:
13485 case Intrinsic::aarch64_sme_ld1q_vert:
13486 case Intrinsic::aarch64_sme_st1b_vert:
13487 case Intrinsic::aarch64_sme_st1h_vert:
13488 case Intrinsic::aarch64_sme_st1w_vert:
13489 case Intrinsic::aarch64_sme_st1d_vert:
13490 case Intrinsic::aarch64_sme_st1q_vert:
13491 case Intrinsic::aarch64_sme_ld1b_horiz:
13492 case Intrinsic::aarch64_sme_ld1h_horiz:
13493 case Intrinsic::aarch64_sme_ld1w_horiz:
13494 case Intrinsic::aarch64_sme_ld1d_horiz:
13495 case Intrinsic::aarch64_sme_ld1q_horiz:
13496 case Intrinsic::aarch64_sme_st1b_horiz:
13497 case Intrinsic::aarch64_sme_st1h_horiz:
13498 case Intrinsic::aarch64_sme_st1w_horiz:
13499 case Intrinsic::aarch64_sme_st1d_horiz:
13500 case Intrinsic::aarch64_sme_st1q_horiz: {
13501 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
13502 if (!Idx || Idx->getOpcode() != Instruction::Add)
13503 return false;
13504 Ops.push_back(&II->getOperandUse(3));
13505 return true;
13507 case Intrinsic::aarch64_neon_pmull:
13508 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
13509 return false;
13510 Ops.push_back(&II->getOperandUse(0));
13511 Ops.push_back(&II->getOperandUse(1));
13512 return true;
13513 case Intrinsic::aarch64_neon_pmull64:
13514 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
13515 II->getArgOperand(1)))
13516 return false;
13517 Ops.push_back(&II->getArgOperandUse(0));
13518 Ops.push_back(&II->getArgOperandUse(1));
13519 return true;
13520 default:
13521 return false;
13525 if (!I->getType()->isVectorTy())
13526 return false;
13528 switch (I->getOpcode()) {
13529 case Instruction::Sub:
13530 case Instruction::Add: {
13531 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
13532 return false;
13534 // If the exts' operands extract either the lower or upper elements, we
13535 // can sink them too.
13536 auto Ext1 = cast<Instruction>(I->getOperand(0));
13537 auto Ext2 = cast<Instruction>(I->getOperand(1));
13538 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
13539 Ops.push_back(&Ext1->getOperandUse(0));
13540 Ops.push_back(&Ext2->getOperandUse(0));
13543 Ops.push_back(&I->getOperandUse(0));
13544 Ops.push_back(&I->getOperandUse(1));
13546 return true;
13548 case Instruction::Mul: {
13549 bool IsProfitable = false;
13550 for (auto &Op : I->operands()) {
13551 // Make sure we are not already sinking this operand
13552 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
13553 continue;
13555 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
13557 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
13558 // operand and the s/zext can help create indexed s/umull. This is
13559 // especially useful to prevent i64 mul being scalarized.
13560 if (Shuffle && isSplatShuffle(Shuffle) &&
13561 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
13562 Ops.push_back(&Shuffle->getOperandUse(0));
13563 Ops.push_back(&Op);
13564 IsProfitable = true;
13565 continue;
13568 if (!Shuffle || !Shuffle->isZeroEltSplat())
13569 continue;
13571 Value *ShuffleOperand = Shuffle->getOperand(0);
13572 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
13573 if (!Insert)
13574 continue;
13576 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
13577 if (!OperandInstr)
13578 continue;
13580 ConstantInt *ElementConstant =
13581 dyn_cast<ConstantInt>(Insert->getOperand(2));
13582 // Check that the insertelement is inserting into element 0
13583 if (!ElementConstant || ElementConstant->getZExtValue() != 0)
13584 continue;
13586 unsigned Opcode = OperandInstr->getOpcode();
13587 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
13588 continue;
13590 Ops.push_back(&Shuffle->getOperandUse(0));
13591 Ops.push_back(&Op);
13592 IsProfitable = true;
13595 return IsProfitable;
13597 default:
13598 return false;
13600 return false;
13603 static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
13604 Value *Op = ZExt->getOperand(0);
13605 auto *SrcTy = dyn_cast<FixedVectorType>(Op->getType());
13606 auto *DstTy = dyn_cast<FixedVectorType>(ZExt->getType());
13607 unsigned NumElts = SrcTy->getNumElements();
13608 IRBuilder<> Builder(ZExt);
13609 SmallVector<int> Mask(4 * NumElts, NumElts);
13610 // Create a mask that selects <0,0,0,Op[i]> for each lane of vector of i32 to
13611 // replace the original ZExt. This can later be lowered to a set of tbl
13612 // instructions.
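  // For example (a sketch), with NumElts == 8 on little-endian the mask is
  //   <0, 8, 8, 8, 1, 8, 8, 8, ..., 7, 8, 8, 8>
  // where index 8 selects the zero byte inserted into the second shuffle
  // operand below.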
13613 for (unsigned i = 0; i < NumElts; i++) {
13614 if (IsLittleEndian)
13615 Mask[i * 4] = i;
13616 else
13617 Mask[i * 4 + 3] = i;
13620 auto *FirstEltZero = Builder.CreateInsertElement(
13621 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
13622 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
13623 Result = Builder.CreateBitCast(Result, DstTy);
13624 ZExt->replaceAllUsesWith(Result);
13625 ZExt->eraseFromParent();
13628 static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
13629 IRBuilder<> Builder(TI);
13630 SmallVector<Value *> Parts;
13631 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
13632 Parts.push_back(Builder.CreateBitCast(
13633 Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
13634 Parts.push_back(Builder.CreateBitCast(
13635 Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
13637 Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
13638 unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
13639 if (NumElements == 16) {
13640 Parts.push_back(Builder.CreateBitCast(
13641 Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
13642 Parts.push_back(Builder.CreateBitCast(
13643 Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
13644 VecTy));
13645 TblID = Intrinsic::aarch64_neon_tbl4;
13647 SmallVector<Constant *, 16> MaskConst;
13648 for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
13649 MaskConst.push_back(
13650 ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
13652 for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
13653 MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
13655 Parts.push_back(ConstantVector::get(MaskConst));
13656 auto *F =
13657 Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
13658 Value *Res = Builder.CreateCall(F, Parts);
13660 if (NumElements == 8)
13661 Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
13662 TI->replaceAllUsesWith(Res);
13663 TI->eraseFromParent();
13666 bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
13667 Loop *L) const {
13668 // Try to optimize conversions using tbl. This requires materializing constant
13669 // index vectors, which can increase code size and add loads. Skip the
13670 // transform unless the conversion is in a loop block guaranteed to execute
13671 // and we are not optimizing for size.
13672 Function *F = I->getParent()->getParent();
13673 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
13674 F->hasOptSize())
13675 return false;
13677 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
13678 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
13679 if (!SrcTy || !DstTy)
13680 return false;
13682 // Convert 'zext <(8|16) x i8> %x to <(8|16) x i32>' to a shuffle that can be
13683 // lowered to either 2 or 4 tbl instructions to insert the original i8
13684 // elements into i32 lanes.
13685 auto *ZExt = dyn_cast<ZExtInst>(I);
13686 if (ZExt && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13687 SrcTy->getElementType()->isIntegerTy(8) &&
13688 DstTy->getElementType()->isIntegerTy(32)) {
13689 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
13690 return true;
13693 auto *UIToFP = dyn_cast<UIToFPInst>(I);
13694 if (UIToFP &&
13695 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13696 SrcTy->getElementType()->isIntegerTy(8) &&
13697 DstTy->getElementType()->isFloatTy()) {
13698 IRBuilder<> Builder(I);
13699 auto *ZExt = cast<ZExtInst>(
13700 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
13701 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
13702 I->replaceAllUsesWith(UI);
13703 I->eraseFromParent();
13704 createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
13705 return true;
13708 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
13709 // followed by a truncate lowered to using tbl.4.
13710 auto *FPToUI = dyn_cast<FPToUIInst>(I);
13711 if (FPToUI &&
13712 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13713 SrcTy->getElementType()->isFloatTy() &&
13714 DstTy->getElementType()->isIntegerTy(8)) {
13715 IRBuilder<> Builder(I);
13716 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
13717 VectorType::getInteger(SrcTy));
13718 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
13719 I->replaceAllUsesWith(TruncI);
13720 I->eraseFromParent();
13721 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
13722 return true;
13725 // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
13726 // instruction selecting the lowest 8 bits per lane of the input interpreted
13727 // as 2 or 4 <4 x i32> vectors.
13728 auto *TI = dyn_cast<TruncInst>(I);
13729 if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
13730 SrcTy->getElementType()->isIntegerTy(32) &&
13731 DstTy->getElementType()->isIntegerTy(8)) {
13732 createTblForTrunc(TI, Subtarget->isLittleEndian());
13733 return true;
13736 return false;
13739 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
13740                                           Align &RequiredAlignment) const {
13741 if (!LoadedType.isSimple() ||
13742 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
13743 return false;
13744 // Cyclone supports unaligned accesses.
13745   RequiredAlignment = Align(1);
13746 unsigned NumBits = LoadedType.getSizeInBits();
13747 return NumBits == 32 || NumBits == 64;
13750 /// A helper function for determining the number of interleaved accesses we
13751 /// will generate when lowering accesses of the given type.
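/// For example (a sketch), a fixed-length <16 x i32> is 512 bits, so with NEON
/// (128-bit accesses) this returns (512 + 127) / 128 == 4.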
13752 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
13753 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
13754 unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
13755 return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
13758 MachineMemOperand::Flags
13759 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
13760 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
13761 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
13762 return MOStridedAccess;
13763 return MachineMemOperand::MONone;
13766 bool AArch64TargetLowering::isLegalInterleavedAccessType(
13767 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
13769 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13770 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
13771 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
13773 UseScalable = false;
13775 // Ensure the number of vector elements is greater than 1.
13776 if (NumElements < 2)
13777 return false;
13779 // Ensure the element type is legal.
13780 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
13781 return false;
13783 if (Subtarget->useSVEForFixedLengthVectors() &&
13784 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
13785 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
13786 isPowerOf2_32(NumElements) && VecSize > 128))) {
13787 UseScalable = true;
13788 return true;
13791 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
13792 // 128 will be split into multiple interleaved accesses.
13793 return VecSize == 64 || VecSize % 128 == 0;
13796 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
13797 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
13798 return ScalableVectorType::get(VTy->getElementType(), 2);
13800 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
13801 return ScalableVectorType::get(VTy->getElementType(), 4);
13803 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
13804 return ScalableVectorType::get(VTy->getElementType(), 8);
13806 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
13807 return ScalableVectorType::get(VTy->getElementType(), 8);
13809 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
13810 return ScalableVectorType::get(VTy->getElementType(), 2);
13812 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
13813 return ScalableVectorType::get(VTy->getElementType(), 4);
13815 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
13816 return ScalableVectorType::get(VTy->getElementType(), 8);
13818 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
13819 return ScalableVectorType::get(VTy->getElementType(), 16);
13821 llvm_unreachable("Cannot handle input vector type");
13824 /// Lower an interleaved load into a ldN intrinsic.
13826 /// E.g. Lower an interleaved load (Factor = 2):
13827 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
13828 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
13829 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
13831 /// Into:
13832 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
13833 ///        %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
13834 ///        %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
13835 bool AArch64TargetLowering::lowerInterleavedLoad(
13836 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13837 ArrayRef<unsigned> Indices, unsigned Factor) const {
13838 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13839 "Invalid interleave factor");
13840 assert(!Shuffles.empty() && "Empty shufflevector input");
13841 assert(Shuffles.size() == Indices.size() &&
13842 "Unmatched number of shufflevectors and indices");
13844 const DataLayout &DL = LI->getModule()->getDataLayout();
13846 VectorType *VTy = Shuffles[0]->getType();
13848   // Skip if we do not have NEON or if the vector type is illegal. We can
13849 // "legalize" wide vector types into multiple interleaved accesses as long as
13850 // the vector types are divisible by 128.
13851 bool UseScalable;
13852 if (!Subtarget->hasNEON() ||
13853 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
13854 return false;
13856 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
13858 auto *FVTy = cast<FixedVectorType>(VTy);
13860 // A pointer vector can not be the return type of the ldN intrinsics. Need to
13861 // load integer vectors first and then convert to pointer vectors.
13862 Type *EltTy = FVTy->getElementType();
13863 if (EltTy->isPointerTy())
13864 FVTy =
13865 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
13867 // If we're going to generate more than one load, reset the sub-vector type
13868 // to something legal.
13869 FVTy = FixedVectorType::get(FVTy->getElementType(),
13870 FVTy->getNumElements() / NumLoads);
13872 auto *LDVTy =
13873 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
13875 IRBuilder<> Builder(LI);
13877 // The base address of the load.
13878 Value *BaseAddr = LI->getPointerOperand();
13880 if (NumLoads > 1) {
13881 // We will compute the pointer operand of each load from the original base
13882 // address using GEPs. Cast the base address to a pointer to the scalar
13883 // element type.
13884 BaseAddr = Builder.CreateBitCast(
13885 BaseAddr,
13886 LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
13889 Type *PtrTy =
13890 UseScalable
13891 ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
13892 : LDVTy->getPointerTo(LI->getPointerAddressSpace());
13893 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
13894 LDVTy->getElementCount());
13896 static const Intrinsic::ID SVELoadIntrs[3] = {
13897 Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
13898 Intrinsic::aarch64_sve_ld4_sret};
13899 static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
13900 Intrinsic::aarch64_neon_ld3,
13901 Intrinsic::aarch64_neon_ld4};
13902 Function *LdNFunc;
13903 if (UseScalable)
13904 LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
13905 SVELoadIntrs[Factor - 2], {LDVTy});
13906 else
13907 LdNFunc = Intrinsic::getDeclaration(
13908 LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
13910 // Holds sub-vectors extracted from the load intrinsic return values. The
13911 // sub-vectors are associated with the shufflevector instructions they will
13912 // replace.
13913 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
13915 Value *PTrue = nullptr;
13916 if (UseScalable) {
13917 Optional<unsigned> PgPattern =
13918 getSVEPredPatternFromNumElements(FVTy->getNumElements());
13919 if (Subtarget->getMinSVEVectorSizeInBits() ==
13920 Subtarget->getMaxSVEVectorSizeInBits() &&
13921 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
13922 PgPattern = AArch64SVEPredPattern::all;
13924 auto *PTruePat =
13925 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
13926 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
13927 {PTruePat});
13930 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
13932 // If we're generating more than one load, compute the base address of
13933 // subsequent loads as an offset from the previous.
13934 if (LoadCount > 0)
13935 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
13936 FVTy->getNumElements() * Factor);
13938 CallInst *LdN;
13939 if (UseScalable)
13940 LdN = Builder.CreateCall(
13941 LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
13942 else
13943 LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
13944 "ldN");
13946 // Extract and store the sub-vectors returned by the load intrinsic.
13947 for (unsigned i = 0; i < Shuffles.size(); i++) {
13948 ShuffleVectorInst *SVI = Shuffles[i];
13949 unsigned Index = Indices[i];
13951 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
13953 if (UseScalable)
13954 SubVec = Builder.CreateExtractVector(
13955 FVTy, SubVec,
13956 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
13958 // Convert the integer vector to pointer vector if the element is pointer.
13959 if (EltTy->isPointerTy())
13960 SubVec = Builder.CreateIntToPtr(
13961 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
13962 FVTy->getNumElements()));
13964 SubVecs[SVI].push_back(SubVec);
13968 // Replace uses of the shufflevector instructions with the sub-vectors
13969 // returned by the load intrinsic. If a shufflevector instruction is
13970 // associated with more than one sub-vector, those sub-vectors will be
13971 // concatenated into a single wide vector.
13972 for (ShuffleVectorInst *SVI : Shuffles) {
13973 auto &SubVec = SubVecs[SVI];
13974 auto *WideVec =
13975 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
13976 SVI->replaceAllUsesWith(WideVec);
13979 return true;
13982 /// Lower an interleaved store into a stN intrinsic.
13984 /// E.g. Lower an interleaved store (Factor = 3):
13985 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13986 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13987 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
13989 /// Into:
13990 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
13991 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
13992 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
13993 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
13995 /// Note that the new shufflevectors will be removed and we'll only generate one
13996 /// st3 instruction in CodeGen.
13998 /// Example for a more general valid mask (Factor 3). Lower:
13999 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
14000 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
14001 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
14003 /// Into:
14004 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
14005 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
14006 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
14007 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
14008 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
14009 ShuffleVectorInst *SVI,
14010 unsigned Factor) const {
14011 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
14012 "Invalid interleave factor");
14014 auto *VecTy = cast<FixedVectorType>(SVI->getType());
14015 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
14017 unsigned LaneLen = VecTy->getNumElements() / Factor;
14018 Type *EltTy = VecTy->getElementType();
14019 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
14021 const DataLayout &DL = SI->getModule()->getDataLayout();
14022 bool UseScalable;
14024   // Skip if we do not have NEON or if the vector type is illegal. We can
14025 // "legalize" wide vector types into multiple interleaved accesses as long as
14026 // the vector types are divisible by 128.
14027 if (!Subtarget->hasNEON() ||
14028 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
14029 return false;
14031 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
14033 Value *Op0 = SVI->getOperand(0);
14034 Value *Op1 = SVI->getOperand(1);
14035 IRBuilder<> Builder(SI);
14037 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
14038 // vectors to integer vectors.
14039 if (EltTy->isPointerTy()) {
14040 Type *IntTy = DL.getIntPtrType(EltTy);
14041 unsigned NumOpElts =
14042 cast<FixedVectorType>(Op0->getType())->getNumElements();
14044 // Convert to the corresponding integer vector.
14045 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
14046 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
14047 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
14049 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
14052 // If we're going to generate more than one store, reset the lane length
14053 // and sub-vector type to something legal.
14054 LaneLen /= NumStores;
14055 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
14057 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
14058 : SubVecTy;
14060 // The base address of the store.
14061 Value *BaseAddr = SI->getPointerOperand();
14063 if (NumStores > 1) {
14064 // We will compute the pointer operand of each store from the original base
14065 // address using GEPs. Cast the base address to a pointer to the scalar
14066 // element type.
14067 BaseAddr = Builder.CreateBitCast(
14068 BaseAddr,
14069 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
14072 auto Mask = SVI->getShuffleMask();
14074   // Bail out if none of the mask indices are in range.
14075   // If the mask is fully `undef` or `poison`, `Mask` is a vector of -1s, and
14076   // picking a start index from it below would read out of bounds.
14077 if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
14078 return false;
14081 Type *PtrTy =
14082 UseScalable
14083 ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
14084 : STVTy->getPointerTo(SI->getPointerAddressSpace());
14085 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
14086 STVTy->getElementCount());
14088 static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
14089 Intrinsic::aarch64_sve_st3,
14090 Intrinsic::aarch64_sve_st4};
14091 static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
14092 Intrinsic::aarch64_neon_st3,
14093 Intrinsic::aarch64_neon_st4};
14094 Function *StNFunc;
14095 if (UseScalable)
14096 StNFunc = Intrinsic::getDeclaration(SI->getModule(),
14097 SVEStoreIntrs[Factor - 2], {STVTy});
14098 else
14099 StNFunc = Intrinsic::getDeclaration(
14100 SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
14102 Value *PTrue = nullptr;
14103 if (UseScalable) {
14104 Optional<unsigned> PgPattern =
14105 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
14106 if (Subtarget->getMinSVEVectorSizeInBits() ==
14107 Subtarget->getMaxSVEVectorSizeInBits() &&
14108 Subtarget->getMinSVEVectorSizeInBits() ==
14109 DL.getTypeSizeInBits(SubVecTy))
14110 PgPattern = AArch64SVEPredPattern::all;
14112 auto *PTruePat =
14113 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
14114 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
14115 {PTruePat});
14118 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
14120 SmallVector<Value *, 5> Ops;
14122 // Split the shufflevector operands into sub vectors for the new stN call.
14123 for (unsigned i = 0; i < Factor; i++) {
14124 Value *Shuffle;
14125 unsigned IdxI = StoreCount * LaneLen * Factor + i;
14126 if (Mask[IdxI] >= 0) {
14127 Shuffle = Builder.CreateShuffleVector(
14128 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
14129 } else {
14130 unsigned StartMask = 0;
14131 for (unsigned j = 1; j < LaneLen; j++) {
14132 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
14133 if (Mask[IdxJ] >= 0) {
14134 StartMask = Mask[IdxJ] - j;
14135 break;
14138         // Note: Filling undef gaps with arbitrary elements is OK, since
14139         // those elements were being written anyway (as undefs).
14140         // In the case of all undefs we default to using elements from 0.
14141         // Note: StartMask cannot be negative; that is checked in
14142         // isReInterleaveMask.
14143 Shuffle = Builder.CreateShuffleVector(
14144 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
14147 if (UseScalable)
14148 Shuffle = Builder.CreateInsertVector(
14149 STVTy, UndefValue::get(STVTy), Shuffle,
14150 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
14152 Ops.push_back(Shuffle);
14155 if (UseScalable)
14156 Ops.push_back(PTrue);
14158     // If we're generating more than one store, compute the base address of
14159     // subsequent stores as an offset from the previous one.
14160 if (StoreCount > 0)
14161 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
14162 BaseAddr, LaneLen * Factor);
14164 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
14165 Builder.CreateCall(StNFunc, Ops);
14167 return true;
14170 EVT AArch64TargetLowering::getOptimalMemOpType(
14171 const MemOp &Op, const AttributeList &FuncAttributes) const {
14172 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14173 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14174 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
14175   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
14176   // it would take one instruction to materialize the v2i64 zero plus one store
14177   // (with a restrictive addressing mode), so plain i64 stores are preferred.
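  // Illustrative expectation (a sketch, not a guaranteed lowering): a 16-byte
  // zero memset becomes two 'str xzr' stores rather than 'movi v0.2d, #0'
  // followed by 'str q0'.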
14178 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14179 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14180 if (Op.isAligned(AlignCheck))
14181 return true;
14182 bool Fast;
14183 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14184 MachineMemOperand::MONone, &Fast) &&
14185 Fast;
14188 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14189 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
14190 return MVT::v16i8;
14191 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14192 return MVT::f128;
14193 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14194 return MVT::i64;
14195 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14196 return MVT::i32;
14197 return MVT::Other;
14200 LLT AArch64TargetLowering::getOptimalMemOpLLT(
14201 const MemOp &Op, const AttributeList &FuncAttributes) const {
14202 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
14203 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
14204 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
14205   // Only use AdvSIMD to implement memsets of 32 bytes and above. Below that,
14206   // it would take one instruction to materialize the v2i64 zero plus one store
14207   // (with a restrictive addressing mode), so plain i64 stores are preferred.
14208 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
14209 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
14210 if (Op.isAligned(AlignCheck))
14211 return true;
14212 bool Fast;
14213 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
14214 MachineMemOperand::MONone, &Fast) &&
14215 Fast;
14218 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
14219 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
14220 return LLT::fixed_vector(2, 64);
14221 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
14222 return LLT::scalar(128);
14223 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
14224 return LLT::scalar(64);
14225 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
14226 return LLT::scalar(32);
14227 return LLT();
14230 // 12-bit optionally shifted immediates are legal for adds.
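// For example, 4095 (0xfff) and 0xfff000 (4095 << 12) are both encodable,
// while 4097 is not: it needs more than 12 bits and is not a multiple of
// 0x1000 that fits the shifted form.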
14231 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
14232 if (Immed == std::numeric_limits<int64_t>::min()) {
14233 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
14234 << ": avoid UB for INT64_MIN\n");
14235 return false;
14237 // Same encoding for add/sub, just flip the sign.
14238 Immed = std::abs(Immed);
14239 bool IsLegal = ((Immed >> 12) == 0 ||
14240 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
14241 LLVM_DEBUG(dbgs() << "Is " << Immed
14242 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
14243 return IsLegal;
14246 // Return false to prevent folding
14247 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
14248 // if the folding leads to worse code.
14249 bool AArch64TargetLowering::isMulAddWithConstProfitable(
14250 SDValue AddNode, SDValue ConstNode) const {
14251 // Let the DAGCombiner decide for vector types and large types.
14252 const EVT VT = AddNode.getValueType();
14253 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
14254 return true;
14256   // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
14257   // has to be materialized with at least two instructions.
14258 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
14259 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
14260 const int64_t C1 = C1Node->getSExtValue();
14261 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
14262 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
14263 return true;
14264 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
14265 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
14266 if (Insn.size() > 1)
14267 return false;
14269 // Default to true and let the DAGCombiner decide.
14270 return true;
14273 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
14274 // immediates is the same as for an add or a sub.
14275 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
14276 return isLegalAddImmediate(Immed);
14279 /// isLegalAddressingMode - Return true if the addressing mode represented
14280 /// by AM is legal for this target, for a load/store of the specified type.
14281 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
14282 const AddrMode &AM, Type *Ty,
14283 unsigned AS, Instruction *I) const {
14284 // AArch64 has five basic addressing modes:
14285 // reg
14286 // reg + 9-bit signed offset
14287 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
14288 // reg1 + reg2
14289 // reg + SIZE_IN_BYTES * reg
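  //
  // Illustrative examples (a sketch): for an i64 load, 'ldr x0, [x1, #32760]'
  // uses the scaled 12-bit unsigned-offset form (4095 * 8), and
  // 'ldr x0, [x1, x2, lsl #3]' uses the reg + SIZE_IN_BYTES * reg form.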
14291 // No global is ever allowed as a base.
14292 if (AM.BaseGV)
14293 return false;
14295 // No reg+reg+imm addressing.
14296 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
14297 return false;
14299 // FIXME: Update this method to support scalable addressing modes.
14300 if (isa<ScalableVectorType>(Ty)) {
14301 uint64_t VecElemNumBytes =
14302 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
14303 return AM.HasBaseReg && !AM.BaseOffs &&
14304 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
14307 // check reg + imm case:
14308 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
14309 uint64_t NumBytes = 0;
14310 if (Ty->isSized()) {
14311 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
14312 NumBytes = NumBits / 8;
14313 if (!isPowerOf2_64(NumBits))
14314 NumBytes = 0;
14317 if (!AM.Scale) {
14318 int64_t Offset = AM.BaseOffs;
14320 // 9-bit signed offset
14321 if (isInt<9>(Offset))
14322 return true;
14324 // 12-bit unsigned offset
14325 unsigned shift = Log2_64(NumBytes);
14326 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
14327 // Must be a multiple of NumBytes (NumBytes is a power of 2)
14328 (Offset >> shift) << shift == Offset)
14329 return true;
14330 return false;
14333 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
14335 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
14338 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
14339 // Consider splitting large offset of struct or array.
14340 return true;
14343 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
14344 const MachineFunction &MF, EVT VT) const {
14345 VT = VT.getScalarType();
14347 if (!VT.isSimple())
14348 return false;
14350 switch (VT.getSimpleVT().SimpleTy) {
14351 case MVT::f16:
14352 return Subtarget->hasFullFP16();
14353 case MVT::f32:
14354 case MVT::f64:
14355 return true;
14356 default:
14357 break;
14360 return false;
14363 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
14364 Type *Ty) const {
14365 switch (Ty->getScalarType()->getTypeID()) {
14366 case Type::FloatTyID:
14367 case Type::DoubleTyID:
14368 return true;
14369 default:
14370 return false;
14374 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
14375 EVT VT, CodeGenOpt::Level OptLevel) const {
14376 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
14377 !useSVEForFixedLengthVectorVT(VT);
14380 const MCPhysReg *
14381 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
14382 // LR is a callee-save register, but we must treat it as clobbered by any call
14383 // site. Hence we include LR in the scratch registers, which are in turn added
14384 // as implicit-defs for stackmaps and patchpoints.
14385 static const MCPhysReg ScratchRegs[] = {
14386 AArch64::X16, AArch64::X17, AArch64::LR, 0
14388 return ScratchRegs;
14391 bool
14392 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
14393 CombineLevel Level) const {
14394 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
14395 N->getOpcode() == ISD::SRL) &&
14396 "Expected shift op");
14398 SDValue ShiftLHS = N->getOperand(0);
14399 EVT VT = N->getValueType(0);
14401 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not combine
14402 // it with shift 'N' to let it be lowered to UBFX.
14403 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
14404 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
14405 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
14406 if (isMask_64(TruncMask) &&
14407 ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
14408 isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
14409 return false;
14411 return true;
14414 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
14415 const SDNode *N) const {
14416 assert(N->getOpcode() == ISD::XOR &&
14417 (N->getOperand(0).getOpcode() == ISD::SHL ||
14418 N->getOperand(0).getOpcode() == ISD::SRL) &&
14419 "Expected XOR(SHIFT) pattern");
14421 // Only commute if the entire NOT mask is a hidden shifted mask.
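  // Worked example (a sketch): for an i32 xor(shl(x, 8), 0xFFFFFF00), the NOT
  // mask is a shifted mask with MaskIdx == 8 and MaskLen == 24 == 32 - 8, so
  // the xor can be commuted with the shl.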
14422 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
14423 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14424 if (XorC && ShiftC) {
14425 unsigned MaskIdx, MaskLen;
14426 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
14427 unsigned ShiftAmt = ShiftC->getZExtValue();
14428 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
14429 if (N->getOperand(0).getOpcode() == ISD::SHL)
14430 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
14431 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
14435 return false;
14438 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
14439 const SDNode *N, CombineLevel Level) const {
14440 assert(((N->getOpcode() == ISD::SHL &&
14441 N->getOperand(0).getOpcode() == ISD::SRL) ||
14442 (N->getOpcode() == ISD::SRL &&
14443 N->getOperand(0).getOpcode() == ISD::SHL)) &&
14444 "Expected shift-shift mask");
14445 // Don't allow multiuse shift folding with the same shift amount.
14446 if (!N->getOperand(0)->hasOneUse())
14447 return false;
14449   // Only fold srl(shl(x,c1),c2) iff c1 >= c2, to prevent loss of UBFX patterns.
14450 EVT VT = N->getValueType(0);
14451 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
14452 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
14453 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
14454 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
14457 return true;
14460 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
14461 Type *Ty) const {
14462 assert(Ty->isIntegerTy());
14464 unsigned BitSize = Ty->getPrimitiveSizeInBits();
14465 if (BitSize == 0)
14466 return false;
14468 int64_t Val = Imm.getSExtValue();
14469 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
14470 return true;
14472 if ((int64_t)Val < 0)
14473 Val = ~Val;
14474 if (BitSize == 32)
14475 Val &= (1LL << 32) - 1;
14477 unsigned LZ = countLeadingZeros((uint64_t)Val);
14478 unsigned Shift = (63 - LZ) / 16;
14479 // MOVZ is free so return true for one or fewer MOVK.
14480 return Shift < 3;
14483 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
14484 unsigned Index) const {
14485 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
14486 return false;
14488 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
14491 /// Turn vector tests of the signbit in the form of:
14492 /// xor (sra X, elt_size(X)-1), -1
14493 /// into:
14494 /// cmge X, X, #0
14495 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
14496 const AArch64Subtarget *Subtarget) {
14497 EVT VT = N->getValueType(0);
14498 if (!Subtarget->hasNEON() || !VT.isVector())
14499 return SDValue();
14501   // There must be an arithmetic shift right before the xor, and the xor must be
14502   // a 'not' operation.
14503 SDValue Shift = N->getOperand(0);
14504 SDValue Ones = N->getOperand(1);
14505 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
14506 !ISD::isBuildVectorAllOnes(Ones.getNode()))
14507 return SDValue();
14509 // The shift should be smearing the sign bit across each vector element.
14510 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
14511 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
14512 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
14513 return SDValue();
14515 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
14518 // Given a vecreduce_add node, detect the pattern below and convert it to the
14519 // node sequence with UABDL, [S|U]ABD and UADDLP.
14521 // i32 vecreduce_add(
14522 // v16i32 abs(
14523 // v16i32 sub(
14524 // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
14525 // =================>
14526 // i32 vecreduce_add(
14527 // v4i32 UADDLP(
14528 // v8i16 add(
14529 // v8i16 zext(
14530 // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
14531 // v8i16 zext(
14532 // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
14533 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
14534 SelectionDAG &DAG) {
14535 // Assumed i32 vecreduce_add
14536 if (N->getValueType(0) != MVT::i32)
14537 return SDValue();
14539 SDValue VecReduceOp0 = N->getOperand(0);
14540 unsigned Opcode = VecReduceOp0.getOpcode();
14541 // Assumed v16i32 abs
14542 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
14543 return SDValue();
14545 SDValue ABS = VecReduceOp0;
14546 // Assumed v16i32 sub
14547 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
14548 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
14549 return SDValue();
14551 SDValue SUB = ABS->getOperand(0);
14552 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
14553 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
14554 // Assumed v16i32 type
14555 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
14556 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
14557 return SDValue();
14559 // Assumed zext or sext
14560 bool IsZExt = false;
14561 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
14562 IsZExt = true;
14563 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
14564 IsZExt = false;
14565 } else
14566 return SDValue();
14568 SDValue EXT0 = SUB->getOperand(0);
14569 SDValue EXT1 = SUB->getOperand(1);
14570 // Assumed zext's operand has v16i8 type
14571 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
14572 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
14573 return SDValue();
14575   // Pattern is detected. Let's convert it to a sequence of nodes.
14576 SDLoc DL(N);
14578 // First, create the node pattern of UABD/SABD.
14579 SDValue UABDHigh8Op0 =
14580 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
14581 DAG.getConstant(8, DL, MVT::i64));
14582 SDValue UABDHigh8Op1 =
14583 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
14584 DAG.getConstant(8, DL, MVT::i64));
14585 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
14586 UABDHigh8Op0, UABDHigh8Op1);
14587 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
14589 // Second, create the node pattern of UABAL.
14590 SDValue UABDLo8Op0 =
14591 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
14592 DAG.getConstant(0, DL, MVT::i64));
14593 SDValue UABDLo8Op1 =
14594 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
14595 DAG.getConstant(0, DL, MVT::i64));
14596 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
14597 UABDLo8Op0, UABDLo8Op1);
14598 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
14599 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
14601 // Third, create the node of UADDLP.
14602 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
14604 // Fourth, create the node of VECREDUCE_ADD.
14605 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
14608 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
14609 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
14610 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
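// For example, with a v16i8 input the DOT accumulates into a v4i32 vector (v2i32
// for v8i8 inputs), so the final i32 vecreduce.add then reduces that accumulator.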
14611 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
14612 const AArch64Subtarget *ST) {
14613 if (!ST->hasDotProd())
14614 return performVecReduceAddCombineWithUADDLP(N, DAG);
14616 SDValue Op0 = N->getOperand(0);
14617 if (N->getValueType(0) != MVT::i32 ||
14618 Op0.getValueType().getVectorElementType() != MVT::i32)
14619 return SDValue();
14621 unsigned ExtOpcode = Op0.getOpcode();
14622 SDValue A = Op0;
14623 SDValue B;
14624 if (ExtOpcode == ISD::MUL) {
14625 A = Op0.getOperand(0);
14626 B = Op0.getOperand(1);
14627 if (A.getOpcode() != B.getOpcode() ||
14628 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
14629 return SDValue();
14630 ExtOpcode = A.getOpcode();
14632 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
14633 return SDValue();
14635 EVT Op0VT = A.getOperand(0).getValueType();
14636 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
14637 return SDValue();
14639 SDLoc DL(Op0);
14640 // For non-mla reductions B can be set to 1. For MLA we take the operand of
14641 // the extend B.
14642 if (!B)
14643 B = DAG.getConstant(1, DL, Op0VT);
14644 else
14645 B = B.getOperand(0);
14647 SDValue Zeros =
14648 DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
14649 auto DotOpcode =
14650 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
14651 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
14652 A.getOperand(0), B);
14653 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
14656 // Given an (integer) vecreduce, we know the order of the inputs does not
14657 // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
14658 // into UADDV(UADDLP(x)). This can also happen through an extra add, where we
14659 // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
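// For example, for x : v16i8 both add(zext(extract_lo(x)), zext(extract_hi(x)))
// and UADDLP(x) are v8i16 vectors whose elements sum to the same total (each byte
// of x is counted exactly once), so the enclosing reduction is unchanged.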
14660 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
14661 auto DetectAddExtract = [&](SDValue A) {
14662 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
14663 // UADDLP(x) if found.
14664 if (A.getOpcode() != ISD::ADD)
14665 return SDValue();
14666 EVT VT = A.getValueType();
14667 SDValue Op0 = A.getOperand(0);
14668 SDValue Op1 = A.getOperand(1);
14669 if (Op0.getOpcode() != Op1.getOpcode() ||
14670 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
14671 Op0.getOpcode() != ISD::SIGN_EXTEND))
14672 return SDValue();
14673 SDValue Ext0 = Op0.getOperand(0);
14674 SDValue Ext1 = Op1.getOperand(0);
14675 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14676 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14677 Ext0.getOperand(0) != Ext1.getOperand(0))
14678 return SDValue();
14679 // Check that the source vector has twice as many elements as the add type, and
14680 // that the extracts are from the upper/lower halves of the same source.
14681 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
14682 VT.getVectorNumElements() * 2)
14683 return SDValue();
14684 if ((Ext0.getConstantOperandVal(1) != 0 &&
14685 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
14686 (Ext1.getConstantOperandVal(1) != 0 &&
14687 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
14688 return SDValue();
14689 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
14690 : AArch64ISD::SADDLP;
14691 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
14694 SDValue A = N->getOperand(0);
14695 if (SDValue R = DetectAddExtract(A))
14696 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
14697 if (A.getOpcode() == ISD::ADD) {
14698 if (SDValue R = DetectAddExtract(A.getOperand(0)))
14699 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14700 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14701 A.getOperand(1)));
14702 if (SDValue R = DetectAddExtract(A.getOperand(1)))
14703 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14704 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14705 A.getOperand(0)));
14707 return SDValue();
14711 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
14712 TargetLowering::DAGCombinerInfo &DCI,
14713 const AArch64Subtarget *Subtarget) {
14714 if (DCI.isBeforeLegalizeOps())
14715 return SDValue();
14717 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
14720 SDValue
14721 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
14722 SelectionDAG &DAG,
14723 SmallVectorImpl<SDNode *> &Created) const {
14724 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14725 if (isIntDivCheap(N->getValueType(0), Attr))
14726 return SDValue(N, 0); // Lower SDIV as SDIV
14728 EVT VT = N->getValueType(0);
14730 // For scalable and fixed types, mark them as cheap so we can handle them much
14731 // later. This allows us to handle larger-than-legal types.
14732 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14733 return SDValue(N, 0);
14735 // fold (sdiv X, pow2)
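// For example, (sdiv x, 8) becomes (sra (csel (add x, 7), x, lt, (cmp x, 0)), 3),
// i.e. asr(x < 0 ? x + 7 : x, 3); for a negative divisor the result is then
// negated with an extra sub at the end.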
14736 if ((VT != MVT::i32 && VT != MVT::i64) ||
14737 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14738 return SDValue();
14740 SDLoc DL(N);
14741 SDValue N0 = N->getOperand(0);
14742 unsigned Lg2 = Divisor.countTrailingZeros();
14743 SDValue Zero = DAG.getConstant(0, DL, VT);
14744 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14746 // Add (N0 < 0) ? Pow2 - 1 : 0;
14747 SDValue CCVal;
14748 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
14749 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
14750 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
14752 Created.push_back(Cmp.getNode());
14753 Created.push_back(Add.getNode());
14754 Created.push_back(CSel.getNode());
14756 // Divide by pow2.
14757 SDValue SRA =
14758 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
14760 // If we're dividing by a positive value, we're done. Otherwise, we must
14761 // negate the result.
14762 if (Divisor.isNonNegative())
14763 return SRA;
14765 Created.push_back(SRA.getNode());
14766 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
14769 SDValue
14770 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
14771 SelectionDAG &DAG,
14772 SmallVectorImpl<SDNode *> &Created) const {
14773 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14774 if (isIntDivCheap(N->getValueType(0), Attr))
14775 return SDValue(N, 0); // Lower SREM as SREM
14777 EVT VT = N->getValueType(0);
14779 // For scalable and fixed types, mark them as cheap so we can handle them much
14780 // later. This allows us to handle larger-than-legal types.
14781 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14782 return SDValue(N, 0);
14784 // fold (srem X, pow2)
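// For example, (srem x, 8) becomes (csneg (and x, 7), (and (subs 0, x), 7), mi),
// i.e. x & 7 when x is positive and -((-x) & 7) otherwise, using the flags of
// the subs for the condition.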
14785 if ((VT != MVT::i32 && VT != MVT::i64) ||
14786 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14787 return SDValue();
14789 unsigned Lg2 = Divisor.countTrailingZeros();
14790 if (Lg2 == 0)
14791 return SDValue();
14793 SDLoc DL(N);
14794 SDValue N0 = N->getOperand(0);
14795 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14796 SDValue Zero = DAG.getConstant(0, DL, VT);
14797 SDValue CCVal, CSNeg;
14798 if (Lg2 == 1) {
14799 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
14800 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14801 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
14803 Created.push_back(Cmp.getNode());
14804 Created.push_back(And.getNode());
14805 } else {
14806 CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
14807 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
14809 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
14810 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14811 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
14812 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
14813 Negs.getValue(1));
14815 Created.push_back(Negs.getNode());
14816 Created.push_back(AndPos.getNode());
14817 Created.push_back(AndNeg.getNode());
14820 return CSNeg;
14823 static bool IsSVECntIntrinsic(SDValue S) {
14824 switch(getIntrinsicID(S.getNode())) {
14825 default:
14826 break;
14827 case Intrinsic::aarch64_sve_cntb:
14828 case Intrinsic::aarch64_sve_cnth:
14829 case Intrinsic::aarch64_sve_cntw:
14830 case Intrinsic::aarch64_sve_cntd:
14831 return true;
14833 return false;
14836 /// Calculates what the pre-extend type is, based on the extension
14837 /// operation node provided by \p Extend.
14839 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
14840 /// pre-extend type is pulled directly from the operand, while other extend
14841 /// operations need a bit more inspection to get this information.
14843 /// \param Extend The SDNode from the DAG that represents the extend operation
14845 /// \returns The type representing the \p Extend source type, or \p MVT::Other
14846 /// if no valid type can be determined
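/// For example, (and x, 0xff) yields MVT::i8, while (sign_extend_inreg x, i16)
/// and (AssertZext x, i16) both yield MVT::i16.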
14847 static EVT calculatePreExtendType(SDValue Extend) {
14848 switch (Extend.getOpcode()) {
14849 case ISD::SIGN_EXTEND:
14850 case ISD::ZERO_EXTEND:
14851 return Extend.getOperand(0).getValueType();
14852 case ISD::AssertSext:
14853 case ISD::AssertZext:
14854 case ISD::SIGN_EXTEND_INREG: {
14855 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
14856 if (!TypeNode)
14857 return MVT::Other;
14858 return TypeNode->getVT();
14860 case ISD::AND: {
14861 ConstantSDNode *Constant =
14862 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
14863 if (!Constant)
14864 return MVT::Other;
14866 uint32_t Mask = Constant->getZExtValue();
14868 if (Mask == UCHAR_MAX)
14869 return MVT::i8;
14870 else if (Mask == USHRT_MAX)
14871 return MVT::i16;
14872 else if (Mask == UINT_MAX)
14873 return MVT::i32;
14875 return MVT::Other;
14877 default:
14878 return MVT::Other;
14882 /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
14883 /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
14884 /// SExt/ZExt rather than the scalar SExt/ZExt
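/// For example, (v8i16 (build_vector (i16 (sext (i8 a))), ...)) can become
/// (v8i16 (sign_extend (v8i8 (build_vector a, ...)))), assuming every element is
/// sign-extended from the same pre-extend type.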
14885 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
14886 EVT VT = BV.getValueType();
14887 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
14888 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
14889 return SDValue();
14891 // Use the first item in the buildvector/shuffle to get the size of the
14892 // extend, and make sure it looks valid.
14893 SDValue Extend = BV->getOperand(0);
14894 unsigned ExtendOpcode = Extend.getOpcode();
14895 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
14896 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
14897 ExtendOpcode == ISD::AssertSext;
14898 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
14899 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
14900 return SDValue();
14901 // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to ensure
14902 // calculatePreExtendType will work without issue.
14903 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
14904 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
14905 return SDValue();
14907 // Restrict valid pre-extend data type
14908 EVT PreExtendType = calculatePreExtendType(Extend);
14909 if (PreExtendType == MVT::Other ||
14910 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
14911 return SDValue();
14913 // Make sure all other operands are equally extended
14914 for (SDValue Op : drop_begin(BV->ops())) {
14915 if (Op.isUndef())
14916 continue;
14917 unsigned Opc = Op.getOpcode();
14918 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
14919 Opc == ISD::AssertSext;
14920 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
14921 return SDValue();
14924 SDValue NBV;
14925 SDLoc DL(BV);
14926 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14927 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
14928 EVT PreExtendLegalType =
14929 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
14930 SmallVector<SDValue, 8> NewOps;
14931 for (SDValue Op : BV->ops())
14932 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
14933 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
14934 PreExtendLegalType));
14935 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
14936 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
14937 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
14938 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
14939 BV.getOperand(1).isUndef()
14940 ? DAG.getUNDEF(PreExtendVT)
14941 : BV.getOperand(1).getOperand(0),
14942 cast<ShuffleVectorSDNode>(BV)->getMask());
14944 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
14947 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
14948 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
14949 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
14950 // If the value type isn't a vector, none of the operands are going to be dups
14951 EVT VT = Mul->getValueType(0);
14952 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
14953 return SDValue();
14955 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
14956 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
14958 // Neither operand has been changed; don't make any further changes
14959 if (!Op0 && !Op1)
14960 return SDValue();
14962 SDLoc DL(Mul);
14963 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
14964 Op1 ? Op1 : Mul->getOperand(1));
14967 // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
14968 // Same for other types with equivalent constants.
14969 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
14970 EVT VT = N->getValueType(0);
14971 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
14972 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
14973 return SDValue();
14974 if (N->getOperand(0).getOpcode() != ISD::AND ||
14975 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
14976 return SDValue();
14978 SDValue And = N->getOperand(0);
14979 SDValue Srl = And.getOperand(0);
14981 APInt V1, V2, V3;
14982 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
14983 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
14984 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
14985 return SDValue();
14987 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
14988 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
14989 V3 != (HalfSize - 1))
14990 return SDValue();
14992 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
14993 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
14994 VT.getVectorElementCount() * 2);
14996 SDLoc DL(N);
14997 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
14998 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
14999 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
15002 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
15003 TargetLowering::DAGCombinerInfo &DCI,
15004 const AArch64Subtarget *Subtarget) {
15006 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
15007 return Ext;
15008 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
15009 return Ext;
15011 if (DCI.isBeforeLegalizeOps())
15012 return SDValue();
15014 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
15015 // and in MachineCombiner pass, add+mul will be combined into madd.
15016 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
15017 SDLoc DL(N);
15018 EVT VT = N->getValueType(0);
15019 SDValue N0 = N->getOperand(0);
15020 SDValue N1 = N->getOperand(1);
15021 SDValue MulOper;
15022 unsigned AddSubOpc;
15024 auto IsAddSubWith1 = [&](SDValue V) -> bool {
15025 AddSubOpc = V->getOpcode();
15026 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
15027 SDValue Opnd = V->getOperand(1);
15028 MulOper = V->getOperand(0);
15029 if (AddSubOpc == ISD::SUB)
15030 std::swap(Opnd, MulOper);
15031 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
15032 return C->isOne();
15034 return false;
15037 if (IsAddSubWith1(N0)) {
15038 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
15039 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
15042 if (IsAddSubWith1(N1)) {
15043 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
15044 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
15047 // The below optimizations require a constant RHS.
15048 if (!isa<ConstantSDNode>(N1))
15049 return SDValue();
15051 ConstantSDNode *C = cast<ConstantSDNode>(N1);
15052 const APInt &ConstValue = C->getAPIntValue();
15054 // Allow the scaling to be folded into the `cnt` instruction by preventing
15055 // the scaling from being obscured here. This makes it easier to pattern match.
15056 if (IsSVECntIntrinsic(N0) ||
15057 (N0->getOpcode() == ISD::TRUNCATE &&
15058 (IsSVECntIntrinsic(N0->getOperand(0)))))
15059 if (ConstValue.sge(1) && ConstValue.sle(16))
15060 return SDValue();
15062 // Multiplication of a power of two plus/minus one can be done more
15063 // cheaply as a shift+add/sub. For now, this is true unilaterally. If
15064 // future CPUs have a cheaper MADD instruction, this may need to be
15065 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
15066 // 64-bit is 5 cycles, so this is always a win.
15067 // More aggressively, some multiplications N0 * C can be lowered to
15068 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
15069 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
15070 // TODO: lower more cases.
15072 // TrailingZeroes is used to test if the mul can be lowered to
15073 // shift+add+shift.
15074 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
15075 if (TrailingZeroes) {
15076 // Conservatively do not lower to shift+add+shift if the mul might be
15077 // folded into smul or umul.
15078 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
15079 isZeroExtended(N0.getNode(), DAG)))
15080 return SDValue();
15081 // Conservatively do not lower to shift+add+shift if the mul might be
15082 // folded into madd or msub.
15083 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
15084 N->use_begin()->getOpcode() == ISD::SUB))
15085 return SDValue();
15087 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
15088 // and shift+add+shift.
15089 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
15090 unsigned ShiftAmt;
15092 auto Shl = [&](SDValue N0, unsigned N1) {
15093 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
15094 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
15096 auto Add = [&](SDValue N0, SDValue N1) {
15097 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
15099 auto Sub = [&](SDValue N0, SDValue N1) {
15100 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
15102 auto Negate = [&](SDValue N) {
15103 SDValue Zero = DAG.getConstant(0, DL, VT);
15104 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
15107 // Can the constant C be decomposed into (1+2^M1)*(1+2^N1)? E.g.,
15108 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
15109 // the (2^N - 1) part can't be executed via a single instruction.
15110 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
15111 unsigned BitWidth = C.getBitWidth();
15112 for (unsigned i = 1; i < BitWidth / 2; i++) {
15113 APInt Rem;
15114 APInt X(BitWidth, (1ULL << i) + 1);
15115 APInt::sdivrem(C, X, N, Rem);
15116 APInt NVMinus1 = N - 1;
15117 if (Rem == 0 && NVMinus1.isPowerOf2()) {
15118 M = X;
15119 return true;
15122 return false;
15125 if (ConstValue.isNonNegative()) {
15126 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
15127 // (mul x, 2^N - 1) => (sub (shl x, N), x)
15128 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
15129 // (mul x, (2^M + 1) * (2^N + 1))
15130 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
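// For example: (mul x, 6) => (shl (add (shl x, 1), x), 1), (mul x, 7) =>
// (sub (shl x, 3), x), (mul x, 14) => (sub (shl x, 4), (shl x, 1)), and, with
// LSLFast, (mul x, 45) => MV = (add (shl x, 2), x); (add (shl MV, 3), MV).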
15131 APInt SCVMinus1 = ShiftedConstValue - 1;
15132 APInt SCVPlus1 = ShiftedConstValue + 1;
15133 APInt CVPlus1 = ConstValue + 1;
15134 APInt CVM, CVN;
15135 if (SCVMinus1.isPowerOf2()) {
15136 ShiftAmt = SCVMinus1.logBase2();
15137 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
15138 } else if (CVPlus1.isPowerOf2()) {
15139 ShiftAmt = CVPlus1.logBase2();
15140 return Sub(Shl(N0, ShiftAmt), N0);
15141 } else if (SCVPlus1.isPowerOf2()) {
15142 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15143 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
15144 } else if (Subtarget->hasLSLFast() &&
15145 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
15146 APInt CVMMinus1 = CVM - 1;
15147 APInt CVNMinus1 = CVN - 1;
15148 unsigned ShiftM1 = CVMMinus1.logBase2();
15149 unsigned ShiftN1 = CVNMinus1.logBase2();
15150 // LSLFast implies that shifts of up to 3 places are fast
15151 if (ShiftM1 <= 3 && ShiftN1 <= 3) {
15152 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
15153 return Add(Shl(MVal, ShiftN1), MVal);
15156 } else {
15157 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
15158 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
15159 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
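// For example: (mul x, -7) => (sub x, (shl x, 3)), (mul x, -9) =>
// - (add (shl x, 3), x), and (mul x, -14) => (sub (shl x, 1), (shl x, 4)).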
15160 APInt SCVPlus1 = -ShiftedConstValue + 1;
15161 APInt CVNegPlus1 = -ConstValue + 1;
15162 APInt CVNegMinus1 = -ConstValue - 1;
15163 if (CVNegPlus1.isPowerOf2()) {
15164 ShiftAmt = CVNegPlus1.logBase2();
15165 return Sub(N0, Shl(N0, ShiftAmt));
15166 } else if (CVNegMinus1.isPowerOf2()) {
15167 ShiftAmt = CVNegMinus1.logBase2();
15168 return Negate(Add(Shl(N0, ShiftAmt), N0));
15169 } else if (SCVPlus1.isPowerOf2()) {
15170 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
15171 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
15175 return SDValue();
15178 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
15179 SelectionDAG &DAG) {
15180 // Take advantage of vector comparisons producing 0 or -1 in each lane to
15181 // optimize away operation when it's from a constant.
15183 // The general transformation is:
15184 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
15185 // AND(VECTOR_CMP(x,y), constant2)
15186 // constant2 = UNARYOP(constant)
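// For example, (sint_to_fp (and (setcc x, y), splat(i32 1))) becomes
// (bitcast (and (setcc x, y), (bitcast splat(f32 1.0)))): each lane of the
// comparison is 0 or -1, so the AND directly produces 0.0 or 1.0.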
15188 // Early exit if this isn't a vector operation, the operand of the
15189 // unary operation isn't a bitwise AND, or if the sizes of the operations
15190 // aren't the same.
15191 EVT VT = N->getValueType(0);
15192 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
15193 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
15194 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
15195 return SDValue();
15197 // Now check that the other operand of the AND is a constant. We could
15198 // make the transformation for non-constant splats as well, but it's unclear
15199 // that would be a benefit as it would not eliminate any operations, just
15200 // perform one more step in scalar code before moving to the vector unit.
15201 if (BuildVectorSDNode *BV =
15202 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
15203 // Bail out if the vector isn't a constant.
15204 if (!BV->isConstant())
15205 return SDValue();
15207 // Everything checks out. Build up the new and improved node.
15208 SDLoc DL(N);
15209 EVT IntVT = BV->getValueType(0);
15210 // Create a new constant of the appropriate type for the transformed
15211 // DAG.
15212 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
15213 // The AND node needs bitcasts to/from an integer vector type around it.
15214 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
15215 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
15216 N->getOperand(0)->getOperand(0), MaskConst);
15217 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
15218 return Res;
15221 return SDValue();
15224 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
15225 const AArch64Subtarget *Subtarget) {
15226 // First try to optimize away the conversion when it's conditionally from
15227 // a constant. Vectors only.
15228 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
15229 return Res;
15231 EVT VT = N->getValueType(0);
15232 if (VT != MVT::f32 && VT != MVT::f64)
15233 return SDValue();
15235 // Only optimize when the source and destination types have the same width.
15236 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
15237 return SDValue();
15239 // If the result of an integer load is only used by an integer-to-float
15240 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
15241 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
15242 SDValue N0 = N->getOperand(0);
15243 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
15244 // Do not change the width of a volatile load.
15245 !cast<LoadSDNode>(N0)->isVolatile()) {
15246 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15247 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
15248 LN0->getPointerInfo(), LN0->getAlign(),
15249 LN0->getMemOperand()->getFlags());
15251 // Make sure successors of the original load stay after it by updating them
15252 // to use the new Chain.
15253 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
15255 unsigned Opcode =
15256 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
15257 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
15260 return SDValue();
15263 /// Fold a floating-point multiply by power of two into floating-point to
15264 /// fixed-point conversion.
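/// For example, (v4i32 (fp_to_sint (fmul (v4f32 x), splat(8.0)))) becomes a
/// vcvtfp2fxs conversion of x with 3 fractional bits.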
15265 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
15266 TargetLowering::DAGCombinerInfo &DCI,
15267 const AArch64Subtarget *Subtarget) {
15268 if (!Subtarget->hasNEON())
15269 return SDValue();
15271 if (!N->getValueType(0).isSimple())
15272 return SDValue();
15274 SDValue Op = N->getOperand(0);
15275 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
15276 return SDValue();
15278 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
15279 return SDValue();
15281 SDValue ConstVec = Op->getOperand(1);
15282 if (!isa<BuildVectorSDNode>(ConstVec))
15283 return SDValue();
15285 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
15286 uint32_t FloatBits = FloatTy.getSizeInBits();
15287 if (FloatBits != 32 && FloatBits != 64 &&
15288 (FloatBits != 16 || !Subtarget->hasFullFP16()))
15289 return SDValue();
15291 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
15292 uint32_t IntBits = IntTy.getSizeInBits();
15293 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15294 return SDValue();
15296 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
15297 if (IntBits > FloatBits)
15298 return SDValue();
15300 BitVector UndefElements;
15301 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15302 int32_t Bits = IntBits == 64 ? 64 : 32;
15303 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
15304 if (C == -1 || C == 0 || C > Bits)
15305 return SDValue();
15307 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
15308 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
15309 return SDValue();
15311 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
15312 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
15313 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
15314 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
15315 return SDValue();
15318 SDLoc DL(N);
15319 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
15320 N->getOpcode() == ISD::FP_TO_SINT_SAT);
15321 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
15322 : Intrinsic::aarch64_neon_vcvtfp2fxu;
15323 SDValue FixConv =
15324 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
15325 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
15326 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
15327 // We can handle smaller integers by generating an extra trunc.
15328 if (IntBits < FloatBits)
15329 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
15331 return FixConv;
15334 /// Fold a floating-point divide by power of two into fixed-point to
15335 /// floating-point conversion.
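/// For example, (v4f32 (fdiv (sint_to_fp (v4i32 x)), splat(8.0))) becomes a
/// vcvtfxs2fp conversion of x with 3 fractional bits.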
15336 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
15337 TargetLowering::DAGCombinerInfo &DCI,
15338 const AArch64Subtarget *Subtarget) {
15339 if (!Subtarget->hasNEON())
15340 return SDValue();
15342 SDValue Op = N->getOperand(0);
15343 unsigned Opc = Op->getOpcode();
15344 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
15345 !Op.getOperand(0).getValueType().isSimple() ||
15346 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
15347 return SDValue();
15349 SDValue ConstVec = N->getOperand(1);
15350 if (!isa<BuildVectorSDNode>(ConstVec))
15351 return SDValue();
15353 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
15354 int32_t IntBits = IntTy.getSizeInBits();
15355 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
15356 return SDValue();
15358 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
15359 int32_t FloatBits = FloatTy.getSizeInBits();
15360 if (FloatBits != 32 && FloatBits != 64)
15361 return SDValue();
15363 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
15364 if (IntBits > FloatBits)
15365 return SDValue();
15367 BitVector UndefElements;
15368 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
15369 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
15370 if (C == -1 || C == 0 || C > FloatBits)
15371 return SDValue();
15373 MVT ResTy;
15374 unsigned NumLanes = Op.getValueType().getVectorNumElements();
15375 switch (NumLanes) {
15376 default:
15377 return SDValue();
15378 case 2:
15379 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
15380 break;
15381 case 4:
15382 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
15383 break;
15386 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
15387 return SDValue();
15389 SDLoc DL(N);
15390 SDValue ConvInput = Op.getOperand(0);
15391 bool IsSigned = Opc == ISD::SINT_TO_FP;
15392 if (IntBits < FloatBits)
15393 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
15394 ResTy, ConvInput);
15396 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
15397 : Intrinsic::aarch64_neon_vcvtfxu2fp;
15398 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
15399 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
15400 DAG.getConstant(C, DL, MVT::i32));
15403 /// An EXTR instruction is made up of two shifts, ORed together. This helper
15404 /// searches for and classifies those shifts.
15405 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
15406 bool &FromHi) {
15407 if (N.getOpcode() == ISD::SHL)
15408 FromHi = false;
15409 else if (N.getOpcode() == ISD::SRL)
15410 FromHi = true;
15411 else
15412 return false;
15414 if (!isa<ConstantSDNode>(N.getOperand(1)))
15415 return false;
15417 ShiftAmount = N->getConstantOperandVal(1);
15418 Src = N->getOperand(0);
15419 return true;
15422 /// EXTR instruction extracts a contiguous chunk of bits from two existing
15423 /// registers viewed as a high/low pair. This function looks for the pattern:
15424 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
15425 /// with an EXTR. Can't quite be done in TableGen because the two immediates
15426 /// aren't independent.
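/// For example, for i32: <tt>(or (shl x, \#24), (srl y, \#8))</tt> becomes
/// <tt>(EXTR x, y, \#8)</tt>.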
15427 static SDValue tryCombineToEXTR(SDNode *N,
15428 TargetLowering::DAGCombinerInfo &DCI) {
15429 SelectionDAG &DAG = DCI.DAG;
15430 SDLoc DL(N);
15431 EVT VT = N->getValueType(0);
15433 assert(N->getOpcode() == ISD::OR && "Unexpected root");
15435 if (VT != MVT::i32 && VT != MVT::i64)
15436 return SDValue();
15438 SDValue LHS;
15439 uint32_t ShiftLHS = 0;
15440 bool LHSFromHi = false;
15441 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
15442 return SDValue();
15444 SDValue RHS;
15445 uint32_t ShiftRHS = 0;
15446 bool RHSFromHi = false;
15447 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
15448 return SDValue();
15450 // If they're both trying to come from the high part of the register, they're
15451 // not really an EXTR.
15452 if (LHSFromHi == RHSFromHi)
15453 return SDValue();
15455 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
15456 return SDValue();
15458 if (LHSFromHi) {
15459 std::swap(LHS, RHS);
15460 std::swap(ShiftLHS, ShiftRHS);
15463 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
15464 DAG.getConstant(ShiftRHS, DL, MVT::i64));
15467 static SDValue tryCombineToBSL(SDNode *N,
15468 TargetLowering::DAGCombinerInfo &DCI) {
15469 EVT VT = N->getValueType(0);
15470 SelectionDAG &DAG = DCI.DAG;
15471 SDLoc DL(N);
15473 if (!VT.isVector())
15474 return SDValue();
15476 // The combining code currently only works for NEON vectors. In particular,
15477 // it does not work for SVE when dealing with vectors wider than 128 bits.
15478 if (!VT.is64BitVector() && !VT.is128BitVector())
15479 return SDValue();
15481 SDValue N0 = N->getOperand(0);
15482 if (N0.getOpcode() != ISD::AND)
15483 return SDValue();
15485 SDValue N1 = N->getOperand(1);
15486 if (N1.getOpcode() != ISD::AND)
15487 return SDValue();
15489 // InstCombine does (not (neg a)) => (add a -1).
15490 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15491 // Loop over all combinations of AND operands.
15492 for (int i = 1; i >= 0; --i) {
15493 for (int j = 1; j >= 0; --j) {
15494 SDValue O0 = N0->getOperand(i);
15495 SDValue O1 = N1->getOperand(j);
15496 SDValue Sub, Add, SubSibling, AddSibling;
15498 // Find a SUB and an ADD operand, one from each AND.
15499 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15500 Sub = O0;
15501 Add = O1;
15502 SubSibling = N0->getOperand(1 - i);
15503 AddSibling = N1->getOperand(1 - j);
15504 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15505 Add = O0;
15506 Sub = O1;
15507 AddSibling = N0->getOperand(1 - i);
15508 SubSibling = N1->getOperand(1 - j);
15509 } else
15510 continue;
15512 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
15513 continue;
15515 // The all-ones constant is always the right-hand operand of the Add.
15516 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
15517 continue;
15519 if (Sub.getOperand(1) != Add.getOperand(0))
15520 continue;
15522 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15526 // (or (and a b) (and (not a) c)) => (bsl a b c)
15527 // We only have to look for constant vectors here since the general, variable
15528 // case can be handled in TableGen.
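// For example, for v8i16: (or (and b, splat(0x00ff)), (and c, splat(0xff00)))
// becomes (bsl splat(0x00ff), b, c), where bits set in the mask select from b
// and clear bits select from c.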
15529 unsigned Bits = VT.getScalarSizeInBits();
15530 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
15531 for (int i = 1; i >= 0; --i)
15532 for (int j = 1; j >= 0; --j) {
15533 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15534 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15535 if (!BVN0 || !BVN1)
15536 continue;
15538 bool FoundMatch = true;
15539 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15540 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15541 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15542 if (!CN0 || !CN1 ||
15543 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
15544 FoundMatch = false;
15545 break;
15549 if (FoundMatch)
15550 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
15551 N0->getOperand(1 - i), N1->getOperand(1 - j));
15554 return SDValue();
15557 // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
15558 // convert to csel(ccmp(.., cc0)), depending on cc1:
15560 // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
15561 // =>
15562 // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
15564 // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
15565 // =>
15566 // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
15567 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
15568 EVT VT = N->getValueType(0);
15569 SDValue CSel0 = N->getOperand(0);
15570 SDValue CSel1 = N->getOperand(1);
15572 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
15573 CSel1.getOpcode() != AArch64ISD::CSEL)
15574 return SDValue();
15576 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
15577 return SDValue();
15579 if (!isNullConstant(CSel0.getOperand(0)) ||
15580 !isOneConstant(CSel0.getOperand(1)) ||
15581 !isNullConstant(CSel1.getOperand(0)) ||
15582 !isOneConstant(CSel1.getOperand(1)))
15583 return SDValue();
15585 SDValue Cmp0 = CSel0.getOperand(3);
15586 SDValue Cmp1 = CSel1.getOperand(3);
15587 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
15588 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
15589 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
15590 return SDValue();
15591 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
15592 Cmp0.getOpcode() == AArch64ISD::SUBS) {
15593 std::swap(Cmp0, Cmp1);
15594 std::swap(CC0, CC1);
15597 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
15598 return SDValue();
15600 SDLoc DL(N);
15601 SDValue CCmp, Condition;
15602 unsigned NZCV;
15604 if (N->getOpcode() == ISD::AND) {
15605 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
15606 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
15607 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
15608 } else {
15609 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
15610 Condition = DAG.getConstant(CC0, DL, MVT_CC);
15611 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
15614 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
15616 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
15617 if (Op1 && Op1->getAPIntValue().isNegative() &&
15618 Op1->getAPIntValue().sgt(-32)) {
15619 // CCMP accepts a constant in the range [0, 31].
15620 // If Op1 is a constant in the range [-31, -1], we
15621 // can select CCMN instead to avoid the extra mov.
15622 SDValue AbsOp1 =
15623 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
15624 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
15625 NZCVOp, Condition, Cmp0);
15626 } else {
15627 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
15628 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
15630 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
15631 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
15632 CCmp);
15635 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15636 const AArch64Subtarget *Subtarget) {
15637 SelectionDAG &DAG = DCI.DAG;
15638 EVT VT = N->getValueType(0);
15640 if (SDValue R = performANDORCSELCombine(N, DAG))
15641 return R;
15643 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15644 return SDValue();
15646 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
15647 if (SDValue Res = tryCombineToEXTR(N, DCI))
15648 return Res;
15650 if (SDValue Res = tryCombineToBSL(N, DCI))
15651 return Res;
15653 return SDValue();
15656 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
15657 if (!MemVT.getVectorElementType().isSimple())
15658 return false;
15660 uint64_t MaskForTy = 0ull;
15661 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
15662 case MVT::i8:
15663 MaskForTy = 0xffull;
15664 break;
15665 case MVT::i16:
15666 MaskForTy = 0xffffull;
15667 break;
15668 case MVT::i32:
15669 MaskForTy = 0xffffffffull;
15670 break;
15671 default:
15672 return false;
15673 break;
15676 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
15677 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
15678 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
15680 return false;
15683 static SDValue performSVEAndCombine(SDNode *N,
15684 TargetLowering::DAGCombinerInfo &DCI) {
15685 if (DCI.isBeforeLegalizeOps())
15686 return SDValue();
15688 SelectionDAG &DAG = DCI.DAG;
15689 SDValue Src = N->getOperand(0);
15690 unsigned Opc = Src->getOpcode();
15692 // Zero/any extend of an unsigned unpack
15693 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
15694 SDValue UnpkOp = Src->getOperand(0);
15695 SDValue Dup = N->getOperand(1);
15697 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
15698 return SDValue();
15700 SDLoc DL(N);
15701 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
15702 if (!C)
15703 return SDValue();
15705 uint64_t ExtVal = C->getZExtValue();
15707 // If the mask is fully covered by the unpack, we don't need to push
15708 // a new AND onto the operand
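// e.g. (and (uunpklo (nxv16i8 x)), splat(0xff)) already zero-extends each byte,
// so it folds to the unpack itself.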
15709 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
15710 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
15711 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
15712 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
15713 return Src;
15715 // Truncate to prevent a DUP with an over-wide constant
15716 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
15718 // Otherwise, make sure we propagate the AND to the operand
15719 // of the unpack
15720 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
15721 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
15723 SDValue And = DAG.getNode(ISD::AND, DL,
15724 UnpkOp->getValueType(0), UnpkOp, Dup);
15726 return DAG.getNode(Opc, DL, N->getValueType(0), And);
15729 if (!EnableCombineMGatherIntrinsics)
15730 return SDValue();
15732 SDValue Mask = N->getOperand(1);
15734 if (!Src.hasOneUse())
15735 return SDValue();
15737 EVT MemVT;
15739 // SVE load instructions perform an implicit zero-extend, which makes them
15740 // perfect candidates for combining.
15741 switch (Opc) {
15742 case AArch64ISD::LD1_MERGE_ZERO:
15743 case AArch64ISD::LDNF1_MERGE_ZERO:
15744 case AArch64ISD::LDFF1_MERGE_ZERO:
15745 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
15746 break;
15747 case AArch64ISD::GLD1_MERGE_ZERO:
15748 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
15749 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
15750 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
15751 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
15752 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
15753 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
15754 case AArch64ISD::GLDFF1_MERGE_ZERO:
15755 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
15756 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
15757 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
15758 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
15759 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
15760 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
15761 case AArch64ISD::GLDNT1_MERGE_ZERO:
15762 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
15763 break;
15764 default:
15765 return SDValue();
15768 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
15769 return Src;
15771 return SDValue();
15774 static SDValue performANDCombine(SDNode *N,
15775 TargetLowering::DAGCombinerInfo &DCI) {
15776 SelectionDAG &DAG = DCI.DAG;
15777 SDValue LHS = N->getOperand(0);
15778 SDValue RHS = N->getOperand(1);
15779 EVT VT = N->getValueType(0);
15781 if (SDValue R = performANDORCSELCombine(N, DAG))
15782 return R;
15784 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15785 return SDValue();
15787 if (VT.isScalableVector())
15788 return performSVEAndCombine(N, DCI);
15790 // The combining code below works only for NEON vectors. In particular, it
15791 // does not work for SVE when dealing with vectors wider than 128 bits.
15792 if (!VT.is64BitVector() && !VT.is128BitVector())
15793 return SDValue();
15795 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15796 if (!BVN)
15797 return SDValue();
15799 // AND does not accept an immediate, so check if we can use a BIC immediate
15800 // instruction instead. We do this here instead of using a (and x, (mvni imm))
15801 // pattern in isel, because some immediates may be lowered to the preferred
15802 // (and x, (movi imm)) form, even though an mvni representation also exists.
15803 APInt DefBits(VT.getSizeInBits(), 0);
15804 APInt UndefBits(VT.getSizeInBits(), 0);
15805 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15806 SDValue NewOp;
15808 DefBits = ~DefBits;
15809 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15810 DefBits, &LHS)) ||
15811 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15812 DefBits, &LHS)))
15813 return NewOp;
15815 UndefBits = ~UndefBits;
15816 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15817 UndefBits, &LHS)) ||
15818 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15819 UndefBits, &LHS)))
15820 return NewOp;
15823 return SDValue();
15826 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
15827 switch (Opcode) {
15828 case ISD::STRICT_FADD:
15829 case ISD::FADD:
15830 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
15831 case ISD::ADD:
15832 return VT == MVT::i64;
15833 default:
15834 return false;
15838 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
15839 AArch64CC::CondCode Cond);
15841 static bool isPredicateCCSettingOp(SDValue N) {
15842 if ((N.getOpcode() == ISD::SETCC) ||
15843 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
15844 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
15845 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
15846 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
15847 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
15848 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
15849 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
15850 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
15851 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
15852 // get_active_lane_mask is lowered to a whilelo instruction.
15853 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
15854 return true;
15856 return false;
15859 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
15860 // ... into: "ptrue p, all" + PTEST
15861 static SDValue
15862 performFirstTrueTestVectorCombine(SDNode *N,
15863 TargetLowering::DAGCombinerInfo &DCI,
15864 const AArch64Subtarget *Subtarget) {
15865 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15866 // Make sure PTEST can be legalised with illegal types.
15867 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15868 return SDValue();
15870 SDValue N0 = N->getOperand(0);
15871 EVT VT = N0.getValueType();
15873 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
15874 !isNullConstant(N->getOperand(1)))
15875 return SDValue();
15877 // Restrict the DAG combine to only cases where we're extracting from a
15878 // flag-setting operation.
15879 if (!isPredicateCCSettingOp(N0))
15880 return SDValue();
15882 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
15883 SelectionDAG &DAG = DCI.DAG;
15884 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
15885 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
15888 // Materialize : Idx = (add (mul vscale, NumEls), -1)
15889 // i1 = extract_vector_elt t37, Constant:i64<Idx>
15890 // ... into: "ptrue p, all" + PTEST
15891 static SDValue
15892 performLastTrueTestVectorCombine(SDNode *N,
15893 TargetLowering::DAGCombinerInfo &DCI,
15894 const AArch64Subtarget *Subtarget) {
15895 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15896 // Make sure PTEST can be legalised with illegal types.
15897 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15898 return SDValue();
15900 SDValue N0 = N->getOperand(0);
15901 EVT OpVT = N0.getValueType();
15903 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
15904 return SDValue();
15906 // Idx == (add (mul vscale, NumEls), -1)
15907 SDValue Idx = N->getOperand(1);
15908 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
15909 return SDValue();
15911 SDValue VS = Idx.getOperand(0);
15912 if (VS.getOpcode() != ISD::VSCALE)
15913 return SDValue();
15915 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
15916 if (VS.getConstantOperandVal(0) != NumEls)
15917 return SDValue();
15919 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
15920 SelectionDAG &DAG = DCI.DAG;
15921 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
15922 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
15925 static SDValue
15926 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15927 const AArch64Subtarget *Subtarget) {
15928 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15929 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
15930 return Res;
15931 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
15932 return Res;
15934 SelectionDAG &DAG = DCI.DAG;
15935 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
15936 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
15938 EVT VT = N->getValueType(0);
15939 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15940 bool IsStrict = N0->isStrictFPOpcode();
15942 // extract(dup x) -> x
15943 if (N0.getOpcode() == AArch64ISD::DUP)
15944 return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
15946 // Rewrite for pairwise fadd pattern
15947 // (f32 (extract_vector_elt
15948 // (fadd (vXf32 Other)
15949 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
15950 // ->
15951 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
15952 // (extract_vector_elt (vXf32 Other) 1))
15953 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
15954 // we can only do this when it's used only by the extract_vector_elt.
15955 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
15956 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
15957 (!IsStrict || N0.hasOneUse())) {
15958 SDLoc DL(N0);
15959 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
15960 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
15962 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
15963 SDValue Other = N00;
15965 // And handle the commutative case.
15966 if (!Shuffle) {
15967 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
15968 Other = N01;
15971 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
15972 Other == Shuffle->getOperand(0)) {
15973 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15974 DAG.getConstant(0, DL, MVT::i64));
15975 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15976 DAG.getConstant(1, DL, MVT::i64));
15977 if (!IsStrict)
15978 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
15980 // For strict_fadd we need uses of the final extract_vector to be replaced
15981 // with the strict_fadd, but we also need uses of the chain output of the
15982 // original strict_fadd to use the chain output of the new strict_fadd as
15983 // otherwise it may not be deleted.
15984 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
15985 {VT, MVT::Other},
15986 {N0->getOperand(0), Extract1, Extract2});
15987 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
15988 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
15989 return SDValue(N, 0);
15993 return SDValue();
15996 static SDValue performConcatVectorsCombine(SDNode *N,
15997 TargetLowering::DAGCombinerInfo &DCI,
15998 SelectionDAG &DAG) {
15999 SDLoc dl(N);
16000 EVT VT = N->getValueType(0);
16001 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
16002 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
16004 if (VT.isScalableVector())
16005 return SDValue();
16007 // Optimize concat_vectors of truncated vectors, where the intermediate
16008 // type is illegal, to avoid said illegality, e.g.,
16009 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
16010 // (v2i16 (truncate (v2i64)))))
16011 // ->
16012 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
16013 // (v4i32 (bitcast (v2i64))),
16014 // <0, 2, 4, 6>)))
16015 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
16016 // on both input and result type, so we might generate worse code.
16017 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
16018 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
16019 N1Opc == ISD::TRUNCATE) {
16020 SDValue N00 = N0->getOperand(0);
16021 SDValue N10 = N1->getOperand(0);
16022 EVT N00VT = N00.getValueType();
16024 if (N00VT == N10.getValueType() &&
16025 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
16026 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
16027 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
16028 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
16029 for (size_t i = 0; i < Mask.size(); ++i)
16030 Mask[i] = i * 2;
16031 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16032 DAG.getVectorShuffle(
16033 MidVT, dl,
16034 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
16035 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
16039 if (N->getOperand(0).getValueType() == MVT::v4i8) {
16040 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
16041 // loads to prevent having to go through the v4i8 load legalization that
16042 // needs to extend each element into a larger type.
16043 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
16044 if (V.getValueType() != MVT::v4i8)
16045 return false;
16046 if (V.isUndef())
16047 return true;
16048 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
16049 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
16050 LD->getExtensionType() == ISD::NON_EXTLOAD;
16051 })) {
16052 EVT NVT =
16053 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
16054 SmallVector<SDValue> Ops;
16056 for (unsigned i = 0; i < N->getNumOperands(); i++) {
16057 SDValue V = N->getOperand(i);
16058 if (V.isUndef())
16059 Ops.push_back(DAG.getUNDEF(MVT::f32));
16060 else {
16061 LoadSDNode *LD = cast<LoadSDNode>(V);
16062 SDValue NewLoad =
16063 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
16064 LD->getMemOperand());
16065 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
16066 Ops.push_back(NewLoad);
16069 return DAG.getBitcast(N->getValueType(0),
16070 DAG.getBuildVector(NVT, dl, Ops));
16075 // Wait 'til after everything is legalized to try this. That way we have
16076 // legal vector types and such.
16077 if (DCI.isBeforeLegalizeOps())
16078 return SDValue();
16080 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
16081 // extracted subvectors from the same original vectors. Combine these into a
16082 // single avg that operates on the two original vectors.
16083 // avgceil is the target-independent name for rhadd, avgfloor is a hadd.
16084 // Example:
16085 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
16086 // extract_subvector (v16i8 OpB, <0>))),
16087 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
16088 // extract_subvector (v16i8 OpB, <8>)))))
16089 // ->
16090 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
16091 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
16092 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
16093 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
16094 SDValue N00 = N0->getOperand(0);
16095 SDValue N01 = N0->getOperand(1);
16096 SDValue N10 = N1->getOperand(0);
16097 SDValue N11 = N1->getOperand(1);
16099 EVT N00VT = N00.getValueType();
16100 EVT N10VT = N10.getValueType();
16102 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16103 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16104 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16105 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
16106 SDValue N00Source = N00->getOperand(0);
16107 SDValue N01Source = N01->getOperand(0);
16108 SDValue N10Source = N10->getOperand(0);
16109 SDValue N11Source = N11->getOperand(0);
16111 if (N00Source == N10Source && N01Source == N11Source &&
16112 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
16113 assert(N0.getValueType() == N1.getValueType());
16115 uint64_t N00Index = N00.getConstantOperandVal(1);
16116 uint64_t N01Index = N01.getConstantOperandVal(1);
16117 uint64_t N10Index = N10.getConstantOperandVal(1);
16118 uint64_t N11Index = N11.getConstantOperandVal(1);
16120 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
16121 N10Index == N00VT.getVectorNumElements())
16122 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
16127 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
16128 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
16129 // canonicalise to that.
16130 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
16131 assert(VT.getScalarSizeInBits() == 64);
16132 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
16133 DAG.getConstant(0, dl, MVT::i64));
16136 // Canonicalise concat_vectors so that the right-hand vector has as few
16137 // bit-casts as possible before its real operation. The primary matching
16138 // destination for these operations will be the narrowing "2" instructions,
16139 // which depend on the operation being performed on this right-hand vector.
16140 // For example,
16141 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
16142 // becomes
16143 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
16145 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
16146 return SDValue();
16147 SDValue RHS = N1->getOperand(0);
16148 MVT RHSTy = RHS.getValueType().getSimpleVT();
16149 // If the RHS is not a vector, this is not the pattern we're looking for.
16150 if (!RHSTy.isVector())
16151 return SDValue();
16153 LLVM_DEBUG(
16154 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
16156 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
16157 RHSTy.getVectorNumElements() * 2);
16158 return DAG.getNode(ISD::BITCAST, dl, VT,
16159 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
16160 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
16161 RHS));
16164 static SDValue
16165 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16166 SelectionDAG &DAG) {
16167 if (DCI.isBeforeLegalizeOps())
16168 return SDValue();
16170 EVT VT = N->getValueType(0);
16171 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
16172 return SDValue();
16174 SDValue V = N->getOperand(0);
16176 // NOTE: This combine exists in DAGCombiner, but that version's legality check
16177 // blocks this combine because the non-const case requires custom lowering.
16179 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
16180 if (V.getOpcode() == ISD::SPLAT_VECTOR)
16181 if (isa<ConstantSDNode>(V.getOperand(0)))
16182 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
16184 return SDValue();
16187 static SDValue
16188 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16189 SelectionDAG &DAG) {
16190 SDLoc DL(N);
16191 SDValue Vec = N->getOperand(0);
16192 SDValue SubVec = N->getOperand(1);
16193 uint64_t IdxVal = N->getConstantOperandVal(2);
16194 EVT VecVT = Vec.getValueType();
16195 EVT SubVT = SubVec.getValueType();
16197 // Only do this for legal fixed vector types.
16198 if (!VecVT.isFixedLengthVector() ||
16199 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
16200 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
16201 return SDValue();
16203 // Ignore widening patterns.
16204 if (IdxVal == 0 && Vec.isUndef())
16205 return SDValue();
16207 // Subvector must be half the width and an "aligned" insertion.
16208 unsigned NumSubElts = SubVT.getVectorNumElements();
16209 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
16210 (IdxVal != 0 && IdxVal != NumSubElts))
16211 return SDValue();
16213 // Fold insert_subvector -> concat_vectors
16214 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
16215 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
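// As a sketch with assumed example types:
//   (insert_subvector (v8i16 Vec), (v4i16 Sub), 4)
// becomes
//   (concat_vectors (extract_subvector (v8i16 Vec), 0), (v4i16 Sub))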
16216 SDValue Lo, Hi;
16217 if (IdxVal == 0) {
16218 Lo = SubVec;
16219 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16220 DAG.getVectorIdxConstant(NumSubElts, DL));
16221 } else {
16222 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
16223 DAG.getVectorIdxConstant(0, DL));
16224 Hi = SubVec;
16226 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
16229 static SDValue tryCombineFixedPointConvert(SDNode *N,
16230 TargetLowering::DAGCombinerInfo &DCI,
16231 SelectionDAG &DAG) {
16232 // Wait until after everything is legalized to try this. That way we have
16233 // legal vector types and such.
16234 if (DCI.isBeforeLegalizeOps())
16235 return SDValue();
16236 // Transform a scalar conversion of a value from a lane extract into a
16237 // lane extract of a vector conversion. E.g., from foo1 to foo2:
16238 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
16239 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
16241 // The second form interacts better with instruction selection and the
16242 // register allocator to avoid cross-class register copies that aren't
16243 // coalescable due to a lane reference.
16245 // Check the operand and see if it originates from a lane extract.
16246 SDValue Op1 = N->getOperand(1);
16247 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16248 return SDValue();
16250 // Yep, no additional predication needed. Perform the transform.
16251 SDValue IID = N->getOperand(0);
16252 SDValue Shift = N->getOperand(2);
16253 SDValue Vec = Op1.getOperand(0);
16254 SDValue Lane = Op1.getOperand(1);
16255 EVT ResTy = N->getValueType(0);
16256 EVT VecResTy;
16257 SDLoc DL(N);
16259 // The vector width should be 128 bits by the time we get here, even
16260 // if it started as 64 bits (the extract_vector handling will have
16261 // done so). Bail if it is not.
16262 if (Vec.getValueSizeInBits() != 128)
16263 return SDValue();
16265 if (Vec.getValueType() == MVT::v4i32)
16266 VecResTy = MVT::v4f32;
16267 else if (Vec.getValueType() == MVT::v2i64)
16268 VecResTy = MVT::v2f64;
16269 else
16270 return SDValue();
16272 SDValue Convert =
16273 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
16274 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
16277 // AArch64 high-vector "long" operations are formed by performing the non-high
16278 // version on an extract_subvector of each operand which gets the high half:
16280 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
16282 // However, there are cases which don't have an extract_high explicitly, but
16283 // have another operation that can be made compatible with one for free. For
16284 // example:
16286 // (dupv64 scalar) --> (extract_high (dup128 scalar))
16288 // This routine does the actual conversion of such DUPs, once outer routines
16289 // have determined that everything else is in order.
16290 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
16291 // similarly here.
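// Illustrative sketch (operand shapes assumed): a long operation such as
//   (umull (extract_high (v16i8 OpA)), (v8i8 (dup scalar)))
// can have its DUP widened to 128 bits and the high half extracted,
//   (umull (extract_high (v16i8 OpA)), (extract_high (v16i8 (dup scalar))))
// which then matches the UMULL2-style patterns.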
16292 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
16293 MVT VT = N.getSimpleValueType();
16294 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16295 N.getConstantOperandVal(1) == 0)
16296 N = N.getOperand(0);
16298 switch (N.getOpcode()) {
16299 case AArch64ISD::DUP:
16300 case AArch64ISD::DUPLANE8:
16301 case AArch64ISD::DUPLANE16:
16302 case AArch64ISD::DUPLANE32:
16303 case AArch64ISD::DUPLANE64:
16304 case AArch64ISD::MOVI:
16305 case AArch64ISD::MOVIshift:
16306 case AArch64ISD::MOVIedit:
16307 case AArch64ISD::MOVImsl:
16308 case AArch64ISD::MVNIshift:
16309 case AArch64ISD::MVNImsl:
16310 break;
16311 default:
16312 // FMOV could be supported, but isn't very useful, as it would only occur
16313 // if you passed a bitcast'd floating point immediate to an eligible long
16314 // integer op (addl, smull, ...).
16315 return SDValue();
16318 if (!VT.is64BitVector())
16319 return SDValue();
16321 SDLoc DL(N);
16322 unsigned NumElems = VT.getVectorNumElements();
16323 if (N.getValueType().is64BitVector()) {
16324 MVT ElementTy = VT.getVectorElementType();
16325 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
16326 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
16329 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
16330 DAG.getConstant(NumElems, DL, MVT::i64));
16333 static bool isEssentiallyExtractHighSubvector(SDValue N) {
16334 if (N.getOpcode() == ISD::BITCAST)
16335 N = N.getOperand(0);
16336 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
16337 return false;
16338 if (N.getOperand(0).getValueType().isScalableVector())
16339 return false;
16340 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
16341 N.getOperand(0).getValueType().getVectorNumElements() / 2;
16344 /// Helper structure to keep track of ISD::SET_CC operands.
16345 struct GenericSetCCInfo {
16346 const SDValue *Opnd0;
16347 const SDValue *Opnd1;
16348 ISD::CondCode CC;
16351 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
16352 struct AArch64SetCCInfo {
16353 const SDValue *Cmp;
16354 AArch64CC::CondCode CC;
16357 /// Helper structure to keep track of SetCC information.
16358 union SetCCInfo {
16359 GenericSetCCInfo Generic;
16360 AArch64SetCCInfo AArch64;
16363 /// Helper structure for reading SetCC information. If the IsAArch64 field is
16364 /// set to true, Info is an AArch64SetCCInfo; otherwise Info is a
16365 /// GenericSetCCInfo.
16366 struct SetCCInfoAndKind {
16367 SetCCInfo Info;
16368 bool IsAArch64;
16371 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
16372 /// AArch64-lowered one.
16374 /// \p SetCCInfo is filled accordingly.
16375 /// \post SetCCInfo is meaningful only when this function returns true.
16376 /// \return True when Op is a kind of SET_CC operation.
16377 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
16378 // If this is a setcc, this is straightforward.
16379 if (Op.getOpcode() == ISD::SETCC) {
16380 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
16381 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
16382 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16383 SetCCInfo.IsAArch64 = false;
16384 return true;
16386 // Otherwise, check if this is a matching csel instruction.
16387 // In other words:
16388 // - csel 1, 0, cc
16389 // - csel 0, 1, !cc
16390 if (Op.getOpcode() != AArch64ISD::CSEL)
16391 return false;
16392 // Set the information about the operands.
16393 // TODO: we want the operands of the Cmp, not the csel.
16394 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
16395 SetCCInfo.IsAArch64 = true;
16396 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
16397 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
16399 // Check that the operands match the constraints:
16400 // (1) Both operands must be constants.
16401 // (2) One must be 1 and the other must be 0.
16402 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
16403 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16405 // Check (1).
16406 if (!TValue || !FValue)
16407 return false;
16409 // Check (2).
16410 if (!TValue->isOne()) {
16411 // Update the comparison when we are interested in !cc.
16412 std::swap(TValue, FValue);
16413 SetCCInfo.Info.AArch64.CC =
16414 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
16416 return TValue->isOne() && FValue->isZero();
16419 // Returns true if Op is setcc or zext of setcc.
16420 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
16421 if (isSetCC(Op, Info))
16422 return true;
16423 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
16424 isSetCC(Op->getOperand(0), Info));
16427 // The folding we want to perform is:
16428 // (add x, [zext] (setcc cc ...) )
16429 // -->
16430 // (csel x, (add x, 1), !cc ...)
16432 // The latter will get matched to a CSINC instruction.
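// A rough source-level sketch (hypothetical example): for
//   int f(int x, int a, int b) { return x + (a < b); }
// the DAG contains (add x, (zext (setcc lt a, b))), which this fold turns
// into the CSEL form above so it can be selected as something like
//   cmp   w1, w2
//   csinc w0, w0, w0, ge     // i.e. "cinc w0, w0, lt"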
16433 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
16434 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
16435 SDValue LHS = Op->getOperand(0);
16436 SDValue RHS = Op->getOperand(1);
16437 SetCCInfoAndKind InfoAndKind;
16439 // If both operands are a SET_CC, then we don't want to perform this
16440 // folding and create another csel as this results in more instructions
16441 // (and higher register usage).
16442 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
16443 isSetCCOrZExtSetCC(RHS, InfoAndKind))
16444 return SDValue();
16446 // If neither operand is a SET_CC, give up.
16447 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
16448 std::swap(LHS, RHS);
16449 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
16450 return SDValue();
16453 // FIXME: This could be generalized to work for FP comparisons.
16454 EVT CmpVT = InfoAndKind.IsAArch64
16455 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
16456 : InfoAndKind.Info.Generic.Opnd0->getValueType();
16457 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
16458 return SDValue();
16460 SDValue CCVal;
16461 SDValue Cmp;
16462 SDLoc dl(Op);
16463 if (InfoAndKind.IsAArch64) {
16464 CCVal = DAG.getConstant(
16465 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
16466 MVT::i32);
16467 Cmp = *InfoAndKind.Info.AArch64.Cmp;
16468 } else
16469 Cmp = getAArch64Cmp(
16470 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
16471 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
16472 dl);
16474 EVT VT = Op->getValueType(0);
16475 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
16476 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
16479 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
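// Illustrative sketch (types assumed): the sum of two across-vector adds,
//   (add (extract_elt (UADDV (v4i32 A)), 0),
//        (extract_elt (UADDV (v4i32 B)), 0))
// becomes a single reduction of the element-wise sum,
//   (extract_elt (UADDV (add A, B)), 0)
// saving one ADDV.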
16480 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
16481 EVT VT = N->getValueType(0);
16482 // Only scalar integer and vector types.
16483 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
16484 return SDValue();
16486 SDValue LHS = N->getOperand(0);
16487 SDValue RHS = N->getOperand(1);
16488 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16489 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
16490 return SDValue();
16492 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
16493 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
16494 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
16495 return SDValue();
16497 SDValue Op1 = LHS->getOperand(0);
16498 SDValue Op2 = RHS->getOperand(0);
16499 EVT OpVT1 = Op1.getValueType();
16500 EVT OpVT2 = Op2.getValueType();
16501 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
16502 Op2.getOpcode() != AArch64ISD::UADDV ||
16503 OpVT1.getVectorElementType() != VT)
16504 return SDValue();
16506 SDValue Val1 = Op1.getOperand(0);
16507 SDValue Val2 = Op2.getOperand(0);
16508 EVT ValVT = Val1->getValueType(0);
16509 SDLoc DL(N);
16510 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
16511 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
16512 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
16513 DAG.getConstant(0, DL, MVT::i64));
16516 /// Perform the scalar expression combine in the form of:
16517 /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
16518 /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
16519 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
16520 EVT VT = N->getValueType(0);
16521 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
16522 return SDValue();
16524 SDValue LHS = N->getOperand(0);
16525 SDValue RHS = N->getOperand(1);
16527 // Handle commutativity.
16528 if (LHS.getOpcode() != AArch64ISD::CSEL &&
16529 LHS.getOpcode() != AArch64ISD::CSNEG) {
16530 std::swap(LHS, RHS);
16531 if (LHS.getOpcode() != AArch64ISD::CSEL &&
16532 LHS.getOpcode() != AArch64ISD::CSNEG) {
16533 return SDValue();
16537 if (!LHS.hasOneUse())
16538 return SDValue();
16540 AArch64CC::CondCode AArch64CC =
16541 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
16543 // The CSEL should include a constant one operand, and the CSNEG should
16544 // include a one or negative-one operand.
16545 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
16546 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
16547 if (!CTVal || !CFVal)
16548 return SDValue();
16550 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
16551 (CTVal->isOne() || CFVal->isOne())) &&
16552 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
16553 (CTVal->isOne() || CFVal->isAllOnes())))
16554 return SDValue();
16556 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
16557 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
16558 !CFVal->isOne()) {
16559 std::swap(CTVal, CFVal);
16560 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
16563 SDLoc DL(N);
16564 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
16565 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
16566 !CFVal->isAllOnes()) {
16567 APInt C = -1 * CFVal->getAPIntValue();
16568 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
16569 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
16570 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
16573 // It might be neutral for larger constants, as the immediate needs to be
16574 // materialized in a register.
16575 APInt ADDC = CTVal->getAPIntValue();
16576 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16577 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
16578 return SDValue();
16580 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
16581 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
16582 "Unexpected constant value");
16584 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
16585 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
16586 SDValue Cmp = LHS.getOperand(3);
16588 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
16591 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
16592 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
16593 EVT VT = N->getValueType(0);
16594 if (N->getOpcode() != ISD::ADD)
16595 return SDValue();
16597 SDValue Dot = N->getOperand(0);
16598 SDValue A = N->getOperand(1);
16599 // Handle commutativity
16600 auto isZeroDot = [](SDValue Dot) {
16601 return (Dot.getOpcode() == AArch64ISD::UDOT ||
16602 Dot.getOpcode() == AArch64ISD::SDOT) &&
16603 isZerosVector(Dot.getOperand(0).getNode());
16605 if (!isZeroDot(Dot))
16606 std::swap(Dot, A);
16607 if (!isZeroDot(Dot))
16608 return SDValue();
16610 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
16611 Dot.getOperand(2));
16614 static bool isNegatedInteger(SDValue Op) {
16615 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
16618 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
16619 SDLoc DL(Op);
16620 EVT VT = Op.getValueType();
16621 SDValue Zero = DAG.getConstant(0, DL, VT);
16622 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
16625 // Try to fold
16627 // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
16629 // The folding helps csel to be matched with csneg without generating a
16630 // redundant neg instruction, which includes the negation of the csel
16631 // expansion of an abs node lowered by lowerABS.
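// Sketch of the motivating case (condition code left generic): lowerABS
// expands abs(X) as roughly (csel X, (sub 0, X), cc), so a later negation
//   (sub 0, (csel X, (sub 0, X), cc))
// is rewritten to (csel (sub 0, X), X, cc), which isel can then match as a
// single CSNEG instead of a CSEL followed by a separate NEG.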
16632 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
16633 if (!isNegatedInteger(SDValue(N, 0)))
16634 return SDValue();
16636 SDValue CSel = N->getOperand(1);
16637 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
16638 return SDValue();
16640 SDValue N0 = CSel.getOperand(0);
16641 SDValue N1 = CSel.getOperand(1);
16643 // If neither of them is a negation, the fold is not worthwhile, as it would
16644 // introduce two additional negations while removing only one.
16645 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
16646 return SDValue();
16648 SDValue N0N = getNegatedInteger(N0, DAG);
16649 SDValue N1N = getNegatedInteger(N1, DAG);
16651 SDLoc DL(N);
16652 EVT VT = CSel.getValueType();
16653 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
16654 CSel.getOperand(3));
16657 // The basic add/sub long vector instructions have variants with "2" on the end
16658 // which act on the high-half of their inputs. They are normally matched by
16659 // patterns like:
16661 // (add (zeroext (extract_high LHS)),
16662 // (zeroext (extract_high RHS)))
16663 // -> uaddl2 vD, vN, vM
16665 // However, if one of the extracts is something like a duplicate, this
16666 // instruction can still be used profitably. This function puts the DAG into a
16667 // more appropriate form for those patterns to trigger.
16668 static SDValue performAddSubLongCombine(SDNode *N,
16669 TargetLowering::DAGCombinerInfo &DCI,
16670 SelectionDAG &DAG) {
16671 if (DCI.isBeforeLegalizeOps())
16672 return SDValue();
16674 MVT VT = N->getSimpleValueType(0);
16675 if (!VT.is128BitVector()) {
16676 if (N->getOpcode() == ISD::ADD)
16677 return performSetccAddFolding(N, DAG);
16678 return SDValue();
16681 // Make sure both branches are extended in the same way.
16682 SDValue LHS = N->getOperand(0);
16683 SDValue RHS = N->getOperand(1);
16684 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
16685 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
16686 LHS.getOpcode() != RHS.getOpcode())
16687 return SDValue();
16689 unsigned ExtType = LHS.getOpcode();
16691 // It's only worth doing if at least one of the inputs is already an
16692 // extract, but we don't know which it'll be, so we have to try both.
16693 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
16694 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
16695 if (!RHS.getNode())
16696 return SDValue();
16698 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
16699 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
16700 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
16701 if (!LHS.getNode())
16702 return SDValue();
16704 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
16707 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
16710 static bool isCMP(SDValue Op) {
16711 return Op.getOpcode() == AArch64ISD::SUBS &&
16712 !Op.getNode()->hasAnyUseOfValue(0);
16715 // (CSEL 1 0 CC Cond) => CC
16716 // (CSEL 0 1 CC Cond) => !CC
16717 static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
16718 if (Op.getOpcode() != AArch64ISD::CSEL)
16719 return None;
16720 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
16721 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
16722 return None;
16723 SDValue OpLHS = Op.getOperand(0);
16724 SDValue OpRHS = Op.getOperand(1);
16725 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
16726 return CC;
16727 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
16728 return getInvertedCondCode(CC);
16730 return None;
16733 // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
16734 // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
16735 static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
16736 SDValue CmpOp = Op->getOperand(2);
16737 if (!isCMP(CmpOp))
16738 return SDValue();
16740 if (IsAdd) {
16741 if (!isOneConstant(CmpOp.getOperand(1)))
16742 return SDValue();
16743 } else {
16744 if (!isNullConstant(CmpOp.getOperand(0)))
16745 return SDValue();
16748 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
16749 auto CC = getCSETCondCode(CsetOp);
16750 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
16751 return SDValue();
16753 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
16754 Op->getOperand(0), Op->getOperand(1),
16755 CsetOp.getOperand(3));
16758 // (ADC x 0 cond) => (CINC x HS cond)
16759 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
16760 SDValue LHS = N->getOperand(0);
16761 SDValue RHS = N->getOperand(1);
16762 SDValue Cond = N->getOperand(2);
16764 if (!isNullConstant(RHS))
16765 return SDValue();
16767 EVT VT = N->getValueType(0);
16768 SDLoc DL(N);
16770 // (CINC x cc cond) <=> (CSINC x x !cc cond)
16771 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
16772 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
16775 // Transform vector add(zext i8 to i32, zext i8 to i32)
16776 // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
16777 // This allows extra uses of saddl/uaddl at the lower vector widths, and
16778 // fewer extends.
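// A sketch with assumed example types:
//   (v8i32 add (zero_extend (v8i8 A)), (zero_extend (v8i8 B)))
// becomes
//   (v8i32 sign_extend (v8i16 add (zero_extend A), (zero_extend B)))
// so the inner add can be selected as uaddl and only one widening of the
// narrower result remains.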
16779 static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
16780 EVT VT = N->getValueType(0);
16781 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
16782 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
16783 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
16784 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
16785 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
16786 N->getOperand(0).getOperand(0).getValueType() !=
16787 N->getOperand(1).getOperand(0).getValueType())
16788 return SDValue();
16790 SDValue N0 = N->getOperand(0).getOperand(0);
16791 SDValue N1 = N->getOperand(1).getOperand(0);
16792 EVT InVT = N0.getValueType();
16794 EVT S1 = InVT.getScalarType();
16795 EVT S2 = VT.getScalarType();
16796 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
16797 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
16798 SDLoc DL(N);
16799 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
16800 S2.getHalfSizedIntegerVT(*DAG.getContext()),
16801 VT.getVectorElementCount());
16802 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
16803 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
16804 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
16805 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
16807 return SDValue();
16810 static SDValue performBuildVectorCombine(SDNode *N,
16811 TargetLowering::DAGCombinerInfo &DCI,
16812 SelectionDAG &DAG) {
16813 SDLoc DL(N);
16814 EVT VT = N->getValueType(0);
16816 // A build vector of two extracted elements is equivalent to an
16817 // extract subvector where the inner vector is any-extended to the
16818 // extract_vector_elt VT.
16819 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
16820 // (extract_elt_iXX_to_i32 vec Idx+1))
16821 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
16823 // For now, only consider the v2i32 case, which arises as a result of
16824 // legalization.
16825 if (VT != MVT::v2i32)
16826 return SDValue();
16828 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
16829 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
16830 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16831 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16832 // Constant index.
16833 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
16834 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
16835 // Both EXTRACT_VECTOR_ELT from same vector...
16836 Elt0->getOperand(0) == Elt1->getOperand(0) &&
16837 // ... and contiguous. First element's index +1 == second element's index.
16838 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
16839 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
16840 // ResultType's known minimum vector length.
16841 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
16842 SDValue VecToExtend = Elt0->getOperand(0);
16843 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
16844 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
16845 return SDValue();
16847 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
16849 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
16850 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
16851 SubvectorIdx);
16854 return SDValue();
16857 static SDValue performAddCombineForShiftedOperands(SDNode *N,
16858 SelectionDAG &DAG) {
16859 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
16860 // commutative.
16861 if (N->getOpcode() != ISD::ADD)
16862 return SDValue();
16864 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
16865 // shifted register is only available for i32 and i64.
16866 EVT VT = N->getValueType(0);
16867 if (VT != MVT::i32 && VT != MVT::i64)
16868 return SDValue();
16870 SDLoc DL(N);
16871 SDValue LHS = N->getOperand(0);
16872 SDValue RHS = N->getOperand(1);
16874 uint64_t LHSImm = 0, RHSImm = 0;
16875 // If both operands are shifted by an immediate and the shift amount is not
16876 // greater than 4 for one operand, swap LHS and RHS to put the operand with
16877 // the smaller shift amount on the RHS.
16879 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
16880 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
16881 // with LSL (shift > 4). For other processors, this is a no-op for both
16882 // performance and correctness.
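// Illustrative sketch (registers hypothetical): given
//   (add (shl x, 2), (shl y, 8))
// the operands are swapped so the cheap LSL #2 ends up as the shifted (RHS)
// operand, e.g.
//   lsl w8, w_y, #8
//   add w0, w8, w_x, lsl #2
// rather than folding the more expensive LSL #8 into the ADD.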
16883 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
16884 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
16885 RHSImm > 4 && LHS.hasOneUse())
16886 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
16888 return SDValue();
16891 static SDValue performAddSubCombine(SDNode *N,
16892 TargetLowering::DAGCombinerInfo &DCI,
16893 SelectionDAG &DAG) {
16894 // Try to change sum of two reductions.
16895 if (SDValue Val = performAddUADDVCombine(N, DAG))
16896 return Val;
16897 if (SDValue Val = performAddDotCombine(N, DAG))
16898 return Val;
16899 if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
16900 return Val;
16901 if (SDValue Val = performNegCSelCombine(N, DAG))
16902 return Val;
16903 if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
16904 return Val;
16905 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
16906 return Val;
16908 return performAddSubLongCombine(N, DCI, DAG);
16911 // Massage DAGs which we can use the high-half "long" operations on into
16912 // something isel will recognize better. E.g.
16914 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
16915 // (aarch64_neon_umull (extract_high (v2i64 vec)))
16916 // (extract_high (v2i64 (dup128 scalar)))))
16918 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
16919 TargetLowering::DAGCombinerInfo &DCI,
16920 SelectionDAG &DAG) {
16921 if (DCI.isBeforeLegalizeOps())
16922 return SDValue();
16924 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
16925 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
16926 assert(LHS.getValueType().is64BitVector() &&
16927 RHS.getValueType().is64BitVector() &&
16928 "unexpected shape for long operation");
16930 // Either node could be a DUP, but it's not worth doing both of them (you
16931 // might as well use the non-high version), so look for a corresponding extract
16932 // operation on the other "wing".
16933 if (isEssentiallyExtractHighSubvector(LHS)) {
16934 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
16935 if (!RHS.getNode())
16936 return SDValue();
16937 } else if (isEssentiallyExtractHighSubvector(RHS)) {
16938 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
16939 if (!LHS.getNode())
16940 return SDValue();
16943 if (IID == Intrinsic::not_intrinsic)
16944 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
16946 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
16947 N->getOperand(0), LHS, RHS);
16950 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
16951 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
16952 unsigned ElemBits = ElemTy.getSizeInBits();
16954 int64_t ShiftAmount;
16955 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
16956 APInt SplatValue, SplatUndef;
16957 unsigned SplatBitSize;
16958 bool HasAnyUndefs;
16959 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
16960 HasAnyUndefs, ElemBits) ||
16961 SplatBitSize != ElemBits)
16962 return SDValue();
16964 ShiftAmount = SplatValue.getSExtValue();
16965 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
16966 ShiftAmount = CVN->getSExtValue();
16967 } else
16968 return SDValue();
16970 unsigned Opcode;
16971 bool IsRightShift;
16972 switch (IID) {
16973 default:
16974 llvm_unreachable("Unknown shift intrinsic");
16975 case Intrinsic::aarch64_neon_sqshl:
16976 Opcode = AArch64ISD::SQSHL_I;
16977 IsRightShift = false;
16978 break;
16979 case Intrinsic::aarch64_neon_uqshl:
16980 Opcode = AArch64ISD::UQSHL_I;
16981 IsRightShift = false;
16982 break;
16983 case Intrinsic::aarch64_neon_srshl:
16984 Opcode = AArch64ISD::SRSHR_I;
16985 IsRightShift = true;
16986 break;
16987 case Intrinsic::aarch64_neon_urshl:
16988 Opcode = AArch64ISD::URSHR_I;
16989 IsRightShift = true;
16990 break;
16991 case Intrinsic::aarch64_neon_sqshlu:
16992 Opcode = AArch64ISD::SQSHLU_I;
16993 IsRightShift = false;
16994 break;
16995 case Intrinsic::aarch64_neon_sshl:
16996 case Intrinsic::aarch64_neon_ushl:
16997 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
16998 // left shift in that case. Below, we only replace the current node with
16999 // VSHL if this condition is met.
17000 Opcode = AArch64ISD::VSHL;
17001 IsRightShift = false;
17002 break;
17005 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
17006 SDLoc dl(N);
17007 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17008 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
17009 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
17010 SDLoc dl(N);
17011 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
17012 DAG.getConstant(ShiftAmount, dl, MVT::i32));
17015 return SDValue();
17018 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
17019 // the intrinsics must be legal and take an i32, this means there's almost
17020 // certainly going to be a zext in the DAG which we can eliminate.
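// Sketch of the expected pattern (operands hypothetical): for crc32b,
//   (int_aarch64_crc32b acc, (and data, 0xff))
// can simply become (int_aarch64_crc32b acc, data), since the instruction
// only reads the low byte of its data operand anyway.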
17021 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
17022 SDValue AndN = N->getOperand(2);
17023 if (AndN.getOpcode() != ISD::AND)
17024 return SDValue();
17026 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
17027 if (!CMask || CMask->getZExtValue() != Mask)
17028 return SDValue();
17030 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
17031 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
17034 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
17035 SelectionDAG &DAG) {
17036 SDLoc dl(N);
17037 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
17038 DAG.getNode(Opc, dl,
17039 N->getOperand(1).getSimpleValueType(),
17040 N->getOperand(1)),
17041 DAG.getConstant(0, dl, MVT::i64));
17044 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
17045 SDLoc DL(N);
17046 SDValue Op1 = N->getOperand(1);
17047 SDValue Op2 = N->getOperand(2);
17048 EVT ScalarTy = Op2.getValueType();
17049 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17050 ScalarTy = MVT::i32;
17052 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
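// Worked example (values assumed): index(2, 3) on nxv4i32 yields
// <2, 5, 8, 11, ...>, built below as
//   add (mul (step_vector 1), (splat 3)), (splat 2)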
17053 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
17054 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
17055 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
17056 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
17057 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
17060 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
17061 SDLoc dl(N);
17062 SDValue Scalar = N->getOperand(3);
17063 EVT ScalarTy = Scalar.getValueType();
17065 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
17066 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
17068 SDValue Passthru = N->getOperand(1);
17069 SDValue Pred = N->getOperand(2);
17070 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
17071 Pred, Scalar, Passthru);
17074 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
17075 SDLoc dl(N);
17076 LLVMContext &Ctx = *DAG.getContext();
17077 EVT VT = N->getValueType(0);
17079 assert(VT.isScalableVector() && "Expected a scalable vector.");
17081 // Current lowering only supports the SVE-ACLE types.
17082 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
17083 return SDValue();
17085 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
17086 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
17087 EVT ByteVT =
17088 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
17090 // Convert everything to the domain of EXT (i.e. bytes).
17091 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
17092 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
17093 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
17094 DAG.getConstant(ElemSize, dl, MVT::i32));
17096 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
17097 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
17100 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
17101 TargetLowering::DAGCombinerInfo &DCI,
17102 SelectionDAG &DAG) {
17103 if (DCI.isBeforeLegalize())
17104 return SDValue();
17106 SDValue Comparator = N->getOperand(3);
17107 if (Comparator.getOpcode() == AArch64ISD::DUP ||
17108 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
17109 unsigned IID = getIntrinsicID(N);
17110 EVT VT = N->getValueType(0);
17111 EVT CmpVT = N->getOperand(2).getValueType();
17112 SDValue Pred = N->getOperand(1);
17113 SDValue Imm;
17114 SDLoc DL(N);
17116 switch (IID) {
17117 default:
17118 llvm_unreachable("Called with wrong intrinsic!");
17119 break;
17121 // Signed comparisons
17122 case Intrinsic::aarch64_sve_cmpeq_wide:
17123 case Intrinsic::aarch64_sve_cmpne_wide:
17124 case Intrinsic::aarch64_sve_cmpge_wide:
17125 case Intrinsic::aarch64_sve_cmpgt_wide:
17126 case Intrinsic::aarch64_sve_cmplt_wide:
17127 case Intrinsic::aarch64_sve_cmple_wide: {
17128 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17129 int64_t ImmVal = CN->getSExtValue();
17130 if (ImmVal >= -16 && ImmVal <= 15)
17131 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17132 else
17133 return SDValue();
17135 break;
17137 // Unsigned comparisons
17138 case Intrinsic::aarch64_sve_cmphs_wide:
17139 case Intrinsic::aarch64_sve_cmphi_wide:
17140 case Intrinsic::aarch64_sve_cmplo_wide:
17141 case Intrinsic::aarch64_sve_cmpls_wide: {
17142 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
17143 uint64_t ImmVal = CN->getZExtValue();
17144 if (ImmVal <= 127)
17145 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
17146 else
17147 return SDValue();
17149 break;
17153 if (!Imm)
17154 return SDValue();
17156 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
17157 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
17158 N->getOperand(2), Splat, DAG.getCondCode(CC));
17161 return SDValue();
17164 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
17165 AArch64CC::CondCode Cond) {
17166 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17168 SDLoc DL(Op);
17169 assert(Op.getValueType().isScalableVector() &&
17170 TLI.isTypeLegal(Op.getValueType()) &&
17171 "Expected legal scalable vector type!");
17172 assert(Op.getValueType() == Pg.getValueType() &&
17173 "Expected same type for PTEST operands");
17175 // Ensure target-specific opcodes use a legal type.
17176 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
17177 SDValue TVal = DAG.getConstant(1, DL, OutVT);
17178 SDValue FVal = DAG.getConstant(0, DL, OutVT);
17180 // Ensure operands have type nxv16i1.
17181 if (Op.getValueType() != MVT::nxv16i1) {
17182 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
17183 isZeroingInactiveLanes(Op))
17184 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
17185 else
17186 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
17187 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
17190 // Set condition code (CC) flags.
17191 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
17193 // Convert CC to integer based on requested condition.
17194 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
17195 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
17196 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
17197 return DAG.getZExtOrTrunc(Res, DL, VT);
17200 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
17201 SelectionDAG &DAG) {
17202 SDLoc DL(N);
17204 SDValue Pred = N->getOperand(1);
17205 SDValue VecToReduce = N->getOperand(2);
17207 // NOTE: The integer reduction's result type is not always linked to the
17208 // operand's element type so we construct it from the intrinsic's result type.
17209 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
17210 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17212 // SVE reductions set the whole vector register with the first element
17213 // containing the reduction result, which we'll now extract.
17214 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17215 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17216 Zero);
17219 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
17220 SelectionDAG &DAG) {
17221 SDLoc DL(N);
17223 SDValue Pred = N->getOperand(1);
17224 SDValue VecToReduce = N->getOperand(2);
17226 EVT ReduceVT = VecToReduce.getValueType();
17227 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
17229 // SVE reductions set the whole vector register with the first element
17230 // containing the reduction result, which we'll now extract.
17231 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17232 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17233 Zero);
17236 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
17237 SelectionDAG &DAG) {
17238 SDLoc DL(N);
17240 SDValue Pred = N->getOperand(1);
17241 SDValue InitVal = N->getOperand(2);
17242 SDValue VecToReduce = N->getOperand(3);
17243 EVT ReduceVT = VecToReduce.getValueType();
17245 // Ordered reductions use the first lane of the result vector as the
17246 // reduction's initial value.
17247 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
17248 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
17249 DAG.getUNDEF(ReduceVT), InitVal, Zero);
17251 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
17253 // SVE reductions set the whole vector register with the first element
17254 // containing the reduction result, which we'll now extract.
17255 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
17256 Zero);
17259 static bool isAllInactivePredicate(SDValue N) {
17260 // Look through cast.
17261 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
17262 N = N.getOperand(0);
17264 return ISD::isConstantSplatVectorAllZeros(N.getNode());
17267 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
17268 unsigned NumElts = N.getValueType().getVectorMinNumElements();
17270 // Look through cast.
17271 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
17272 N = N.getOperand(0);
17273 // When reinterpreting from a type with fewer elements the "new" elements
17274 // are not active, so bail if they're likely to be used.
17275 if (N.getValueType().getVectorMinNumElements() < NumElts)
17276 return false;
17279 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
17280 return true;
17282 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
17283 // or smaller than the implicit element type represented by N.
17284 // NOTE: A larger element count implies a smaller element type.
17285 if (N.getOpcode() == AArch64ISD::PTRUE &&
17286 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
17287 return N.getValueType().getVectorMinNumElements() >= NumElts;
17289 // If we're compiling for a specific vector-length, we can check if the
17290 // pattern's VL equals that of the scalable vector at runtime.
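// Worked example (sizes assumed): when compiling with a fixed 256-bit SVE
// length, VScale = 256 / 128 = 2, so for an nxv4i1 predicate (NumElts == 4)
// a "ptrue p.s, vl8" pattern covers all 4 * 2 = 8 runtime lanes and is
// treated as all active.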
17291 if (N.getOpcode() == AArch64ISD::PTRUE) {
17292 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17293 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
17294 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
17295 if (MaxSVESize && MinSVESize == MaxSVESize) {
17296 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
17297 unsigned PatNumElts =
17298 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
17299 return PatNumElts == (NumElts * VScale);
17303 return false;
17306 // If a merged operation has no inactive lanes, we can relax it to a predicated
17307 // or unpredicated operation, which potentially allows better isel (perhaps
17308 // using immediate forms) or relaxes register reuse requirements.
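// For example (an illustrative sketch): with an all-active governing
// predicate,
//   (int_aarch64_sve_add (ptrue all), X, Y)
// can be emitted as a plain (add X, Y), while an operation without an
// unpredicated ISD form (e.g. sve.mul) is relaxed to its _PRED node while
// keeping the predicate operand.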
17309 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
17310 SelectionDAG &DAG, bool UnpredOp = false,
17311 bool SwapOperands = false) {
17312 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
17313 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
17314 SDValue Pg = N->getOperand(1);
17315 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
17316 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
17318 // ISD way to specify an all active predicate.
17319 if (isAllActivePredicate(DAG, Pg)) {
17320 if (UnpredOp)
17321 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
17323 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
17326 // FUTURE: SplatVector(true)
17327 return SDValue();
17330 static SDValue performIntrinsicCombine(SDNode *N,
17331 TargetLowering::DAGCombinerInfo &DCI,
17332 const AArch64Subtarget *Subtarget) {
17333 SelectionDAG &DAG = DCI.DAG;
17334 unsigned IID = getIntrinsicID(N);
17335 switch (IID) {
17336 default:
17337 break;
17338 case Intrinsic::get_active_lane_mask: {
17339 SDValue Res = SDValue();
17340 EVT VT = N->getValueType(0);
17341 if (VT.isFixedLengthVector()) {
17342 // We can use the SVE whilelo instruction to lower this intrinsic by
17343 // creating the appropriate sequence of scalable vector operations and
17344 // then extracting a fixed-width subvector from the scalable vector.
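// A worked sketch of the sequence (a v4i1 result is assumed):
//   whilelo           -> nxv4i1
//   sign_extend       -> nxv4i32   (the promoted predicate type)
//   extract_subvector -> v4i32     (fixed-width slice at index 0)
//   truncate          -> v4i1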
17346 SDLoc DL(N);
17347 SDValue ID =
17348 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
17350 EVT WhileVT = EVT::getVectorVT(
17351 *DAG.getContext(), MVT::i1,
17352 ElementCount::getScalable(VT.getVectorNumElements()));
17354 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
17355 EVT PromVT = getPromotedVTForPredicate(WhileVT);
17357 // Get the fixed-width equivalent of PromVT for extraction.
17358 EVT ExtVT =
17359 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
17360 VT.getVectorElementCount());
17362 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
17363 N->getOperand(1), N->getOperand(2));
17364 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
17365 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
17366 DAG.getConstant(0, DL, MVT::i64));
17367 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
17369 return Res;
17371 case Intrinsic::aarch64_neon_vcvtfxs2fp:
17372 case Intrinsic::aarch64_neon_vcvtfxu2fp:
17373 return tryCombineFixedPointConvert(N, DCI, DAG);
17374 case Intrinsic::aarch64_neon_saddv:
17375 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
17376 case Intrinsic::aarch64_neon_uaddv:
17377 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
17378 case Intrinsic::aarch64_neon_sminv:
17379 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
17380 case Intrinsic::aarch64_neon_uminv:
17381 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
17382 case Intrinsic::aarch64_neon_smaxv:
17383 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
17384 case Intrinsic::aarch64_neon_umaxv:
17385 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
17386 case Intrinsic::aarch64_neon_fmax:
17387 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
17388 N->getOperand(1), N->getOperand(2));
17389 case Intrinsic::aarch64_neon_fmin:
17390 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
17391 N->getOperand(1), N->getOperand(2));
17392 case Intrinsic::aarch64_neon_fmaxnm:
17393 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
17394 N->getOperand(1), N->getOperand(2));
17395 case Intrinsic::aarch64_neon_fminnm:
17396 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
17397 N->getOperand(1), N->getOperand(2));
17398 case Intrinsic::aarch64_neon_smull:
17399 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
17400 N->getOperand(1), N->getOperand(2));
17401 case Intrinsic::aarch64_neon_umull:
17402 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
17403 N->getOperand(1), N->getOperand(2));
17404 case Intrinsic::aarch64_neon_pmull:
17405 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
17406 N->getOperand(1), N->getOperand(2));
17407 case Intrinsic::aarch64_neon_sqdmull:
17408 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
17409 case Intrinsic::aarch64_neon_sqshl:
17410 case Intrinsic::aarch64_neon_uqshl:
17411 case Intrinsic::aarch64_neon_sqshlu:
17412 case Intrinsic::aarch64_neon_srshl:
17413 case Intrinsic::aarch64_neon_urshl:
17414 case Intrinsic::aarch64_neon_sshl:
17415 case Intrinsic::aarch64_neon_ushl:
17416 return tryCombineShiftImm(IID, N, DAG);
17417 case Intrinsic::aarch64_crc32b:
17418 case Intrinsic::aarch64_crc32cb:
17419 return tryCombineCRC32(0xff, N, DAG);
17420 case Intrinsic::aarch64_crc32h:
17421 case Intrinsic::aarch64_crc32ch:
17422 return tryCombineCRC32(0xffff, N, DAG);
17423 case Intrinsic::aarch64_sve_saddv:
17424 // There is no i64 version of SADDV because the sign is irrelevant.
17425 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
17426 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
17427 else
17428 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
17429 case Intrinsic::aarch64_sve_uaddv:
17430 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
17431 case Intrinsic::aarch64_sve_smaxv:
17432 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
17433 case Intrinsic::aarch64_sve_umaxv:
17434 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
17435 case Intrinsic::aarch64_sve_sminv:
17436 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
17437 case Intrinsic::aarch64_sve_uminv:
17438 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
17439 case Intrinsic::aarch64_sve_orv:
17440 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
17441 case Intrinsic::aarch64_sve_eorv:
17442 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
17443 case Intrinsic::aarch64_sve_andv:
17444 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
17445 case Intrinsic::aarch64_sve_index:
17446 return LowerSVEIntrinsicIndex(N, DAG);
17447 case Intrinsic::aarch64_sve_dup:
17448 return LowerSVEIntrinsicDUP(N, DAG);
17449 case Intrinsic::aarch64_sve_dup_x:
17450 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
17451 N->getOperand(1));
17452 case Intrinsic::aarch64_sve_ext:
17453 return LowerSVEIntrinsicEXT(N, DAG);
17454 case Intrinsic::aarch64_sve_mul:
17455 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
17456 case Intrinsic::aarch64_sve_smulh:
17457 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
17458 case Intrinsic::aarch64_sve_umulh:
17459 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
17460 case Intrinsic::aarch64_sve_smin:
17461 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
17462 case Intrinsic::aarch64_sve_umin:
17463 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
17464 case Intrinsic::aarch64_sve_smax:
17465 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
17466 case Intrinsic::aarch64_sve_umax:
17467 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
17468 case Intrinsic::aarch64_sve_lsl:
17469 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
17470 case Intrinsic::aarch64_sve_lsr:
17471 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
17472 case Intrinsic::aarch64_sve_asr:
17473 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
17474 case Intrinsic::aarch64_sve_fadd:
17475 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
17476 case Intrinsic::aarch64_sve_fsub:
17477 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
17478 case Intrinsic::aarch64_sve_fmul:
17479 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
17480 case Intrinsic::aarch64_sve_add:
17481 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
17482 case Intrinsic::aarch64_sve_sub:
17483 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
17484 case Intrinsic::aarch64_sve_subr:
17485 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
17486 case Intrinsic::aarch64_sve_and:
17487 return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
17488 case Intrinsic::aarch64_sve_bic:
17489 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
17490 case Intrinsic::aarch64_sve_eor:
17491 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
17492 case Intrinsic::aarch64_sve_orr:
17493 return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
17494 case Intrinsic::aarch64_sve_sabd:
17495 return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
17496 case Intrinsic::aarch64_sve_uabd:
17497 return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
17498 case Intrinsic::aarch64_sve_sqadd:
17499 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
17500 case Intrinsic::aarch64_sve_sqsub:
17501 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
17502 case Intrinsic::aarch64_sve_uqadd:
17503 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
17504 case Intrinsic::aarch64_sve_uqsub:
17505 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
17506 case Intrinsic::aarch64_sve_sqadd_x:
17507 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
17508 N->getOperand(1), N->getOperand(2));
17509 case Intrinsic::aarch64_sve_sqsub_x:
17510 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
17511 N->getOperand(1), N->getOperand(2));
17512 case Intrinsic::aarch64_sve_uqadd_x:
17513 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
17514 N->getOperand(1), N->getOperand(2));
17515 case Intrinsic::aarch64_sve_uqsub_x:
17516 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
17517 N->getOperand(1), N->getOperand(2));
17518 case Intrinsic::aarch64_sve_asrd:
17519 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
17520 N->getOperand(1), N->getOperand(2), N->getOperand(3));
17521 case Intrinsic::aarch64_sve_cmphs:
17522 if (!N->getOperand(2).getValueType().isFloatingPoint())
17523 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17524 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17525 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
17526 break;
17527 case Intrinsic::aarch64_sve_cmphi:
17528 if (!N->getOperand(2).getValueType().isFloatingPoint())
17529 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17530 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17531 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
17532 break;
17533 case Intrinsic::aarch64_sve_fcmpge:
17534 case Intrinsic::aarch64_sve_cmpge:
17535 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17536 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17537 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
17538 break;
17539 case Intrinsic::aarch64_sve_fcmpgt:
17540 case Intrinsic::aarch64_sve_cmpgt:
17541 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17542 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17543 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
17544 break;
17545 case Intrinsic::aarch64_sve_fcmpeq:
17546 case Intrinsic::aarch64_sve_cmpeq:
17547 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17548 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17549 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
17550 break;
17551 case Intrinsic::aarch64_sve_fcmpne:
17552 case Intrinsic::aarch64_sve_cmpne:
17553 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17554 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17555 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
17556 break;
17557 case Intrinsic::aarch64_sve_fcmpuo:
17558 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
17559 N->getValueType(0), N->getOperand(1), N->getOperand(2),
17560 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
17561 break;
17562 case Intrinsic::aarch64_sve_fadda:
17563 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
17564 case Intrinsic::aarch64_sve_faddv:
17565 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
17566 case Intrinsic::aarch64_sve_fmaxnmv:
17567 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
17568 case Intrinsic::aarch64_sve_fmaxv:
17569 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
17570 case Intrinsic::aarch64_sve_fminnmv:
17571 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
17572 case Intrinsic::aarch64_sve_fminv:
17573 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
17574 case Intrinsic::aarch64_sve_sel:
17575 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
17576 N->getOperand(1), N->getOperand(2), N->getOperand(3));
17577 case Intrinsic::aarch64_sve_cmpeq_wide:
17578 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
17579 case Intrinsic::aarch64_sve_cmpne_wide:
17580 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
17581 case Intrinsic::aarch64_sve_cmpge_wide:
17582 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
17583 case Intrinsic::aarch64_sve_cmpgt_wide:
17584 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
17585 case Intrinsic::aarch64_sve_cmplt_wide:
17586 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
17587 case Intrinsic::aarch64_sve_cmple_wide:
17588 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
17589 case Intrinsic::aarch64_sve_cmphs_wide:
17590 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
17591 case Intrinsic::aarch64_sve_cmphi_wide:
17592 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
17593 case Intrinsic::aarch64_sve_cmplo_wide:
17594 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
17595 case Intrinsic::aarch64_sve_cmpls_wide:
17596 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
17597 case Intrinsic::aarch64_sve_ptest_any:
17598 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17599 AArch64CC::ANY_ACTIVE);
17600 case Intrinsic::aarch64_sve_ptest_first:
17601 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17602 AArch64CC::FIRST_ACTIVE);
17603 case Intrinsic::aarch64_sve_ptest_last:
17604 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
17605 AArch64CC::LAST_ACTIVE);
17607 return SDValue();
17610 static bool isCheapToExtend(const SDValue &N) {
17611 unsigned OC = N->getOpcode();
17612 return OC == ISD::LOAD || OC == ISD::MLOAD ||
17613 ISD::isConstantSplatVectorAllZeros(N.getNode());
17616 static SDValue
17617 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17618 SelectionDAG &DAG) {
17619 // If we have (sext (setcc A B)) and A and B are cheap to extend,
17620 // we can move the sext into the arguments and have the same result. For
17621 // example, if A and B are both loads, we can make those extending loads and
17622 // avoid an extra instruction. This pattern appears often in VLS code
17623 // generation where the inputs to the setcc have a different size to the
17624 // instruction that wants to use the result of the setcc.
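  // A sketch of the rewrite (types illustrative; a signed condition is
  // assumed, so sign-extension is used):
  //   (v8i16 (sign_extend (setcc (v8i8 (load A)), (v8i8 (load B)), setlt)))
  //     =>
  //   (setcc (v8i16 (sextload A)), (v8i16 (sextload B)), setlt)
  // For an unsigned condition the operands are zero-extended instead.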
17625 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
17626 N->getOperand(0)->getOpcode() == ISD::SETCC);
17627 const SDValue SetCC = N->getOperand(0);
17629 const SDValue CCOp0 = SetCC.getOperand(0);
17630 const SDValue CCOp1 = SetCC.getOperand(1);
17631 if (!CCOp0->getValueType(0).isInteger() ||
17632 !CCOp1->getValueType(0).isInteger())
17633 return SDValue();
17635 ISD::CondCode Code =
17636 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
17638 ISD::NodeType ExtType =
17639 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
17641 if (isCheapToExtend(SetCC.getOperand(0)) &&
17642 isCheapToExtend(SetCC.getOperand(1))) {
17643 const SDValue Ext1 =
17644 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
17645 const SDValue Ext2 =
17646 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
17648 return DAG.getSetCC(
17649 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
17650 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
17653 return SDValue();
17656 static SDValue performExtendCombine(SDNode *N,
17657 TargetLowering::DAGCombinerInfo &DCI,
17658 SelectionDAG &DAG) {
17659 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
17660 // we can convert that DUP into another extract_high (of a bigger DUP), which
17661 // helps the backend to decide that an sabdl2 would be useful, saving a real
17662 // extract_high operation.
17663 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
17664 (N->getOperand(0).getOpcode() == ISD::ABDU ||
17665 N->getOperand(0).getOpcode() == ISD::ABDS)) {
17666 SDNode *ABDNode = N->getOperand(0).getNode();
17667 SDValue NewABD =
17668 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
17669 if (!NewABD.getNode())
17670 return SDValue();
17672 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
17675 if (N->getValueType(0).isFixedLengthVector() &&
17676 N->getOpcode() == ISD::SIGN_EXTEND &&
17677 N->getOperand(0)->getOpcode() == ISD::SETCC)
17678 return performSignExtendSetCCCombine(N, DCI, DAG);
17680 return SDValue();
17683 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
17684 SDValue SplatVal, unsigned NumVecElts) {
17685 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
17686 Align OrigAlignment = St.getAlign();
17687 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
17689 // Create scalar stores. This is at least as good as the code sequence for a
17690 // split unaligned store which is a dup.s, ext.b, and two stores.
17691 // Most of the time the three stores should be replaced by store pair
17692 // instructions (stp).
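  // For example (illustrative), a splat store of w1 to a v4i32 at [x0] becomes
  //   str w1, [x0]; str w1, [x0, #4]; str w1, [x0, #8]; str w1, [x0, #12]
  // which the load/store optimizer is then expected to merge into
  //   stp w1, w1, [x0]; stp w1, w1, [x0, #8]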
17693 SDLoc DL(&St);
17694 SDValue BasePtr = St.getBasePtr();
17695 uint64_t BaseOffset = 0;
17697 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
17698 SDValue NewST1 =
17699 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
17700 OrigAlignment, St.getMemOperand()->getFlags());
17702 // As this is in ISel, we will not merge this add, which may degrade results.
17703 if (BasePtr->getOpcode() == ISD::ADD &&
17704 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
17705 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
17706 BasePtr = BasePtr->getOperand(0);
17709 unsigned Offset = EltOffset;
17710 while (--NumVecElts) {
17711 Align Alignment = commonAlignment(OrigAlignment, Offset);
17712 SDValue OffsetPtr =
17713 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
17714 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
17715 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
17716 PtrInfo.getWithOffset(Offset), Alignment,
17717 St.getMemOperand()->getFlags());
17718 Offset += EltOffset;
17720 return NewST1;
17723 // Returns an SVE type that ContentTy can be trivially sign or zero extended
17724 // into.
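// For example, nxv2i16 and nxv2f32 both map to nxv2i64, while nxv8i8 maps to
// nxv8i16 (see the switch below).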
17725 static MVT getSVEContainerType(EVT ContentTy) {
17726 assert(ContentTy.isSimple() && "No SVE containers for extended types");
17728 switch (ContentTy.getSimpleVT().SimpleTy) {
17729 default:
17730 llvm_unreachable("No known SVE container for this MVT type");
17731 case MVT::nxv2i8:
17732 case MVT::nxv2i16:
17733 case MVT::nxv2i32:
17734 case MVT::nxv2i64:
17735 case MVT::nxv2f32:
17736 case MVT::nxv2f64:
17737 return MVT::nxv2i64;
17738 case MVT::nxv4i8:
17739 case MVT::nxv4i16:
17740 case MVT::nxv4i32:
17741 case MVT::nxv4f32:
17742 return MVT::nxv4i32;
17743 case MVT::nxv8i8:
17744 case MVT::nxv8i16:
17745 case MVT::nxv8f16:
17746 case MVT::nxv8bf16:
17747 return MVT::nxv8i16;
17748 case MVT::nxv16i8:
17749 return MVT::nxv16i8;
17753 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
17754 SDLoc DL(N);
17755 EVT VT = N->getValueType(0);
17757 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
17758 return SDValue();
17760 EVT ContainerVT = VT;
17761 if (ContainerVT.isInteger())
17762 ContainerVT = getSVEContainerType(ContainerVT);
17764 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
17765 SDValue Ops[] = { N->getOperand(0), // Chain
17766 N->getOperand(2), // Pg
17767 N->getOperand(3), // Base
17768 DAG.getValueType(VT) };
17770 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
17771 SDValue LoadChain = SDValue(Load.getNode(), 1);
17773 if (ContainerVT.isInteger() && (VT != ContainerVT))
17774 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
17776 return DAG.getMergeValues({ Load, LoadChain }, DL);
17779 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
17780 SDLoc DL(N);
17781 EVT VT = N->getValueType(0);
17782 EVT PtrTy = N->getOperand(3).getValueType();
17784 EVT LoadVT = VT;
17785 if (VT.isFloatingPoint())
17786 LoadVT = VT.changeTypeToInteger();
17788 auto *MINode = cast<MemIntrinsicSDNode>(N);
17789 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
17790 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
17791 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
17792 MINode->getOperand(2), PassThru,
17793 MINode->getMemoryVT(), MINode->getMemOperand(),
17794 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
17796 if (VT.isFloatingPoint()) {
17797 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
17798 return DAG.getMergeValues(Ops, DL);
17801 return L;
17804 template <unsigned Opcode>
17805 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
17806 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
17807 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
17808 "Unsupported opcode.");
17809 SDLoc DL(N);
17810 EVT VT = N->getValueType(0);
17812 EVT LoadVT = VT;
17813 if (VT.isFloatingPoint())
17814 LoadVT = VT.changeTypeToInteger();
17816 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
17817 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
17818 SDValue LoadChain = SDValue(Load.getNode(), 1);
17820 if (VT.isFloatingPoint())
17821 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
17823 return DAG.getMergeValues({Load, LoadChain}, DL);
17826 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
17827 SDLoc DL(N);
17828 SDValue Data = N->getOperand(2);
17829 EVT DataVT = Data.getValueType();
17830 EVT HwSrcVt = getSVEContainerType(DataVT);
17831 SDValue InputVT = DAG.getValueType(DataVT);
17833 if (DataVT.isFloatingPoint())
17834 InputVT = DAG.getValueType(HwSrcVt);
17836 SDValue SrcNew;
17837 if (Data.getValueType().isFloatingPoint())
17838 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
17839 else
17840 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
17842 SDValue Ops[] = { N->getOperand(0), // Chain
17843 SrcNew,
17844 N->getOperand(4), // Base
17845 N->getOperand(3), // Pg
17846 InputVT
17849 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
17852 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
17853 SDLoc DL(N);
17855 SDValue Data = N->getOperand(2);
17856 EVT DataVT = Data.getValueType();
17857 EVT PtrTy = N->getOperand(4).getValueType();
17859 if (DataVT.isFloatingPoint())
17860 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
17862 auto *MINode = cast<MemIntrinsicSDNode>(N);
17863 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
17864 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
17865 MINode->getMemoryVT(), MINode->getMemOperand(),
17866 ISD::UNINDEXED, false, false);
17869 /// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
17870 /// load store optimizer pass will merge them to store pair stores. This should
17871 /// be better than a movi to create the vector zero followed by a vector store
17872 /// if the zero constant is not re-used, since one instruction and one register
17873 /// live range will be removed.
17875 /// For example, the final generated code should be:
17877 /// stp xzr, xzr, [x0]
17879 /// instead of:
17881 /// movi v0.2d, #0
17882 /// str q0, [x0]
17884 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17885 SDValue StVal = St.getValue();
17886 EVT VT = StVal.getValueType();
17888 // Avoid scalarizing zero splat stores for scalable vectors.
17889 if (VT.isScalableVector())
17890 return SDValue();
17892 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
17893 // 2, 3 or 4 i32 elements.
17894 int NumVecElts = VT.getVectorNumElements();
17895 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
17896 VT.getVectorElementType().getSizeInBits() == 64) ||
17897 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
17898 VT.getVectorElementType().getSizeInBits() == 32)))
17899 return SDValue();
17901 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
17902 return SDValue();
17904 // If the zero constant has more than one use then the vector store could be
17905 // better, since the constant mov will be amortized and stp q instructions
17906 // can be formed.
17907 if (!StVal.hasOneUse())
17908 return SDValue();
17910 // If the store is truncating then it's going down to i16 or smaller, which
17911 // means it can be implemented in a single store anyway.
17912 if (St.isTruncatingStore())
17913 return SDValue();
17915 // If the immediate offset of the address operand is too large for the stp
17916 // instruction, then bail out.
17917 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
17918 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
17919 if (Offset < -512 || Offset > 504)
17920 return SDValue();
17923 for (int I = 0; I < NumVecElts; ++I) {
17924 SDValue EltVal = StVal.getOperand(I);
17925 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
17926 return SDValue();
17929 // Use a CopyFromReg WZR/XZR here to prevent
17930 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
17931 SDLoc DL(&St);
17932 unsigned ZeroReg;
17933 EVT ZeroVT;
17934 if (VT.getVectorElementType().getSizeInBits() == 32) {
17935 ZeroReg = AArch64::WZR;
17936 ZeroVT = MVT::i32;
17937 } else {
17938 ZeroReg = AArch64::XZR;
17939 ZeroVT = MVT::i64;
17941 SDValue SplatVal =
17942 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
17943 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
17946 /// Replace a vector store of a scalar splat by scalar stores of the scalar
17947 /// value. The load store optimizer pass will merge them to store pair stores.
17948 /// This has better performance than a splat of the scalar followed by a split
17949 /// vector store. Even if the stores are not merged, it is four stores vs. a dup
17950 /// followed by an ext.b and two stores.
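/// A rough sketch of the intent (registers illustrative), for a v4i32 splat of
/// w1 stored at [x0]:
///   dup v0.4s, w1
///   str q0, [x0]     // or a split store when 16-byte alignment is not known
/// becomes four scalar "str w1" stores that the load/store optimizer can then
/// merge into two stp instructions.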
17951 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17952 SDValue StVal = St.getValue();
17953 EVT VT = StVal.getValueType();
17955 // Don't replace floating point stores; they possibly won't be transformed to
17956 // stp because of the store pair suppress pass.
17957 if (VT.isFloatingPoint())
17958 return SDValue();
17960 // We can express a splat as store pair(s) for 2 or 4 elements.
17961 unsigned NumVecElts = VT.getVectorNumElements();
17962 if (NumVecElts != 4 && NumVecElts != 2)
17963 return SDValue();
17965 // If the store is truncating then it's going down to i16 or smaller, which
17966 // means it can be implemented in a single store anyway.
17967 if (St.isTruncatingStore())
17968 return SDValue();
17970 // Check that this is a splat.
17971 // Make sure that each of the relevant vector element locations are inserted
17972 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
17973 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
17974 SDValue SplatVal;
17975 for (unsigned I = 0; I < NumVecElts; ++I) {
17976 // Check for insert vector elements.
17977 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
17978 return SDValue();
17980 // Check that same value is inserted at each vector element.
17981 if (I == 0)
17982 SplatVal = StVal.getOperand(1);
17983 else if (StVal.getOperand(1) != SplatVal)
17984 return SDValue();
17986 // Check insert element index.
17987 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
17988 if (!CIndex)
17989 return SDValue();
17990 uint64_t IndexVal = CIndex->getZExtValue();
17991 if (IndexVal >= NumVecElts)
17992 return SDValue();
17993 IndexNotInserted.reset(IndexVal);
17995 StVal = StVal.getOperand(0);
17997 // Check that all vector element locations were inserted to.
17998 if (IndexNotInserted.any())
17999 return SDValue();
18001 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
18004 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18005 SelectionDAG &DAG,
18006 const AArch64Subtarget *Subtarget) {
18008 StoreSDNode *S = cast<StoreSDNode>(N);
18009 if (S->isVolatile() || S->isIndexed())
18010 return SDValue();
18012 SDValue StVal = S->getValue();
18013 EVT VT = StVal.getValueType();
18015 if (!VT.isFixedLengthVector())
18016 return SDValue();
18018 // If we get a splat of zeros, convert this vector store to a store of
18019 // scalars. They will be merged into store pairs of xzr thereby removing one
18020 // instruction and one register.
18021 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
18022 return ReplacedZeroSplat;
18024 // FIXME: The logic for deciding if an unaligned store should be split should
18025 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
18026 // a call to that function here.
18028 if (!Subtarget->isMisaligned128StoreSlow())
18029 return SDValue();
18031 // Don't split at -Oz.
18032 if (DAG.getMachineFunction().getFunction().hasMinSize())
18033 return SDValue();
18035 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
18036 // those up regresses performance on micro-benchmarks and olden/bh.
18037 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
18038 return SDValue();
18040 // Split unaligned 16B stores. They are terrible for performance.
18041 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
18042 // extensions can use this to mark that it does not want splitting to happen
18043 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
18044 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
18045 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
18046 S->getAlign() <= Align(2))
18047 return SDValue();
18049 // If we get a splat of a scalar convert this vector store to a store of
18050 // scalars. They will be merged into store pairs thereby removing two
18051 // instructions.
18052 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
18053 return ReplacedSplat;
18055 SDLoc DL(S);
18057 // Split VT into two.
18058 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18059 unsigned NumElts = HalfVT.getVectorNumElements();
18060 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18061 DAG.getConstant(0, DL, MVT::i64));
18062 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
18063 DAG.getConstant(NumElts, DL, MVT::i64));
18064 SDValue BasePtr = S->getBasePtr();
18065 SDValue NewST1 =
18066 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
18067 S->getAlign(), S->getMemOperand()->getFlags());
18068 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
18069 DAG.getConstant(8, DL, MVT::i64));
18070 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
18071 S->getPointerInfo(), S->getAlign(),
18072 S->getMemOperand()->getFlags());
18075 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
18076 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
18078 // splice(pg, op1, undef) -> op1
18079 if (N->getOperand(2).isUndef())
18080 return N->getOperand(1);
18082 return SDValue();
18085 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
18086 const AArch64Subtarget *Subtarget) {
18087 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
18088 N->getOpcode() == AArch64ISD::UUNPKLO) &&
18089 "Unexpected Opcode!");
18091 // uunpklo/hi undef -> undef
18092 if (N->getOperand(0).isUndef())
18093 return DAG.getUNDEF(N->getValueType(0));
18095 // If this is a masked load followed by an UUNPKLO, fold this into a masked
18096 // extending load. We can do this even if this is already a masked
18097 // {z,}extload.
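  // A sketch of the fold (element counts illustrative): for
  //   uunpklo (nxv16i8 masked_load(base, ptrue(vl8), zero/undef))
  // the unpacked low elements can instead be produced directly by a
  // zero-extending masked load of nxv8i16 with an nxv8i1 ptrue(vl8) predicate,
  // provided doubling the predicate's element size still fits within the
  // minimum SVE vector length.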
18098 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
18099 N->getOpcode() == AArch64ISD::UUNPKLO) {
18100 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
18101 SDValue Mask = MLD->getMask();
18102 SDLoc DL(N);
18104 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
18105 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
18106 (MLD->getPassThru()->isUndef() ||
18107 isZerosVector(MLD->getPassThru().getNode()))) {
18108 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
18109 unsigned PgPattern = Mask->getConstantOperandVal(0);
18110 EVT VT = N->getValueType(0);
18112 // Ensure we can double the size of the predicate pattern
18113 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
18114 if (NumElts &&
18115 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
18116 Mask =
18117 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
18118 SDValue PassThru = DAG.getConstant(0, DL, VT);
18119 SDValue NewLoad = DAG.getMaskedLoad(
18120 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
18121 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
18122 MLD->getAddressingMode(), ISD::ZEXTLOAD);
18124 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
18126 return NewLoad;
18131 return SDValue();
18134 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
18135 SDLoc DL(N);
18136 SDValue Op0 = N->getOperand(0);
18137 SDValue Op1 = N->getOperand(1);
18138 EVT ResVT = N->getValueType(0);
18140 // uzp1(x, undef) -> concat(truncate(x), undef)
18141 if (Op1.getOpcode() == ISD::UNDEF) {
18142 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
18143 switch (ResVT.getSimpleVT().SimpleTy) {
18144 default:
18145 break;
18146 case MVT::v16i8:
18147 BCVT = MVT::v8i16;
18148 HalfVT = MVT::v8i8;
18149 break;
18150 case MVT::v8i16:
18151 BCVT = MVT::v4i32;
18152 HalfVT = MVT::v4i16;
18153 break;
18154 case MVT::v4i32:
18155 BCVT = MVT::v2i64;
18156 HalfVT = MVT::v2i32;
18157 break;
18159 if (BCVT != MVT::Other) {
18160 SDValue BC = DAG.getBitcast(BCVT, Op0);
18161 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
18162 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
18163 DAG.getUNDEF(HalfVT));
18167 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
18168 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
18169 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18170 SDValue X = Op0.getOperand(0).getOperand(0);
18171 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
18175 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
18176 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
18177 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
18178 SDValue Z = Op1.getOperand(0).getOperand(1);
18179 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
18183 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
18184 // Only implemented on little-endian subtargets.
18185 bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
18187 // This optimization only works on little endian.
18188 if (!IsLittleEndian)
18189 return SDValue();
18191 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
18192 return SDValue();
18194 auto getSourceOp = [](SDValue Operand) -> SDValue {
18195 const unsigned Opcode = Operand.getOpcode();
18196 if (Opcode == ISD::TRUNCATE)
18197 return Operand->getOperand(0);
18198 if (Opcode == ISD::BITCAST &&
18199 Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
18200 return Operand->getOperand(0)->getOperand(0);
18201 return SDValue();
18204 SDValue SourceOp0 = getSourceOp(Op0);
18205 SDValue SourceOp1 = getSourceOp(Op1);
18207 if (!SourceOp0 || !SourceOp1)
18208 return SDValue();
18210 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
18211 !SourceOp0.getValueType().isSimple())
18212 return SDValue();
18214 EVT ResultTy;
18216 switch (SourceOp0.getSimpleValueType().SimpleTy) {
18217 case MVT::v2i64:
18218 ResultTy = MVT::v4i32;
18219 break;
18220 case MVT::v4i32:
18221 ResultTy = MVT::v8i16;
18222 break;
18223 case MVT::v8i16:
18224 ResultTy = MVT::v16i8;
18225 break;
18226 default:
18227 return SDValue();
18230 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
18231 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
18232 SDValue UzpResult =
18233 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
18235 EVT BitcastResultTy;
18237 switch (ResVT.getSimpleVT().SimpleTy) {
18238 case MVT::v2i32:
18239 BitcastResultTy = MVT::v2i64;
18240 break;
18241 case MVT::v4i16:
18242 BitcastResultTy = MVT::v4i32;
18243 break;
18244 case MVT::v8i8:
18245 BitcastResultTy = MVT::v8i16;
18246 break;
18247 default:
18248 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
18251 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
18252 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
18255 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
18256 unsigned Opc = N->getOpcode();
18258 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
18259 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
18260 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
18261 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
18262 "Invalid opcode.");
18264 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
18265 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
18266 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
18267 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
18268 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
18269 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
18270 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
18271 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
18273 SDLoc DL(N);
18274 SDValue Chain = N->getOperand(0);
18275 SDValue Pg = N->getOperand(1);
18276 SDValue Base = N->getOperand(2);
18277 SDValue Offset = N->getOperand(3);
18278 SDValue Ty = N->getOperand(4);
18280 EVT ResVT = N->getValueType(0);
18282 const auto OffsetOpc = Offset.getOpcode();
18283 const bool OffsetIsZExt =
18284 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
18285 const bool OffsetIsSExt =
18286 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
18288 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
18289 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
18290 SDValue ExtPg = Offset.getOperand(0);
18291 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
18292 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
18294 // If the predicate for the sign- or zero-extended offset is the
18295 // same as the predicate used for this load and the sign-/zero-extension
18296 // was from 32 bits, the extension can be folded into the gather's addressing mode.
18297 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
18298 SDValue UnextendedOffset = Offset.getOperand(1);
18300 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
18301 if (Signed)
18302 NewOpc = getSignExtendedGatherOpcode(NewOpc);
18304 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
18305 {Chain, Pg, Base, UnextendedOffset, Ty});
18309 return SDValue();
18312 /// Optimize a vector shift instruction and its operand if the shifted-out
18313 /// bits are not used.
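/// For example (illustrative), in (vlshr (or X, #0x0f), #8) the OR only
/// affects bits that the shift discards, so SimplifyDemandedBits can reduce
/// the shift's operand to plain X.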
18314 static SDValue performVectorShiftCombine(SDNode *N,
18315 const AArch64TargetLowering &TLI,
18316 TargetLowering::DAGCombinerInfo &DCI) {
18317 assert(N->getOpcode() == AArch64ISD::VASHR ||
18318 N->getOpcode() == AArch64ISD::VLSHR);
18320 SDValue Op = N->getOperand(0);
18321 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
18323 unsigned ShiftImm = N->getConstantOperandVal(1);
18324 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
18326 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
18327 APInt DemandedMask = ~ShiftedOutBits;
18329 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
18330 return SDValue(N, 0);
18332 return SDValue();
18335 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
18336 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
18337 // This transform works in partnership with performSetCCPunpkCombine to
18338 // remove unnecessary transfer of predicates into standard registers and back
18339 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
18340 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
18341 MVT::i1) {
18342 SDValue CC = N->getOperand(0)->getOperand(0);
18343 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
18344 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
18345 DAG.getVectorIdxConstant(0, SDLoc(N)));
18346 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
18349 return SDValue();
18352 /// Target-specific DAG combine function for post-increment LD1 (lane) and
18353 /// post-increment LD1R.
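/// For example (illustrative), a scalar load inserted into lane 1 of a v4i32,
/// with the pointer separately advanced by 4 bytes, can be selected as a
/// single post-indexed "ld1 { v0.s }[1], [x0], #4".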
18354 static SDValue performPostLD1Combine(SDNode *N,
18355 TargetLowering::DAGCombinerInfo &DCI,
18356 bool IsLaneOp) {
18357 if (DCI.isBeforeLegalizeOps())
18358 return SDValue();
18360 SelectionDAG &DAG = DCI.DAG;
18361 EVT VT = N->getValueType(0);
18363 if (!VT.is128BitVector() && !VT.is64BitVector())
18364 return SDValue();
18366 unsigned LoadIdx = IsLaneOp ? 1 : 0;
18367 SDNode *LD = N->getOperand(LoadIdx).getNode();
18368 // If it is not a LOAD, we cannot do this combine.
18369 if (LD->getOpcode() != ISD::LOAD)
18370 return SDValue();
18372 // The vector lane must be a constant in the LD1LANE opcode.
18373 SDValue Lane;
18374 if (IsLaneOp) {
18375 Lane = N->getOperand(2);
18376 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
18377 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
18378 return SDValue();
18381 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
18382 EVT MemVT = LoadSDN->getMemoryVT();
18383 // Check if memory operand is the same type as the vector element.
18384 if (MemVT != VT.getVectorElementType())
18385 return SDValue();
18387 // Check if there are other uses. If so, do not combine as it will introduce
18388 // an extra load.
18389 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
18390 ++UI) {
18391 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
18392 continue;
18393 if (*UI != N)
18394 return SDValue();
18397 SDValue Addr = LD->getOperand(1);
18398 SDValue Vector = N->getOperand(0);
18399 // Search for a use of the address operand that is an increment.
18400 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
18401 Addr.getNode()->use_end(); UI != UE; ++UI) {
18402 SDNode *User = *UI;
18403 if (User->getOpcode() != ISD::ADD
18404 || UI.getUse().getResNo() != Addr.getResNo())
18405 continue;
18407 // If the increment is a constant, it must match the memory ref size.
18408 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
18409 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
18410 uint32_t IncVal = CInc->getZExtValue();
18411 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
18412 if (IncVal != NumBytes)
18413 continue;
18414 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
18417 // To avoid cycle construction, make sure that neither the load nor the add
18418 // is a predecessor of the other or of the Vector.
18419 SmallPtrSet<const SDNode *, 32> Visited;
18420 SmallVector<const SDNode *, 16> Worklist;
18421 Visited.insert(Addr.getNode());
18422 Worklist.push_back(User);
18423 Worklist.push_back(LD);
18424 Worklist.push_back(Vector.getNode());
18425 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
18426 SDNode::hasPredecessorHelper(User, Visited, Worklist))
18427 continue;
18429 SmallVector<SDValue, 8> Ops;
18430 Ops.push_back(LD->getOperand(0)); // Chain
18431 if (IsLaneOp) {
18432 Ops.push_back(Vector); // The vector to be inserted
18433 Ops.push_back(Lane); // The lane to be inserted in the vector
18435 Ops.push_back(Addr);
18436 Ops.push_back(Inc);
18438 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
18439 SDVTList SDTys = DAG.getVTList(Tys);
18440 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
18441 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
18442 MemVT,
18443 LoadSDN->getMemOperand());
18445 // Update the uses.
18446 SDValue NewResults[] = {
18447 SDValue(LD, 0), // The result of load
18448 SDValue(UpdN.getNode(), 2) // Chain
18450 DCI.CombineTo(LD, NewResults);
18451 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
18452 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
18454 break;
18456 return SDValue();
18459 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
18460 /// address translation.
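/// For example (illustrative), an address computed as
///   (and X, #0x00ffffffffffffff)
/// that is only consumed by loads and stores can be simplified to plain X,
/// since the hardware ignores the masked-off top byte anyway.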
18461 static bool performTBISimplification(SDValue Addr,
18462 TargetLowering::DAGCombinerInfo &DCI,
18463 SelectionDAG &DAG) {
18464 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
18465 KnownBits Known;
18466 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
18467 !DCI.isBeforeLegalizeOps());
18468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18469 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
18470 DCI.CommitTargetLoweringOpt(TLO);
18471 return true;
18473 return false;
18476 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
18477 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
18478 "Expected STORE dag node in input!");
18480 if (auto Store = dyn_cast<StoreSDNode>(N)) {
18481 if (!Store->isTruncatingStore() || Store->isIndexed())
18482 return SDValue();
18483 SDValue Ext = Store->getValue();
18484 auto ExtOpCode = Ext.getOpcode();
18485 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
18486 ExtOpCode != ISD::ANY_EXTEND)
18487 return SDValue();
18488 SDValue Orig = Ext->getOperand(0);
18489 if (Store->getMemoryVT() != Orig.getValueType())
18490 return SDValue();
18491 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
18492 Store->getBasePtr(), Store->getMemOperand());
18495 return SDValue();
18498 // Perform TBI simplification if supported by the target, and try to break up
18499 // non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP
18500 // (Q-register pair) load instructions can be selected.
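// For example (illustrative), a non-temporal v20i16 (320-bit) load can be
// split into a 256-bit v16i16 load plus a v4i16 load of the remaining
// elements, which are then concatenated back into the original type.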
18501 static SDValue performLOADCombine(SDNode *N,
18502 TargetLowering::DAGCombinerInfo &DCI,
18503 SelectionDAG &DAG,
18504 const AArch64Subtarget *Subtarget) {
18505 if (Subtarget->supportsAddressTopByteIgnored())
18506 performTBISimplification(N->getOperand(1), DCI, DAG);
18508 LoadSDNode *LD = cast<LoadSDNode>(N);
18509 EVT MemVT = LD->getMemoryVT();
18510 if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
18511 return SDValue(N, 0);
18513 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
18514 MemVT.getSizeInBits() % 256 == 0 ||
18515 256 % MemVT.getScalarSizeInBits() != 0)
18516 return SDValue(N, 0);
18518 SDLoc DL(LD);
18519 SDValue Chain = LD->getChain();
18520 SDValue BasePtr = LD->getBasePtr();
18521 SDNodeFlags Flags = LD->getFlags();
18522 SmallVector<SDValue, 4> LoadOps;
18523 SmallVector<SDValue, 4> LoadOpsChain;
18524 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
18525 // and a scalar/vector load of less than 256 bits. This way we can utilize
18526 // 256-bit loads and reduce the number of load instructions generated.
18527 MVT NewVT =
18528 MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
18529 256 / MemVT.getVectorElementType().getSizeInBits());
18530 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
18531 // Create the 256-bit loads, at byte offsets 0, 32, ..., (Num256Loads - 1) * 32.
18532 for (unsigned I = 0; I < Num256Loads; I++) {
18533 unsigned PtrOffset = I * 32;
18534 SDValue NewPtr = DAG.getMemBasePlusOffset(
18535 BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
18536 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
18537 SDValue NewLoad = DAG.getLoad(
18538 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
18539 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
18540 LoadOps.push_back(NewLoad);
18541 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
18544 // Process the remaining bits of the load operation.
18545 // This is done by creating an UNDEF vector to match the size of the
18546 // 256-bit loads and inserting the remaining load into it. We extract the
18547 // original load type at the end using an EXTRACT_SUBVECTOR node.
18548 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
18549 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
18550 MVT RemainingVT = MVT::getVectorVT(
18551 MemVT.getVectorElementType().getSimpleVT(),
18552 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
18553 SDValue NewPtr =
18554 DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(PtrOffset), DL, Flags);
18555 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
18556 SDValue RemainingLoad =
18557 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
18558 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
18559 LD->getMemOperand()->getFlags(), LD->getAAInfo());
18560 SDValue UndefVector = DAG.getUNDEF(NewVT);
18561 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
18562 SDValue ExtendedRemainingLoad =
18563 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
18564 {UndefVector, RemainingLoad, InsertIdx});
18565 LoadOps.push_back(ExtendedRemainingLoad);
18566 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
18567 EVT ConcatVT =
18568 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18569 LoadOps.size() * NewVT.getVectorNumElements());
18570 SDValue ConcatVectors =
18571 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
18572 // Extract the original vector type size.
18573 SDValue ExtractSubVector =
18574 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
18575 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
18576 SDValue TokenFactor =
18577 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
18578 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
18581 static SDValue performSTORECombine(SDNode *N,
18582 TargetLowering::DAGCombinerInfo &DCI,
18583 SelectionDAG &DAG,
18584 const AArch64Subtarget *Subtarget) {
18585 StoreSDNode *ST = cast<StoreSDNode>(N);
18586 SDValue Chain = ST->getChain();
18587 SDValue Value = ST->getValue();
18588 SDValue Ptr = ST->getBasePtr();
18590 // If this is an FP_ROUND followed by a store, fold this into a truncating
18591 // store. We can do this even if this is already a truncstore.
18592 // We purposefully don't care about legality of the nodes here as we know
18593 // they can be split down into something legal.
18594 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
18595 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
18596 Subtarget->useSVEForFixedLengthVectors() &&
18597 Value.getValueType().isFixedLengthVector() &&
18598 Value.getValueType().getFixedSizeInBits() >=
18599 Subtarget->getMinSVEVectorSizeInBits())
18600 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
18601 ST->getMemoryVT(), ST->getMemOperand());
18603 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
18604 return Split;
18606 if (Subtarget->supportsAddressTopByteIgnored() &&
18607 performTBISimplification(N->getOperand(2), DCI, DAG))
18608 return SDValue(N, 0);
18610 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
18611 return Store;
18613 return SDValue();
18616 static SDValue performMSTORECombine(SDNode *N,
18617 TargetLowering::DAGCombinerInfo &DCI,
18618 SelectionDAG &DAG,
18619 const AArch64Subtarget *Subtarget) {
18620 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
18621 SDValue Value = MST->getValue();
18622 SDValue Mask = MST->getMask();
18623 SDLoc DL(N);
18625 // If this is a UZP1 followed by a masked store, fold this into a masked
18626 // truncating store. We can do this even if this is already a masked
18627 // truncstore.
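  // A sketch of the pattern (types illustrative): a masked store of
  //   nxv16i8 (uzp1 (bitcast nxv8i16 X to nxv16i8), ...)
  // whose ptrue predicate only covers the elements that come from X can be
  // rewritten as a truncating masked store of X itself, using a predicate
  // with half as many (double-width) elements.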
18628 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
18629 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
18630 Value.getValueType().isInteger()) {
18631 Value = Value.getOperand(0);
18632 if (Value.getOpcode() == ISD::BITCAST) {
18633 EVT HalfVT =
18634 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
18635 EVT InVT = Value.getOperand(0).getValueType();
18637 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
18638 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
18639 unsigned PgPattern = Mask->getConstantOperandVal(0);
18641 // Ensure we can double the size of the predicate pattern
18642 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
18643 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
18644 MinSVESize) {
18645 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
18646 PgPattern);
18647 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
18648 MST->getBasePtr(), MST->getOffset(), Mask,
18649 MST->getMemoryVT(), MST->getMemOperand(),
18650 MST->getAddressingMode(),
18651 /*IsTruncating=*/true);
18657 return SDValue();
18660 /// \return true if part of the index was folded into the Base.
18661 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
18662 SDLoc DL, SelectionDAG &DAG) {
18663 // This function assumes a vector of i64 indices.
18664 EVT IndexVT = Index.getValueType();
18665 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
18666 return false;
18668 // Simplify:
18669 // BasePtr = Ptr
18670 // Index = X + splat(Offset)
18671 // ->
18672 // BasePtr = Ptr + Offset * scale.
18673 // Index = X
18674 if (Index.getOpcode() == ISD::ADD) {
18675 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
18676 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
18677 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
18678 Index = Index.getOperand(0);
18679 return true;
18683 // Simplify:
18684 // BasePtr = Ptr
18685 // Index = (X + splat(Offset)) << splat(Shift)
18686 // ->
18687 // BasePtr = Ptr + (Offset << Shift) * scale
18688 // Index = X << splat(shift)
18689 if (Index.getOpcode() == ISD::SHL &&
18690 Index.getOperand(0).getOpcode() == ISD::ADD) {
18691 SDValue Add = Index.getOperand(0);
18692 SDValue ShiftOp = Index.getOperand(1);
18693 SDValue OffsetOp = Add.getOperand(1);
18694 if (auto Shift = DAG.getSplatValue(ShiftOp))
18695 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
18696 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
18697 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
18698 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
18699 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
18700 Add.getOperand(0), ShiftOp);
18701 return true;
18705 return false;
18708 // Analyse the specified address returning true if a more optimal addressing
18709 // mode is available. When returning true all parameters are updated to reflect
18710 // their recommended values.
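// For example (illustrative), a gather whose index is step_vector(i64 4) can
// instead use a 32-bit step_vector(i32 4), enabling the 32-bit scaled-offset
// addressing modes, provided the largest offset reachable for the maximum SVE
// vector length still fits in an i32.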
18711 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
18712 SDValue &BasePtr, SDValue &Index,
18713 SelectionDAG &DAG) {
18714 // Try to iteratively fold parts of the index into the base pointer to
18715 // simplify the index as much as possible.
18716 bool Changed = false;
18717 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
18718 Changed = true;
18720 // Only consider element types that are pointer sized as smaller types can
18721 // be easily promoted.
18722 EVT IndexVT = Index.getValueType();
18723 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
18724 return Changed;
18726 // Can indices be trivially shrunk?
18727 EVT DataVT = N->getOperand(1).getValueType();
18728 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since it
18729 // will later be re-extended to 64 bits in legalization.
18730 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
18731 return Changed;
18732 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
18733 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
18734 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
18735 return true;
18738 // Match:
18739 // Index = step(const)
18740 int64_t Stride = 0;
18741 if (Index.getOpcode() == ISD::STEP_VECTOR) {
18742 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
18744 // Match:
18745 // Index = step(const) << shift(const)
18746 else if (Index.getOpcode() == ISD::SHL &&
18747 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
18748 SDValue RHS = Index.getOperand(1);
18749 if (auto *Shift =
18750 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
18751 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
18752 Stride = Step << Shift->getZExtValue();
18756 // Return early because no supported pattern is found.
18757 if (Stride == 0)
18758 return Changed;
18760 if (Stride < std::numeric_limits<int32_t>::min() ||
18761 Stride > std::numeric_limits<int32_t>::max())
18762 return Changed;
18764 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18765 unsigned MaxVScale =
18766 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
18767 int64_t LastElementOffset =
18768 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
18770 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
18771 LastElementOffset > std::numeric_limits<int32_t>::max())
18772 return Changed;
18774 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
18775 // Stride is not scaled by 'Scale' explicitly here, because that scaling
18776 // happens as part of the gather/scatter addressing mode.
18777 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
18778 return true;
18781 static SDValue performMaskedGatherScatterCombine(
18782 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
18783 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
18784 assert(MGS && "Can only combine gather load or scatter store nodes");
18786 if (!DCI.isBeforeLegalize())
18787 return SDValue();
18789 SDLoc DL(MGS);
18790 SDValue Chain = MGS->getChain();
18791 SDValue Scale = MGS->getScale();
18792 SDValue Index = MGS->getIndex();
18793 SDValue Mask = MGS->getMask();
18794 SDValue BasePtr = MGS->getBasePtr();
18795 ISD::MemIndexType IndexType = MGS->getIndexType();
18797 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
18798 return SDValue();
18800 // A more optimal index was found, so rebuild the gather/scatter node with the
18801 // updated operands to use an Index that's more legalisation friendly.
18802 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
18803 SDValue PassThru = MGT->getPassThru();
18804 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
18805 return DAG.getMaskedGather(
18806 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
18807 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
18809 auto *MSC = cast<MaskedScatterSDNode>(MGS);
18810 SDValue Data = MSC->getValue();
18811 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
18812 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
18813 Ops, MSC->getMemOperand(), IndexType,
18814 MSC->isTruncatingStore());
18817 /// Target-specific DAG combine function for NEON load/store intrinsics
18818 /// to merge base address updates.
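/// For example (illustrative), an @llvm.aarch64.neon.ld2 of two v4i32 vectors
/// whose address is subsequently advanced by 32 bytes (the size of the two
/// vectors) can be selected as a post-indexed "ld2 { v0.4s, v1.4s }, [x0], #32".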
18819 static SDValue performNEONPostLDSTCombine(SDNode *N,
18820 TargetLowering::DAGCombinerInfo &DCI,
18821 SelectionDAG &DAG) {
18822 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
18823 return SDValue();
18825 unsigned AddrOpIdx = N->getNumOperands() - 1;
18826 SDValue Addr = N->getOperand(AddrOpIdx);
18828 // Search for a use of the address operand that is an increment.
18829 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
18830 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
18831 SDNode *User = *UI;
18832 if (User->getOpcode() != ISD::ADD ||
18833 UI.getUse().getResNo() != Addr.getResNo())
18834 continue;
18836 // Check that the add is independent of the load/store. Otherwise, folding
18837 // it would create a cycle.
18838 SmallPtrSet<const SDNode *, 32> Visited;
18839 SmallVector<const SDNode *, 16> Worklist;
18840 Visited.insert(Addr.getNode());
18841 Worklist.push_back(N);
18842 Worklist.push_back(User);
18843 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
18844 SDNode::hasPredecessorHelper(User, Visited, Worklist))
18845 continue;
18847 // Find the new opcode for the updating load/store.
18848 bool IsStore = false;
18849 bool IsLaneOp = false;
18850 bool IsDupOp = false;
18851 unsigned NewOpc = 0;
18852 unsigned NumVecs = 0;
18853 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
18854 switch (IntNo) {
18855 default: llvm_unreachable("unexpected intrinsic for Neon base update");
18856 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
18857 NumVecs = 2; break;
18858 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
18859 NumVecs = 3; break;
18860 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
18861 NumVecs = 4; break;
18862 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
18863 NumVecs = 2; IsStore = true; break;
18864 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
18865 NumVecs = 3; IsStore = true; break;
18866 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
18867 NumVecs = 4; IsStore = true; break;
18868 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
18869 NumVecs = 2; break;
18870 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
18871 NumVecs = 3; break;
18872 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
18873 NumVecs = 4; break;
18874 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
18875 NumVecs = 2; IsStore = true; break;
18876 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
18877 NumVecs = 3; IsStore = true; break;
18878 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
18879 NumVecs = 4; IsStore = true; break;
18880 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
18881 NumVecs = 2; IsDupOp = true; break;
18882 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
18883 NumVecs = 3; IsDupOp = true; break;
18884 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
18885 NumVecs = 4; IsDupOp = true; break;
18886 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
18887 NumVecs = 2; IsLaneOp = true; break;
18888 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
18889 NumVecs = 3; IsLaneOp = true; break;
18890 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
18891 NumVecs = 4; IsLaneOp = true; break;
18892 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
18893 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
18894 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
18895 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
18896 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
18897 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
18900 EVT VecTy;
18901 if (IsStore)
18902 VecTy = N->getOperand(2).getValueType();
18903 else
18904 VecTy = N->getValueType(0);
18906 // If the increment is a constant, it must match the memory ref size.
18907 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
18908 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
18909 uint32_t IncVal = CInc->getZExtValue();
18910 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
18911 if (IsLaneOp || IsDupOp)
18912 NumBytes /= VecTy.getVectorNumElements();
18913 if (IncVal != NumBytes)
18914 continue;
18915 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
18917 SmallVector<SDValue, 8> Ops;
18918 Ops.push_back(N->getOperand(0)); // Incoming chain
18919 // Lane loads and stores take a vector list as input.
18920 if (IsLaneOp || IsStore)
18921 for (unsigned i = 2; i < AddrOpIdx; ++i)
18922 Ops.push_back(N->getOperand(i));
18923 Ops.push_back(Addr); // Base register
18924 Ops.push_back(Inc);
18926 // Return Types.
18927 EVT Tys[6];
18928 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
18929 unsigned n;
18930 for (n = 0; n < NumResultVecs; ++n)
18931 Tys[n] = VecTy;
18932 Tys[n++] = MVT::i64; // Type of write back register
18933 Tys[n] = MVT::Other; // Type of the chain
18934 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
18936 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
18937 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
18938 MemInt->getMemoryVT(),
18939 MemInt->getMemOperand());
18941 // Update the uses.
18942 std::vector<SDValue> NewResults;
18943 for (unsigned i = 0; i < NumResultVecs; ++i) {
18944 NewResults.push_back(SDValue(UpdN.getNode(), i));
18946 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
18947 DCI.CombineTo(N, NewResults);
18948 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
18950 break;
18952 return SDValue();
18955 // Checks to see if the value is the prescribed width and returns information
18956 // about its extension mode.
18957 static
18958 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
18959 ExtType = ISD::NON_EXTLOAD;
18960 switch(V.getNode()->getOpcode()) {
18961 default:
18962 return false;
18963 case ISD::LOAD: {
18964 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
18965 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
18966 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
18967 ExtType = LoadNode->getExtensionType();
18968 return true;
18970 return false;
18972 case ISD::AssertSext: {
18973 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18974 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18975 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18976 ExtType = ISD::SEXTLOAD;
18977 return true;
18979 return false;
18981 case ISD::AssertZext: {
18982 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18983 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18984 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18985 ExtType = ISD::ZEXTLOAD;
18986 return true;
18988 return false;
18990 case ISD::Constant:
18991 case ISD::TargetConstant: {
18992 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
18993 1LL << (width - 1);
18997 return true;
19000 // This function does a whole lot of voodoo to determine if the tests are
19001 // equivalent without and with a mask. Essentially what happens is that given a
19002 // DAG resembling:
19004 // +-------------+ +-------------+ +-------------+ +-------------+
19005 // | Input | | AddConstant | | CompConstant| | CC |
19006 // +-------------+ +-------------+ +-------------+ +-------------+
19007 // | | | |
19008 // V V | +----------+
19009 // +-------------+ +----+ | |
19010 // | ADD | |0xff| | |
19011 // +-------------+ +----+ | |
19012 // | | | |
19013 // V V | |
19014 // +-------------+ | |
19015 // | AND | | |
19016 // +-------------+ | |
19017 // | | |
19018 // +-----+ | |
19019 // | | |
19020 // V V V
19021 // +-------------+
19022 // | CMP |
19023 // +-------------+
19025 // The AND node may be safely removed for some combinations of inputs. In
19026 // particular we need to take into account the extension type of the Input,
19027 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
19028 // width of the input (this can work for inputs of any width; the above graph
19029 // is specific to 8 bits).
19031 // The specific equations were worked out by generating output tables for each
19032 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
19033 // problem was simplified by working with 4-bit inputs, which means we only
19034 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
19035 // extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
19036 // patterns present in both extensions (0..7). For every distinct pair of
19037 // AddConstant and CompConstant bit patterns we can consider the masked and
19038 // unmasked versions to be equivalent if the result of this function is true
19039 // for all 16 distinct bit patterns for the current extension type of Input (w0).
19041 // sub w8, w0, w1
19042 // and w10, w8, #0x0f
19043 // cmp w8, w2
19044 // cset w9, AArch64CC
19045 // cmp w10, w2
19046 // cset w11, AArch64CC
19047 // cmp w9, w11
19048 // cset w0, eq
19049 // ret
19051 // Since the above function shows when the outputs are equivalent it defines
19052 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
19053 // would be expensive to run during compiles. The equations below were written
19054 // in a test harness that confirmed they give outputs equivalent to the above
19055 // function for all inputs, so they can be used to determine whether the
19056 // removal is legal instead.
19058 // isEquivalentMaskless() is the test for whether the AND can be removed,
19059 // factored out of the DAG recognition because the DAG can take several forms.
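//
// A hand-worked example (illustrative): with width == 8, a zero-extending load
// (so no adjustment of AddConstant below), CC == EQ or NE, AddConstant == 1 and
// CompConstant == -1, the first clause of the EQ/NE case
// (AddConstant > 0 && CompConstant < 0) holds, so isEquivalentMaskless returns
// true and the AND may be removed.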
19061 static bool isEquivalentMaskless(unsigned CC, unsigned width,
19062 ISD::LoadExtType ExtType, int AddConstant,
19063 int CompConstant) {
19064 // By being careful about our equations and only writing them in terms of
19065 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
19066 // make them generally applicable to all bit widths.
19067 int MaxUInt = (1 << width);
19069 // For the purposes of these comparisons sign extending the type is
19070 // equivalent to zero extending the add and displacing it by half the integer
19071 // width. Provided we are careful and make sure our equations are valid over
19072 // the whole range we can just adjust the input and avoid writing equations
19073 // for sign extended inputs.
19074 if (ExtType == ISD::SEXTLOAD)
19075 AddConstant -= (1 << (width-1));
19077 switch(CC) {
19078 case AArch64CC::LE:
19079 case AArch64CC::GT:
19080 if ((AddConstant == 0) ||
19081 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
19082 (AddConstant >= 0 && CompConstant < 0) ||
19083 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
19084 return true;
19085 break;
19086 case AArch64CC::LT:
19087 case AArch64CC::GE:
19088 if ((AddConstant == 0) ||
19089 (AddConstant >= 0 && CompConstant <= 0) ||
19090 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
19091 return true;
19092 break;
19093 case AArch64CC::HI:
19094 case AArch64CC::LS:
19095 if ((AddConstant >= 0 && CompConstant < 0) ||
19096 (AddConstant <= 0 && CompConstant >= -1 &&
19097 CompConstant < AddConstant + MaxUInt))
19098 return true;
19099 break;
19100 case AArch64CC::PL:
19101 case AArch64CC::MI:
19102 if ((AddConstant == 0) ||
19103 (AddConstant > 0 && CompConstant <= 0) ||
19104 (AddConstant < 0 && CompConstant <= AddConstant))
19105 return true;
19106 break;
19107 case AArch64CC::LO:
19108 case AArch64CC::HS:
19109 if ((AddConstant >= 0 && CompConstant <= 0) ||
19110 (AddConstant <= 0 && CompConstant >= 0 &&
19111 CompConstant <= AddConstant + MaxUInt))
19112 return true;
19113 break;
19114 case AArch64CC::EQ:
19115 case AArch64CC::NE:
19116 if ((AddConstant > 0 && CompConstant < 0) ||
19117 (AddConstant < 0 && CompConstant >= 0 &&
19118 CompConstant < AddConstant + MaxUInt) ||
19119 (AddConstant >= 0 && CompConstant >= 0 &&
19120 CompConstant >= AddConstant) ||
19121 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
19122 return true;
19123 break;
19124 case AArch64CC::VS:
19125 case AArch64CC::VC:
19126 case AArch64CC::AL:
19127 case AArch64CC::NV:
19128 return true;
19129 case AArch64CC::Invalid:
19130 break;
19133 return false;
19136 static
19137 SDValue performCONDCombine(SDNode *N,
19138 TargetLowering::DAGCombinerInfo &DCI,
19139 SelectionDAG &DAG, unsigned CCIndex,
19140 unsigned CmpIndex) {
19141 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
19142 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
19143 unsigned CondOpcode = SubsNode->getOpcode();
19145 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0))
19146 return SDValue();
19148 // There is a SUBS feeding this condition. Is it fed by a mask we can
19149 // use?
19151 SDNode *AndNode = SubsNode->getOperand(0).getNode();
19152 unsigned MaskBits = 0;
19154 if (AndNode->getOpcode() != ISD::AND)
19155 return SDValue();
19157 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
19158 uint32_t CNV = CN->getZExtValue();
19159 if (CNV == 255)
19160 MaskBits = 8;
19161 else if (CNV == 65535)
19162 MaskBits = 16;
19165 if (!MaskBits)
19166 return SDValue();
19168 SDValue AddValue = AndNode->getOperand(0);
19170 if (AddValue.getOpcode() != ISD::ADD)
19171 return SDValue();
19173 // The basic dag structure is correct, grab the inputs and validate them.
19175 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
19176 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
19177 SDValue SubsInputValue = SubsNode->getOperand(1);
19179 // The mask is present and the provenance of all the values is a smaller type,
19180 // let's see whether the mask is superfluous.
19182 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
19183 !isa<ConstantSDNode>(SubsInputValue.getNode()))
19184 return SDValue();
19186 ISD::LoadExtType ExtType;
19188 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
19189 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
19190 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
19191 return SDValue();
19193 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
19194 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
19195 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
19196 return SDValue();
19198 // The AND is not necessary, remove it.
19200 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
19201 SubsNode->getValueType(1));
19202 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
19204 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
19205 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
19207 return SDValue(N, 0);
19210 // Optimize compare with zero and branch.
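// For example (illustrative): a BRCOND on (SUBS x, 0) with condition NE becomes
// (CBNZ x, dest), and with condition EQ becomes (CBZ x, dest).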
19211 static SDValue performBRCONDCombine(SDNode *N,
19212 TargetLowering::DAGCombinerInfo &DCI,
19213 SelectionDAG &DAG) {
19214 MachineFunction &MF = DAG.getMachineFunction();
19215 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
19216 // will not be produced, as they are conditional branch instructions that do
19217 // not set flags.
19218 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
19219 return SDValue();
19221 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
19222 N = NV.getNode();
19223 SDValue Chain = N->getOperand(0);
19224 SDValue Dest = N->getOperand(1);
19225 SDValue CCVal = N->getOperand(2);
19226 SDValue Cmp = N->getOperand(3);
19228 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
19229 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
19230 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
19231 return SDValue();
19233 unsigned CmpOpc = Cmp.getOpcode();
19234 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
19235 return SDValue();
19237 // Only attempt folding if there is only one use of the flag and no use of the
19238 // value.
19239 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
19240 return SDValue();
19242 SDValue LHS = Cmp.getOperand(0);
19243 SDValue RHS = Cmp.getOperand(1);
19245 assert(LHS.getValueType() == RHS.getValueType() &&
19246 "Expected the value type to be the same for both operands!");
19247 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
19248 return SDValue();
19250 if (isNullConstant(LHS))
19251 std::swap(LHS, RHS);
19253 if (!isNullConstant(RHS))
19254 return SDValue();
19256 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
19257 LHS.getOpcode() == ISD::SRL)
19258 return SDValue();
19260 // Fold the compare into the branch instruction.
19261 SDValue BR;
19262 if (CC == AArch64CC::EQ)
19263 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
19264 else
19265 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
19267 // Do not add new nodes to DAG combiner worklist.
19268 DCI.CombineTo(N, BR, false);
19270 return SDValue();
19273 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
19274 unsigned CC = N->getConstantOperandVal(2);
19275 SDValue SUBS = N->getOperand(3);
19276 SDValue Zero, CTTZ;
19278 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
19279 Zero = N->getOperand(0);
19280 CTTZ = N->getOperand(1);
19281 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
19282 Zero = N->getOperand(1);
19283 CTTZ = N->getOperand(0);
19284 } else
19285 return SDValue();
19287 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
19288 (CTTZ.getOpcode() == ISD::TRUNCATE &&
19289 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
19290 return SDValue();
19292 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
19293 "Illegal type in CTTZ folding");
19295 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
19296 return SDValue();
19298 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
19299 ? CTTZ.getOperand(0).getOperand(0)
19300 : CTTZ.getOperand(0);
19302 if (X != SUBS.getOperand(0))
19303 return SDValue();
19305 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
19306 ? CTTZ.getOperand(0).getValueSizeInBits()
19307 : CTTZ.getValueSizeInBits();
19308 SDValue BitWidthMinusOne =
19309 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
19310 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
19311 BitWidthMinusOne);
19314 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
19315 // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
19316 // Where x and y are constants
19318 // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
19319 // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
19320 // Where x and y are constants
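//
// A concrete instance (illustrative): (CSEL l r EQ (CMP (CSEL 1 0 GT cond) 0))
// compares against y (the constant 0), so cc2 (GT) is inverted and the result
// is (CSEL l r LE cond).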
19321 static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
19322 SDValue L = Op->getOperand(0);
19323 SDValue R = Op->getOperand(1);
19324 AArch64CC::CondCode OpCC =
19325 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
19327 SDValue OpCmp = Op->getOperand(3);
19328 if (!isCMP(OpCmp))
19329 return SDValue();
19331 SDValue CmpLHS = OpCmp.getOperand(0);
19332 SDValue CmpRHS = OpCmp.getOperand(1);
19334 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
19335 std::swap(CmpLHS, CmpRHS);
19336 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
19337 return SDValue();
19339 SDValue X = CmpLHS->getOperand(0);
19340 SDValue Y = CmpLHS->getOperand(1);
19341 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y)) {
19342 return SDValue();
19345 AArch64CC::CondCode CC =
19346 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
19347 SDValue Cond = CmpLHS->getOperand(3);
19349 if (CmpRHS == Y)
19350 CC = AArch64CC::getInvertedCondCode(CC);
19351 else if (CmpRHS != X)
19352 return SDValue();
19354 if (OpCC == AArch64CC::NE)
19355 CC = AArch64CC::getInvertedCondCode(CC);
19356 else if (OpCC != AArch64CC::EQ)
19357 return SDValue();
19359 SDLoc DL(Op);
19360 EVT VT = Op->getValueType(0);
19362 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
19363 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
19366 // Optimize CSEL instructions
19367 static SDValue performCSELCombine(SDNode *N,
19368 TargetLowering::DAGCombinerInfo &DCI,
19369 SelectionDAG &DAG) {
19370 // CSEL x, x, cc -> x
19371 if (N->getOperand(0) == N->getOperand(1))
19372 return N->getOperand(0);
19374 if (SDValue R = foldCSELOfCSEL(N, DAG))
19375 return R;
19377 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
19378 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
19379 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
19380 return Folded;
19382 return performCONDCombine(N, DCI, DAG, 2, 3);
19385 // Try to re-use an already extended operand of a vector SetCC feeding an
19386 // extended select. Doing so avoids requiring another full extension of the
19387 // SET_CC result when lowering the select.
19388 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
19389 EVT Op0MVT = Op->getOperand(0).getValueType();
19390 if (!Op0MVT.isVector() || Op->use_empty())
19391 return SDValue();
19393 // Make sure that all uses of Op are VSELECTs with result matching types where
19394 // the result type has a larger element type than the SetCC operand.
19395 SDNode *FirstUse = *Op->use_begin();
19396 if (FirstUse->getOpcode() != ISD::VSELECT)
19397 return SDValue();
19398 EVT UseMVT = FirstUse->getValueType(0);
19399 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
19400 return SDValue();
19401 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
19402 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
19404 return SDValue();
19406 APInt V;
19407 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
19408 return SDValue();
19410 SDLoc DL(Op);
19411 SDValue Op0ExtV;
19412 SDValue Op1ExtV;
19413 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
19414 // Check if the first operand of the SET_CC is already extended. If it is,
19415 // split the SET_CC and re-use the extended version of the operand.
19416 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
19417 Op->getOperand(0));
19418 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
19419 Op->getOperand(0));
19420 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
19421 Op0ExtV = SDValue(Op0SExt, 0);
19422 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
19423 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
19424 Op0ExtV = SDValue(Op0ZExt, 0);
19425 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
19426 } else
19427 return SDValue();
19429 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
19430 Op0ExtV, Op1ExtV, Op->getOperand(2));
19433 static SDValue performSETCCCombine(SDNode *N,
19434 TargetLowering::DAGCombinerInfo &DCI,
19435 SelectionDAG &DAG) {
19436 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
19437 SDValue LHS = N->getOperand(0);
19438 SDValue RHS = N->getOperand(1);
19439 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
19440 SDLoc DL(N);
19441 EVT VT = N->getValueType(0);
19443 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
19444 return V;
19446 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
19447 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
19448 LHS->getOpcode() == AArch64ISD::CSEL &&
19449 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
19450 LHS->hasOneUse()) {
19451 // Invert CSEL's condition.
19452 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
19453 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
19454 auto NewCond = getInvertedCondCode(OldCond);
19456 // csel 0, 1, !cond, X
19457 SDValue CSEL =
19458 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
19459 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
19460 LHS.getOperand(3));
19461 return DAG.getZExtOrTrunc(CSEL, DL, VT);
19464 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
19465 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
19466 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
19467 LHS->hasOneUse()) {
19468 EVT TstVT = LHS->getValueType(0);
19469 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
19470 // This pattern will be optimized better in emitComparison.
19471 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
19472 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
19473 DAG.getConstant(TstImm, DL, TstVT));
19474 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
19478 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
19479 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
19480 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
19481 (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
19482 LHS->getOpcode() == ISD::BITCAST) {
19483 EVT ToVT = LHS->getValueType(0);
19484 EVT FromVT = LHS->getOperand(0).getValueType();
19485 if (FromVT.isFixedLengthVector() &&
19486 FromVT.getVectorElementType() == MVT::i1) {
19487 LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
19488 LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
19489 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
19493 return SDValue();
19496 // Replace a flag-setting operator (eg ANDS) with the generic version
19497 // (eg AND) if the flag is unused.
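// For example (illustrative): an (ANDS x, y) whose NZCV result is unused is
// rewritten as (AND x, y) with a constant 0 standing in for the dead flag
// value, while an existing identical (AND x, y) node has its uses redirected to
// this node's value result.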
19498 static SDValue performFlagSettingCombine(SDNode *N,
19499 TargetLowering::DAGCombinerInfo &DCI,
19500 unsigned GenericOpcode) {
19501 SDLoc DL(N);
19502 SDValue LHS = N->getOperand(0);
19503 SDValue RHS = N->getOperand(1);
19504 EVT VT = N->getValueType(0);
19506 // If the flag result isn't used, convert back to a generic opcode.
19507 if (!N->hasAnyUseOfValue(1)) {
19508 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
19509 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
19510 DL);
19513 // Combine identical generic nodes into this node, re-using the result.
19514 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
19515 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
19516 DCI.CombineTo(Generic, SDValue(N, 0));
19518 return SDValue();
19521 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
19522 // setcc_merge_zero pred
19523 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
19524 // => extract_subvector (inner setcc_merge_zero)
19525 SDValue Pred = N->getOperand(0);
19526 SDValue LHS = N->getOperand(1);
19527 SDValue RHS = N->getOperand(2);
19528 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
19530 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
19531 LHS->getOpcode() != ISD::SIGN_EXTEND)
19532 return SDValue();
19534 SDValue Extract = LHS->getOperand(0);
19535 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19536 Extract->getValueType(0) != N->getValueType(0) ||
19537 Extract->getConstantOperandVal(1) != 0)
19538 return SDValue();
19540 SDValue InnerSetCC = Extract->getOperand(0);
19541 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
19542 return SDValue();
19544 // By this point we've effectively got
19545 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
19546 // lanes are already zero then the trunc(sext()) sequence is redundant and we
19547 // can operate on A directly.
19548 SDValue InnerPred = InnerSetCC.getOperand(0);
19549 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
19550 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
19551 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
19552 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
19553 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
19554 return Extract;
19556 return SDValue();
19559 static SDValue
19560 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19561 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
19562 "Unexpected opcode!");
19564 SelectionDAG &DAG = DCI.DAG;
19565 SDValue Pred = N->getOperand(0);
19566 SDValue LHS = N->getOperand(1);
19567 SDValue RHS = N->getOperand(2);
19568 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
19570 if (SDValue V = performSetCCPunpkCombine(N, DAG))
19571 return V;
19573 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
19574 LHS->getOpcode() == ISD::SIGN_EXTEND &&
19575 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
19576 // setcc_merge_zero(
19577 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
19578 // => setcc_merge_zero(pred, ...)
19579 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
19580 LHS->getOperand(0)->getOperand(0) == Pred)
19581 return LHS->getOperand(0);
19583 // setcc_merge_zero(
19584 // all_active, extend(nxvNi1 ...), != splat(0))
19585 // -> nxvNi1 ...
19586 if (isAllActivePredicate(DAG, Pred))
19587 return LHS->getOperand(0);
19589 // setcc_merge_zero(
19590 // pred, extend(nxvNi1 ...), != splat(0))
19591 // -> nxvNi1 and(pred, ...)
19592 if (DCI.isAfterLegalizeDAG())
19593 // Do this after legalization to allow more folds on setcc_merge_zero
19594 // to be recognized.
19595 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
19596 LHS->getOperand(0), Pred);
19599 return SDValue();
19602 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
19603 // as well as whether the test should be inverted. This code is required to
19604 // catch these cases (as opposed to standard dag combines) because
19605 // AArch64ISD::TBZ is matched during legalization.
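//
// For example (illustrative): (tbz (srl x, 3), 1) tests bit 1 + 3 == 4 of x and
// so becomes (tbz x, 4), while (tbz (xor x, -1), b) becomes (tbnz x, b) via the
// Invert flag.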
19606 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
19607 SelectionDAG &DAG) {
19609 if (!Op->hasOneUse())
19610 return Op;
19612 // We don't handle undef/constant-fold cases below, as they should have
19613 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
19614 // etc.)
19616 // (tbz (trunc x), b) -> (tbz x, b)
19617 // This case is just here to enable more of the below cases to be caught.
19618 if (Op->getOpcode() == ISD::TRUNCATE &&
19619 Bit < Op->getValueType(0).getSizeInBits()) {
19620 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19623 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
19624 if (Op->getOpcode() == ISD::ANY_EXTEND &&
19625 Bit < Op->getOperand(0).getValueSizeInBits()) {
19626 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19629 if (Op->getNumOperands() != 2)
19630 return Op;
19632 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19633 if (!C)
19634 return Op;
19636 switch (Op->getOpcode()) {
19637 default:
19638 return Op;
19640 // (tbz (and x, m), b) -> (tbz x, b)
19641 case ISD::AND:
19642 if ((C->getZExtValue() >> Bit) & 1)
19643 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19644 return Op;
19646 // (tbz (shl x, c), b) -> (tbz x, b-c)
19647 case ISD::SHL:
19648 if (C->getZExtValue() <= Bit &&
19649 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
19650 Bit = Bit - C->getZExtValue();
19651 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19653 return Op;
19655 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
19656 case ISD::SRA:
19657 Bit = Bit + C->getZExtValue();
19658 if (Bit >= Op->getValueType(0).getSizeInBits())
19659 Bit = Op->getValueType(0).getSizeInBits() - 1;
19660 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19662 // (tbz (srl x, c), b) -> (tbz x, b+c)
19663 case ISD::SRL:
19664 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
19665 Bit = Bit + C->getZExtValue();
19666 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19668 return Op;
19670 // (tbz (xor x, -1), b) -> (tbnz x, b)
19671 case ISD::XOR:
19672 if ((C->getZExtValue() >> Bit) & 1)
19673 Invert = !Invert;
19674 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
19678 // Optimize test single bit zero/non-zero and branch.
19679 static SDValue performTBZCombine(SDNode *N,
19680 TargetLowering::DAGCombinerInfo &DCI,
19681 SelectionDAG &DAG) {
19682 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
19683 bool Invert = false;
19684 SDValue TestSrc = N->getOperand(1);
19685 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
19687 if (TestSrc == NewTestSrc)
19688 return SDValue();
19690 unsigned NewOpc = N->getOpcode();
19691 if (Invert) {
19692 if (NewOpc == AArch64ISD::TBZ)
19693 NewOpc = AArch64ISD::TBNZ;
19694 else {
19695 assert(NewOpc == AArch64ISD::TBNZ);
19696 NewOpc = AArch64ISD::TBZ;
19700 SDLoc DL(N);
19701 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
19702 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
19705 // Swap vselect operands where it may allow a predicated operation to achieve
19706 // the `sel`.
19708 // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
19709 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
19710 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
19711 auto SelectA = N->getOperand(1);
19712 auto SelectB = N->getOperand(2);
19713 auto NTy = N->getValueType(0);
19715 if (!NTy.isScalableVector())
19716 return SDValue();
19717 SDValue SetCC = N->getOperand(0);
19718 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
19719 return SDValue();
19721 switch (SelectB.getOpcode()) {
19722 default:
19723 return SDValue();
19724 case ISD::FMUL:
19725 case ISD::FSUB:
19726 case ISD::FADD:
19727 break;
19729 if (SelectA != SelectB.getOperand(0))
19730 return SDValue();
19732 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
19733 ISD::CondCode InverseCC =
19734 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
19735 auto InverseSetCC =
19736 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
19737 SetCC.getOperand(1), InverseCC);
19739 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
19740 {InverseSetCC, SelectB, SelectA});
19743 // vselect (v1i1 setcc) ->
19744 // vselect (v1iXX setcc) (XX is the size of the compared operand type)
19745 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
19746 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
19747 // such VSELECT.
19748 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
19749 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
19750 return SwapResult;
19752 SDValue N0 = N->getOperand(0);
19753 EVT CCVT = N0.getValueType();
19755 if (isAllActivePredicate(DAG, N0))
19756 return N->getOperand(1);
19758 if (isAllInactivePredicate(N0))
19759 return N->getOperand(2);
19761 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
19762 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
19763 // supported types.
19764 SDValue SetCC = N->getOperand(0);
19765 if (SetCC.getOpcode() == ISD::SETCC &&
19766 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
19767 SDValue CmpLHS = SetCC.getOperand(0);
19768 EVT VT = CmpLHS.getValueType();
19769 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
19770 SDNode *SplatLHS = N->getOperand(1).getNode();
19771 SDNode *SplatRHS = N->getOperand(2).getNode();
19772 APInt SplatLHSVal;
19773 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
19774 VT.isSimple() &&
19775 is_contained(
19776 makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
19777 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
19778 VT.getSimpleVT().SimpleTy) &&
19779 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
19780 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
19781 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
19782 unsigned NumElts = VT.getVectorNumElements();
19783 SmallVector<SDValue, 8> Ops(
19784 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
19785 VT.getScalarType()));
19786 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
19788 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
19789 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
19790 return Or;
19794 if (N0.getOpcode() != ISD::SETCC ||
19795 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
19796 CCVT.getVectorElementType() != MVT::i1)
19797 return SDValue();
19799 EVT ResVT = N->getValueType(0);
19800 EVT CmpVT = N0.getOperand(0).getValueType();
19801 // Only combine when the result type is of the same size as the compared
19802 // operands.
19803 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
19804 return SDValue();
19806 SDValue IfTrue = N->getOperand(1);
19807 SDValue IfFalse = N->getOperand(2);
19808 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
19809 N0.getOperand(0), N0.getOperand(1),
19810 cast<CondCodeSDNode>(N0.getOperand(2))->get());
19811 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
19812 IfTrue, IfFalse);
19815 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
19816 /// the compare-mask instructions rather than going via NZCV, even if LHS and
19817 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
19818 /// with a vector one followed by a DUP shuffle on the result.
19819 static SDValue performSelectCombine(SDNode *N,
19820 TargetLowering::DAGCombinerInfo &DCI) {
19821 SelectionDAG &DAG = DCI.DAG;
19822 SDValue N0 = N->getOperand(0);
19823 EVT ResVT = N->getValueType(0);
19825 if (N0.getOpcode() != ISD::SETCC)
19826 return SDValue();
19828 if (ResVT.isScalableVector())
19829 return SDValue();
19831 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
19832 // scalar SetCCResultType. We also don't expect vectors, because we assume
19833 // that selects fed by vector SETCCs are canonicalized to VSELECT.
19834 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
19835 "Scalar-SETCC feeding SELECT has unexpected result type!");
19837 // If NumMaskElts == 0, the comparison is larger than the select result. The
19838 // largest real NEON comparison is 64 bits per lane, which means the result is
19839 // at most 32 bits and an illegal vector. Just bail out for now.
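// For example (illustrative): a v2i16 select (32 bits) fed by an f64 compare
// yields 32 / 64 == 0 mask elements, so we bail out below.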
19840 EVT SrcVT = N0.getOperand(0).getValueType();
19842 // Don't try to do this optimization when the setcc itself has i1 operands.
19843 // There are no legal vectors of i1, so this would be pointless.
19844 if (SrcVT == MVT::i1)
19845 return SDValue();
19847 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
19848 if (!ResVT.isVector() || NumMaskElts == 0)
19849 return SDValue();
19851 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
19852 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
19854 // Also bail out if the vector CCVT isn't the same size as ResVT.
19855 // This can happen if the SETCC operand size doesn't divide the ResVT size
19856 // (e.g., f64 vs v3f32).
19857 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
19858 return SDValue();
19860 // Make sure we didn't create illegal types, if we're not supposed to.
19861 assert(DCI.isBeforeLegalize() ||
19862 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
19864 // First perform a vector comparison, where lane 0 is the one we're interested
19865 // in.
19866 SDLoc DL(N0);
19867 SDValue LHS =
19868 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
19869 SDValue RHS =
19870 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
19871 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
19873 // Now duplicate the comparison mask we want across all other lanes.
19874 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
19875 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
19876 Mask = DAG.getNode(ISD::BITCAST, DL,
19877 ResVT.changeVectorElementTypeToInteger(), Mask);
19879 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
19882 static SDValue performDUPCombine(SDNode *N,
19883 TargetLowering::DAGCombinerInfo &DCI) {
19884 EVT VT = N->getValueType(0);
19885 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
19886 // 128-bit vector version.
19887 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
19888 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
19889 if (SDNode *LN = DCI.DAG.getNodeIfExists(
19890 N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
19891 SDLoc DL(N);
19892 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
19893 DCI.DAG.getConstant(0, DL, MVT::i64));
19897 return performPostLD1Combine(N, DCI, false);
19900 /// Get rid of unnecessary NVCASTs (that don't change the type).
19901 static SDValue performNVCASTCombine(SDNode *N) {
19902 if (N->getValueType(0) == N->getOperand(0).getValueType())
19903 return N->getOperand(0);
19905 return SDValue();
19908 // If all users of the globaladdr are of the form (globaladdr + constant), find
19909 // the smallest constant, fold it into the globaladdr's offset and rewrite the
19910 // globaladdr as (globaladdr + constant) - constant.
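//
// For example (illustrative): if the only uses are (add G, 8) and (add G, 12),
// MinOffset is 8, and (assuming the bounds checks below pass) G is rewritten as
// (sub (globaladdr G + 8), 8).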
19911 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
19912 const AArch64Subtarget *Subtarget,
19913 const TargetMachine &TM) {
19914 auto *GN = cast<GlobalAddressSDNode>(N);
19915 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
19916 AArch64II::MO_NO_FLAG)
19917 return SDValue();
19919 uint64_t MinOffset = -1ull;
19920 for (SDNode *N : GN->uses()) {
19921 if (N->getOpcode() != ISD::ADD)
19922 return SDValue();
19923 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
19924 if (!C)
19925 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
19926 if (!C)
19927 return SDValue();
19928 MinOffset = std::min(MinOffset, C->getZExtValue());
19930 uint64_t Offset = MinOffset + GN->getOffset();
19932 // Require that the new offset is larger than the existing one. Otherwise, we
19933 // can end up oscillating between two possible DAGs, for example,
19934 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
19935 if (Offset <= uint64_t(GN->getOffset()))
19936 return SDValue();
19938 // Check whether folding this offset is legal. It must not go out of bounds of
19939 // the referenced object to avoid violating the code model, and must be
19940 // smaller than 2^20 because this is the largest offset expressible in all
19941 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
19942 // stores an immediate signed 21 bit offset.)
19944 // This check also prevents us from folding negative offsets, which will end
19945 // up being treated in the same way as large positive ones. They could also
19946 // cause code model violations, and aren't really common enough to matter.
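// (A signed 21-bit field covers [-2^20, 2^20 - 1], hence the 1 << 20 bound
// below.)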
19947 if (Offset >= (1 << 20))
19948 return SDValue();
19950 const GlobalValue *GV = GN->getGlobal();
19951 Type *T = GV->getValueType();
19952 if (!T->isSized() ||
19953 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
19954 return SDValue();
19956 SDLoc DL(GN);
19957 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
19958 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
19959 DAG.getConstant(MinOffset, DL, MVT::i64));
19962 // Turns the vector of indices into a vector of byte offsets by scaling Offset
19963 // by (BitWidth / 8).
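// For example (illustrative): for 32-bit elements each index is shifted left by
// Log2_32(32 / 8) == 2, i.e. multiplied by 4 to give a byte offset.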
19964 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
19965 SDLoc DL, unsigned BitWidth) {
19966 assert(Offset.getValueType().isScalableVector() &&
19967 "This method is only for scalable vectors of offsets");
19969 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
19970 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
19972 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
19975 /// Check if the value of \p OffsetInBytes can be used as an immediate for
19976 /// the gather load/prefetch and scatter store instructions with vector base and
19977 /// immediate offset addressing mode:
19979 /// [<Zn>.[S|D]{, #<imm>}]
19981 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
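///
/// For example (illustrative): with 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124; an offset of 6 fails the multiple-of-size check and an
/// offset of 132 (4 * 33) fails the range check.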
19982 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
19983 unsigned ScalarSizeInBytes) {
19984 // The immediate is not a multiple of the scalar size.
19985 if (OffsetInBytes % ScalarSizeInBytes)
19986 return false;
19988 // The immediate is out of range.
19989 if (OffsetInBytes / ScalarSizeInBytes > 31)
19990 return false;
19992 return true;
19995 /// Check if the value of \p Offset represents a valid immediate for the SVE
19996 /// gather load/prefetch and scatter store instructions with vector base and
19997 /// immediate offset addressing mode:
19999 /// [<Zn>.[S|D]{, #<imm>}]
20001 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
20002 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
20003 unsigned ScalarSizeInBytes) {
20004 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
20005 return OffsetConst && isValidImmForSVEVecImmAddrMode(
20006 OffsetConst->getZExtValue(), ScalarSizeInBytes);
20009 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
20010 unsigned Opcode,
20011 bool OnlyPackedOffsets = true) {
20012 const SDValue Src = N->getOperand(2);
20013 const EVT SrcVT = Src->getValueType(0);
20014 assert(SrcVT.isScalableVector() &&
20015 "Scatter stores are only possible for SVE vectors");
20017 SDLoc DL(N);
20018 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
20020 // Make sure that source data will fit into an SVE register
20021 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
20022 return SDValue();
20024 // For FPs, ACLE only supports _packed_ single and double precision types.
20025 if (SrcElVT.isFloatingPoint())
20026 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
20027 return SDValue();
20029 // Depending on the addressing mode, this is either a pointer or a vector of
20030 // pointers (that fits into one register)
20031 SDValue Base = N->getOperand(4);
20032 // Depending on the addressing mode, this is either a single offset or a
20033 // vector of offsets (that fits into one register)
20034 SDValue Offset = N->getOperand(5);
20036 // For "scalar + vector of indices", just scale the indices. This only
20037 // applies to non-temporal scatters because there's no instruction that takes
20038 // indices.
20039 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
20040 Offset =
20041 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
20042 Opcode = AArch64ISD::SSTNT1_PRED;
20045 // In the case of non-temporal scatter stores there's only one SVE instruction
20046 // per data-size: "scalar + vector", i.e.
20047 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20048 // Since we do have intrinsics that allow the arguments to be in a different
20049 // order, we may need to swap them to match the spec.
20050 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
20051 std::swap(Base, Offset);
20053 // SST1_IMM requires that the offset is an immediate that is:
20054 // * a multiple of #SizeInBytes,
20055 // * in the range [0, 31 x #SizeInBytes],
20056 // where #SizeInBytes is the size in bytes of the stored items. For
20057 // immediates outside that range and non-immediate scalar offsets use SST1 or
20058 // SST1_UXTW instead.
20059 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
20060 if (!isValidImmForSVEVecImmAddrMode(Offset,
20061 SrcVT.getScalarSizeInBits() / 8)) {
20062 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20063 Opcode = AArch64ISD::SST1_UXTW_PRED;
20064 else
20065 Opcode = AArch64ISD::SST1_PRED;
20067 std::swap(Base, Offset);
20071 auto &TLI = DAG.getTargetLoweringInfo();
20072 if (!TLI.isTypeLegal(Base.getValueType()))
20073 return SDValue();
20075 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
20076 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20077 // nxv2i64. Legalize accordingly.
20078 if (!OnlyPackedOffsets &&
20079 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20080 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20082 if (!TLI.isTypeLegal(Offset.getValueType()))
20083 return SDValue();
20085 // Source value type that is representable in hardware
20086 EVT HwSrcVt = getSVEContainerType(SrcVT);
20088 // Keep the original type of the input data to store - this is needed to be
20089 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
20090 // FP values we want the integer equivalent, so just use HwSrcVt.
20091 SDValue InputVT = DAG.getValueType(SrcVT);
20092 if (SrcVT.isFloatingPoint())
20093 InputVT = DAG.getValueType(HwSrcVt);
20095 SDVTList VTs = DAG.getVTList(MVT::Other);
20096 SDValue SrcNew;
20098 if (Src.getValueType().isFloatingPoint())
20099 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
20100 else
20101 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
20103 SDValue Ops[] = {N->getOperand(0), // Chain
20104 SrcNew,
20105 N->getOperand(3), // Pg
20106 Base,
20107 Offset,
20108 InputVT};
20110 return DAG.getNode(Opcode, DL, VTs, Ops);
20113 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
20114 unsigned Opcode,
20115 bool OnlyPackedOffsets = true) {
20116 const EVT RetVT = N->getValueType(0);
20117 assert(RetVT.isScalableVector() &&
20118 "Gather loads are only possible for SVE vectors");
20120 SDLoc DL(N);
20122 // Make sure that the loaded data will fit into an SVE register
20123 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
20124 return SDValue();
20126 // Depending on the addressing mode, this is either a pointer or a vector of
20127 // pointers (that fits into one register)
20128 SDValue Base = N->getOperand(3);
20129 // Depending on the addressing mode, this is either a single offset or a
20130 // vector of offsets (that fits into one register)
20131 SDValue Offset = N->getOperand(4);
20133 // For "scalar + vector of indices", just scale the indices. This only
20134 // applies to non-temporal gathers because there's no instruction that takes
20135 // indices.
20136 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
20137 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
20138 RetVT.getScalarSizeInBits());
20139 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
20142 // In the case of non-temporal gather loads there's only one SVE instruction
20143 // per data-size: "scalar + vector", i.e.
20144 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
20145 // Since we do have intrinsics that allow the arguments to be in a different
20146 // order, we may need to swap them to match the spec.
20147 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
20148 Offset.getValueType().isVector())
20149 std::swap(Base, Offset);
20151 // GLD{FF}1_IMM requires that the offset is an immediate that is:
20152 // * a multiple of #SizeInBytes,
20153 // * in the range [0, 31 x #SizeInBytes],
20154 // where #SizeInBytes is the size in bytes of the loaded items. For
20155 // immediates outside that range and non-immediate scalar offsets use
20156 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
20157 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
20158 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
20159 if (!isValidImmForSVEVecImmAddrMode(Offset,
20160 RetVT.getScalarSizeInBits() / 8)) {
20161 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
20162 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20163 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
20164 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
20165 else
20166 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
20167 ? AArch64ISD::GLD1_MERGE_ZERO
20168 : AArch64ISD::GLDFF1_MERGE_ZERO;
20170 std::swap(Base, Offset);
20174 auto &TLI = DAG.getTargetLoweringInfo();
20175 if (!TLI.isTypeLegal(Base.getValueType()))
20176 return SDValue();
20178 // Some gather load variants allow unpacked offsets, but only as nxv2i32
20179 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
20180 // nxv2i64. Legalize accordingly.
20181 if (!OnlyPackedOffsets &&
20182 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
20183 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
20185 // Return value type that is representable in hardware
20186 EVT HwRetVt = getSVEContainerType(RetVT);
20188 // Keep the original output value type around - this is needed to be able to
20189 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
20190 // values we want the integer equivalent, so just use HwRetVt.
20191 SDValue OutVT = DAG.getValueType(RetVT);
20192 if (RetVT.isFloatingPoint())
20193 OutVT = DAG.getValueType(HwRetVt);
20195 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
20196 SDValue Ops[] = {N->getOperand(0), // Chain
20197 N->getOperand(2), // Pg
20198 Base, Offset, OutVT};
20200 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
20201 SDValue LoadChain = SDValue(Load.getNode(), 1);
20203 if (RetVT.isInteger() && (RetVT != HwRetVt))
20204 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
20206 // If the original return value was FP, bitcast accordingly. Doing it here
20207 // means that we can avoid adding TableGen patterns for FPs.
20208 if (RetVT.isFloatingPoint())
20209 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
20211 return DAG.getMergeValues({Load, LoadChain}, DL);
20214 static SDValue
20215 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20216 SelectionDAG &DAG) {
20217 SDLoc DL(N);
20218 SDValue Src = N->getOperand(0);
20219 unsigned Opc = Src->getOpcode();
20221 // Sign extend of an unsigned unpack -> signed unpack
20222 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20224 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
20225 : AArch64ISD::SUNPKLO;
20227 // Push the sign extend to the operand of the unpack
20228 // This is necessary where, for example, the operand of the unpack
20229 // is another unpack:
20230 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
20231 // ->
20232 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
20233 // ->
20234 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
20235 SDValue ExtOp = Src->getOperand(0);
20236 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
20237 EVT EltTy = VT.getVectorElementType();
20238 (void)EltTy;
20240 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
20241 "Sign extending from an invalid type");
20243 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20245 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
20246 ExtOp, DAG.getValueType(ExtVT));
20248 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
20251 if (DCI.isBeforeLegalizeOps())
20252 return SDValue();
20254 if (!EnableCombineMGatherIntrinsics)
20255 return SDValue();
20257 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
20258 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
20259 unsigned NewOpc;
20260 unsigned MemVTOpNum = 4;
20261 switch (Opc) {
20262 case AArch64ISD::LD1_MERGE_ZERO:
20263 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
20264 MemVTOpNum = 3;
20265 break;
20266 case AArch64ISD::LDNF1_MERGE_ZERO:
20267 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
20268 MemVTOpNum = 3;
20269 break;
20270 case AArch64ISD::LDFF1_MERGE_ZERO:
20271 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
20272 MemVTOpNum = 3;
20273 break;
20274 case AArch64ISD::GLD1_MERGE_ZERO:
20275 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
20276 break;
20277 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20278 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
20279 break;
20280 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20281 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
20282 break;
20283 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20284 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
20285 break;
20286 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20287 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
20288 break;
20289 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20290 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
20291 break;
20292 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20293 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
20294 break;
20295 case AArch64ISD::GLDFF1_MERGE_ZERO:
20296 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
20297 break;
20298 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20299 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
20300 break;
20301 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20302 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
20303 break;
20304 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20305 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
20306 break;
20307 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20308 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
20309 break;
20310 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20311 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
20312 break;
20313 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20314 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
20315 break;
20316 case AArch64ISD::GLDNT1_MERGE_ZERO:
20317 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
20318 break;
20319 default:
20320 return SDValue();
20323 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20324 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
20326 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
20327 return SDValue();
20329 EVT DstVT = N->getValueType(0);
20330 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
20332 SmallVector<SDValue, 5> Ops;
20333 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
20334 Ops.push_back(Src->getOperand(I));
20336 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
20337 DCI.CombineTo(N, ExtLoad);
20338 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
20340 // Return N so it doesn't get rechecked
20341 return SDValue(N, 0);
20344 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
20345 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
20346 /// != nxv2i32) do not need legalization.
20347 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
20348 const unsigned OffsetPos = 4;
20349 SDValue Offset = N->getOperand(OffsetPos);
20351 // Not an unpacked vector, bail out.
20352 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
20353 return SDValue();
20355 // Extend the unpacked offset vector to 64-bit lanes.
20356 SDLoc DL(N);
20357 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
20358 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
20359 // Replace the offset operand with the 64-bit one.
20360 Ops[OffsetPos] = Offset;
20362 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
20365 /// Combines a node carrying the intrinsic
20366 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
20367 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
20368 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
20369 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
20370 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
20371 unsigned ScalarSizeInBytes) {
20372 const unsigned ImmPos = 4, OffsetPos = 3;
20373 // No need to combine the node if the immediate is valid...
20374 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
20375 return SDValue();
20377 // ...otherwise swap the offset base with the offset...
20378 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
20379 std::swap(Ops[ImmPos], Ops[OffsetPos]);
20380 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
20381 // `aarch64_sve_prfb_gather_uxtw_index`.
20382 SDLoc DL(N);
20383 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
20384 MVT::i64);
20386 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
20389 // Return true if the vector operation can guarantee only the first lane of its
20390 // result contains data, with all bits in other lanes set to zero.
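//
// For example (illustrative): the predicated reductions below, such as
// AArch64ISD::UADDV_PRED, produce their scalar result in lane 0 with the
// remaining lanes zeroed.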
20391 static bool isLanes1toNKnownZero(SDValue Op) {
20392 switch (Op.getOpcode()) {
20393 default:
20394 return false;
20395 case AArch64ISD::ANDV_PRED:
20396 case AArch64ISD::EORV_PRED:
20397 case AArch64ISD::FADDA_PRED:
20398 case AArch64ISD::FADDV_PRED:
20399 case AArch64ISD::FMAXNMV_PRED:
20400 case AArch64ISD::FMAXV_PRED:
20401 case AArch64ISD::FMINNMV_PRED:
20402 case AArch64ISD::FMINV_PRED:
20403 case AArch64ISD::ORV_PRED:
20404 case AArch64ISD::SADDV_PRED:
20405 case AArch64ISD::SMAXV_PRED:
20406 case AArch64ISD::SMINV_PRED:
20407 case AArch64ISD::UADDV_PRED:
20408 case AArch64ISD::UMAXV_PRED:
20409 case AArch64ISD::UMINV_PRED:
20410 return true;
20414 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
20415 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
20416 SDValue InsertVec = N->getOperand(0);
20417 SDValue InsertElt = N->getOperand(1);
20418 SDValue InsertIdx = N->getOperand(2);
20420 // We only care about inserts into the first element...
20421 if (!isNullConstant(InsertIdx))
20422 return SDValue();
20423 // ...of a zero'd vector...
20424 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
20425 return SDValue();
20426 // ...where the inserted data was previously extracted...
20427 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20428 return SDValue();
20430 SDValue ExtractVec = InsertElt.getOperand(0);
20431 SDValue ExtractIdx = InsertElt.getOperand(1);
20433 // ...from the first element of a vector.
20434 if (!isNullConstant(ExtractIdx))
20435 return SDValue();
20437 // If we get here we are effectively trying to zero lanes 1-N of a vector.
20439 // Ensure there's no type conversion going on.
20440 if (N->getValueType(0) != ExtractVec.getValueType())
20441 return SDValue();
20443 if (!isLanes1toNKnownZero(ExtractVec))
20444 return SDValue();
20446 // The explicit zeroing is redundant.
20447 return ExtractVec;
20450 static SDValue
20451 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20452 if (SDValue Res = removeRedundantInsertVectorElt(N))
20453 return Res;
20455 return performPostLD1Combine(N, DCI, true);
20458 static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
20459 EVT Ty = N->getValueType(0);
20460 if (Ty.isInteger())
20461 return SDValue();
20463 EVT IntTy = Ty.changeVectorElementTypeToInteger();
20464 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
20465 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
20466 IntTy.getVectorElementType().getScalarSizeInBits())
20467 return SDValue();
20469 SDLoc DL(N);
20470 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
20471 DL, ExtIntTy);
20472 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
20473 DL, ExtIntTy);
20474 SDValue Idx = N->getOperand(2);
20475 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
20476 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
20477 return DAG.getBitcast(Ty, Trunc);
20480 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
20481 TargetLowering::DAGCombinerInfo &DCI,
20482 const AArch64Subtarget *Subtarget) {
20483 SDValue N0 = N->getOperand(0);
20484 EVT VT = N->getValueType(0);
20486 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
20487 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
20488 return SDValue();
20490 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
20491 // We purposefully don't care about legality of the nodes here as we know
20492 // they can be split down into something legal.
20493 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
20494 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
20495 VT.isFixedLengthVector() &&
20496 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
20497 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20498 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
20499 LN0->getChain(), LN0->getBasePtr(),
20500 N0.getValueType(), LN0->getMemOperand());
20501 DCI.CombineTo(N, ExtLoad);
20502 DCI.CombineTo(
20503 N0.getNode(),
20504 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
20505 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
20506 ExtLoad.getValue(1));
20507 return SDValue(N, 0); // Return N so it doesn't get rechecked!
20510 return SDValue();
20513 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
20514 const AArch64Subtarget *Subtarget) {
20515 EVT VT = N->getValueType(0);
20517 // Don't expand for NEON, SVE2 or SME
20518 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
20519 return SDValue();
20521 SDLoc DL(N);
20523 SDValue Mask = N->getOperand(0);
20524 SDValue In1 = N->getOperand(1);
20525 SDValue In2 = N->getOperand(2);
20527 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
20528 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
20529 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
20530 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
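// If a DUPLANE128 duplicates a bitcast 128-bit subvector inserted into undef,
// push the bitcast past the duplication: insert and duplicate the original
// subvector in its own element type, then bitcast the result.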
20533 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
20534 EVT VT = N->getValueType(0);
20536 SDValue Insert = N->getOperand(0);
20537 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
20538 return SDValue();
20540 if (!Insert.getOperand(0).isUndef())
20541 return SDValue();
20543 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
20544 uint64_t IdxDupLane = N->getConstantOperandVal(1);
20545 if (IdxInsert != 0 || IdxDupLane != 0)
20546 return SDValue();
20548 SDValue Bitcast = Insert.getOperand(1);
20549 if (Bitcast.getOpcode() != ISD::BITCAST)
20550 return SDValue();
20552 SDValue Subvec = Bitcast.getOperand(0);
20553 EVT SubvecVT = Subvec.getValueType();
20554 if (!SubvecVT.is128BitVector())
20555 return SDValue();
20556 EVT NewSubvecVT =
20557 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
20559 SDLoc DL(N);
20560 SDValue NewInsert =
20561 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
20562 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
20563 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
20564 NewInsert, N->getOperand(1));
20565 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
20568 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
20569 DAGCombinerInfo &DCI) const {
20570 SelectionDAG &DAG = DCI.DAG;
20571 switch (N->getOpcode()) {
20572 default:
20573 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
20574 break;
20575 case ISD::ADD:
20576 case ISD::SUB:
20577 return performAddSubCombine(N, DCI, DAG);
20578 case ISD::BUILD_VECTOR:
20579 return performBuildVectorCombine(N, DCI, DAG);
20580 case AArch64ISD::ANDS:
20581 return performFlagSettingCombine(N, DCI, ISD::AND);
20582 case AArch64ISD::ADC:
20583 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
20584 return R;
20585 return foldADCToCINC(N, DAG);
20586 case AArch64ISD::SBC:
20587 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
20588 case AArch64ISD::ADCS:
20589 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
20590 return R;
20591 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
20592 case AArch64ISD::SBCS:
20593 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
20594 return R;
20595 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
20596 case ISD::XOR:
20597 return performXorCombine(N, DAG, DCI, Subtarget);
20598 case ISD::MUL:
20599 return performMulCombine(N, DAG, DCI, Subtarget);
20600 case ISD::SINT_TO_FP:
20601 case ISD::UINT_TO_FP:
20602 return performIntToFpCombine(N, DAG, Subtarget);
20603 case ISD::FP_TO_SINT:
20604 case ISD::FP_TO_UINT:
20605 case ISD::FP_TO_SINT_SAT:
20606 case ISD::FP_TO_UINT_SAT:
20607 return performFpToIntCombine(N, DAG, DCI, Subtarget);
20608 case ISD::FDIV:
20609 return performFDivCombine(N, DAG, DCI, Subtarget);
20610 case ISD::OR:
20611 return performORCombine(N, DCI, Subtarget);
20612 case ISD::AND:
20613 return performANDCombine(N, DCI);
20614 case ISD::INTRINSIC_WO_CHAIN:
20615 return performIntrinsicCombine(N, DCI, Subtarget);
20616 case ISD::ANY_EXTEND:
20617 case ISD::ZERO_EXTEND:
20618 case ISD::SIGN_EXTEND:
20619 return performExtendCombine(N, DCI, DAG);
20620 case ISD::SIGN_EXTEND_INREG:
20621 return performSignExtendInRegCombine(N, DCI, DAG);
20622 case ISD::CONCAT_VECTORS:
20623 return performConcatVectorsCombine(N, DCI, DAG);
20624 case ISD::EXTRACT_SUBVECTOR:
20625 return performExtractSubvectorCombine(N, DCI, DAG);
20626 case ISD::INSERT_SUBVECTOR:
20627 return performInsertSubvectorCombine(N, DCI, DAG);
20628 case ISD::SELECT:
20629 return performSelectCombine(N, DCI);
20630 case ISD::VSELECT:
20631 return performVSelectCombine(N, DCI.DAG);
20632 case ISD::SETCC:
20633 return performSETCCCombine(N, DCI, DAG);
20634 case ISD::LOAD:
20635 return performLOADCombine(N, DCI, DAG, Subtarget);
20636 case ISD::STORE:
20637 return performSTORECombine(N, DCI, DAG, Subtarget);
20638 case ISD::MSTORE:
20639 return performMSTORECombine(N, DCI, DAG, Subtarget);
20640 case ISD::MGATHER:
20641 case ISD::MSCATTER:
20642 return performMaskedGatherScatterCombine(N, DCI, DAG);
20643 case ISD::VECTOR_SPLICE:
20644 return performSVESpliceCombine(N, DAG);
20645 case ISD::FP_EXTEND:
20646 return performFPExtendCombine(N, DAG, DCI, Subtarget);
20647 case AArch64ISD::BRCOND:
20648 return performBRCONDCombine(N, DCI, DAG);
20649 case AArch64ISD::TBNZ:
20650 case AArch64ISD::TBZ:
20651 return performTBZCombine(N, DCI, DAG);
20652 case AArch64ISD::CSEL:
20653 return performCSELCombine(N, DCI, DAG);
20654 case AArch64ISD::DUP:
20655 return performDUPCombine(N, DCI);
20656 case AArch64ISD::DUPLANE128:
20657 return performDupLane128Combine(N, DAG);
20658 case AArch64ISD::NVCAST:
20659 return performNVCASTCombine(N);
20660 case AArch64ISD::SPLICE:
20661 return performSpliceCombine(N, DAG);
20662 case AArch64ISD::UUNPKLO:
20663 case AArch64ISD::UUNPKHI:
20664 return performUnpackCombine(N, DAG, Subtarget);
20665 case AArch64ISD::UZP1:
20666 return performUzpCombine(N, DAG);
20667 case AArch64ISD::SETCC_MERGE_ZERO:
20668 return performSetccMergeZeroCombine(N, DCI);
20669 case AArch64ISD::GLD1_MERGE_ZERO:
20670 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20671 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20672 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20673 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20674 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20675 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20676 case AArch64ISD::GLD1S_MERGE_ZERO:
20677 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
20678 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
20679 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
20680 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
20681 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
20682 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
20683 return performGLD1Combine(N, DAG);
20684 case AArch64ISD::VASHR:
20685 case AArch64ISD::VLSHR:
20686 return performVectorShiftCombine(N, *this, DCI);
20687 case AArch64ISD::SUNPKLO:
20688 return performSunpkloCombine(N, DAG);
20689 case AArch64ISD::BSP:
20690 return performBSPExpandForSVE(N, DAG, Subtarget);
20691 case ISD::INSERT_VECTOR_ELT:
20692 return performInsertVectorEltCombine(N, DCI);
20693 case ISD::EXTRACT_VECTOR_ELT:
20694 return performExtractVectorEltCombine(N, DCI, Subtarget);
20695 case ISD::VECREDUCE_ADD:
20696 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
20697 case AArch64ISD::UADDV:
20698 return performUADDVCombine(N, DAG);
20699 case AArch64ISD::SMULL:
20700 case AArch64ISD::UMULL:
20701 case AArch64ISD::PMULL:
20702 return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
20703 case ISD::INTRINSIC_VOID:
20704 case ISD::INTRINSIC_W_CHAIN:
20705 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
20706 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
20707 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
20708 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
20709 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
20710 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
20711 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
20712 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
20713 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
20714 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
20715 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
20716 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
20717 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
20718 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
20719 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
20720 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
20721 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
20722 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
20723 case Intrinsic::aarch64_neon_ld2:
20724 case Intrinsic::aarch64_neon_ld3:
20725 case Intrinsic::aarch64_neon_ld4:
20726 case Intrinsic::aarch64_neon_ld1x2:
20727 case Intrinsic::aarch64_neon_ld1x3:
20728 case Intrinsic::aarch64_neon_ld1x4:
20729 case Intrinsic::aarch64_neon_ld2lane:
20730 case Intrinsic::aarch64_neon_ld3lane:
20731 case Intrinsic::aarch64_neon_ld4lane:
20732 case Intrinsic::aarch64_neon_ld2r:
20733 case Intrinsic::aarch64_neon_ld3r:
20734 case Intrinsic::aarch64_neon_ld4r:
20735 case Intrinsic::aarch64_neon_st2:
20736 case Intrinsic::aarch64_neon_st3:
20737 case Intrinsic::aarch64_neon_st4:
20738 case Intrinsic::aarch64_neon_st1x2:
20739 case Intrinsic::aarch64_neon_st1x3:
20740 case Intrinsic::aarch64_neon_st1x4:
20741 case Intrinsic::aarch64_neon_st2lane:
20742 case Intrinsic::aarch64_neon_st3lane:
20743 case Intrinsic::aarch64_neon_st4lane:
20744 return performNEONPostLDSTCombine(N, DCI, DAG);
20745 case Intrinsic::aarch64_sve_ldnt1:
20746 return performLDNT1Combine(N, DAG);
20747 case Intrinsic::aarch64_sve_ld1rq:
20748 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
20749 case Intrinsic::aarch64_sve_ld1ro:
20750 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
20751 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
20752 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20753 case Intrinsic::aarch64_sve_ldnt1_gather:
20754 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20755 case Intrinsic::aarch64_sve_ldnt1_gather_index:
20756 return performGatherLoadCombine(N, DAG,
20757 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
20758 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
20759 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
20760 case Intrinsic::aarch64_sve_ld1:
20761 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
20762 case Intrinsic::aarch64_sve_ldnf1:
20763 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
20764 case Intrinsic::aarch64_sve_ldff1:
20765 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
20766 case Intrinsic::aarch64_sve_st1:
20767 return performST1Combine(N, DAG);
20768 case Intrinsic::aarch64_sve_stnt1:
20769 return performSTNT1Combine(N, DAG);
20770 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
20771 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20772 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
20773 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20774 case Intrinsic::aarch64_sve_stnt1_scatter:
20775 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
20776 case Intrinsic::aarch64_sve_stnt1_scatter_index:
20777 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
20778 case Intrinsic::aarch64_sve_ld1_gather:
20779 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
20780 case Intrinsic::aarch64_sve_ld1_gather_index:
20781 return performGatherLoadCombine(N, DAG,
20782 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
20783 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
20784 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
20785 /*OnlyPackedOffsets=*/false);
20786 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
20787 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
20788 /*OnlyPackedOffsets=*/false);
20789 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
20790 return performGatherLoadCombine(N, DAG,
20791 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
20792 /*OnlyPackedOffsets=*/false);
20793 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
20794 return performGatherLoadCombine(N, DAG,
20795 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
20796 /*OnlyPackedOffsets=*/false);
20797 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
20798 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
20799 case Intrinsic::aarch64_sve_ldff1_gather:
20800 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
20801 case Intrinsic::aarch64_sve_ldff1_gather_index:
20802 return performGatherLoadCombine(N, DAG,
20803 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
20804 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
20805 return performGatherLoadCombine(N, DAG,
20806 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
20807 /*OnlyPackedOffsets=*/false);
20808 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
20809 return performGatherLoadCombine(N, DAG,
20810 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
20811 /*OnlyPackedOffsets=*/false);
20812 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
20813 return performGatherLoadCombine(N, DAG,
20814 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
20815 /*OnlyPackedOffsets=*/false);
20816 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
20817 return performGatherLoadCombine(N, DAG,
20818 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
20819 /*OnlyPackedOffsets=*/false);
20820 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
20821 return performGatherLoadCombine(N, DAG,
20822 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
20823 case Intrinsic::aarch64_sve_st1_scatter:
20824 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
20825 case Intrinsic::aarch64_sve_st1_scatter_index:
20826 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
20827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
20828 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
20829 /*OnlyPackedOffsets=*/false);
20830 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
20831 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
20832 /*OnlyPackedOffsets=*/false);
20833 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
20834 return performScatterStoreCombine(N, DAG,
20835 AArch64ISD::SST1_SXTW_SCALED_PRED,
20836 /*OnlyPackedOffsets=*/false);
20837 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
20838 return performScatterStoreCombine(N, DAG,
20839 AArch64ISD::SST1_UXTW_SCALED_PRED,
20840 /*OnlyPackedOffsets=*/false);
20841 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
20842 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
20843 case Intrinsic::aarch64_rndr:
20844 case Intrinsic::aarch64_rndrrs: {
20845 unsigned IntrinsicID =
20846 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
20847 auto Register =
20848 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
20849 : AArch64SysReg::RNDRRS);
20850 SDLoc DL(N);
20851 SDValue A = DAG.getNode(
20852 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
20853 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
20854 SDValue B = DAG.getNode(
20855 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
20856 DAG.getConstant(0, DL, MVT::i32),
20857 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
20858 return DAG.getMergeValues(
20859 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
20861 default:
20862 break;
20864 break;
20865 case ISD::GlobalAddress:
20866 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
20868 return SDValue();
20871 // Check whether the return value is used only as a return value, as otherwise
20872 // we can't perform a tail-call. In particular, we need to check for
20873 // target ISD nodes that are returns and any other "odd" constructs
20874 // that the generic analysis code won't necessarily catch.
20875 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
20876 SDValue &Chain) const {
20877 if (N->getNumValues() != 1)
20878 return false;
20879 if (!N->hasNUsesOfValue(1, 0))
20880 return false;
20882 SDValue TCChain = Chain;
20883 SDNode *Copy = *N->use_begin();
20884 if (Copy->getOpcode() == ISD::CopyToReg) {
20885 // If the copy has a glue operand, we conservatively assume it isn't safe to
20886 // perform a tail call.
20887 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
20888 MVT::Glue)
20889 return false;
20890 TCChain = Copy->getOperand(0);
20891 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
20892 return false;
20894 bool HasRet = false;
20895 for (SDNode *Node : Copy->uses()) {
20896 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
20897 return false;
20898 HasRet = true;
20901 if (!HasRet)
20902 return false;
20904 Chain = TCChain;
20905 return true;
20908 // Return whether an instruction can potentially be optimized to a tail
20909 // call. This will cause the optimizers to attempt to move, or duplicate,
20910 // return instructions to help enable tail call optimizations for this
20911 // instruction.
20912 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
20913 return CI->isTailCall();
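// Decompose Op, an ADD or SUB of a pointer and a constant, into the Base and
// Offset used by pre/post-indexed addressing, and record in IsInc whether the
// offset is added or subtracted.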
20916 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
20917 SDValue &Offset,
20918 ISD::MemIndexedMode &AM,
20919 bool &IsInc,
20920 SelectionDAG &DAG) const {
20921 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
20922 return false;
20924 Base = Op->getOperand(0);
20925 // All of the indexed addressing mode instructions take a signed
20926 // 9-bit immediate offset.
20927 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
20928 int64_t RHSC = RHS->getSExtValue();
20929 if (Op->getOpcode() == ISD::SUB)
20930 RHSC = -(uint64_t)RHSC;
20931 if (!isInt<9>(RHSC))
20932 return false;
20933 IsInc = (Op->getOpcode() == ISD::ADD);
20934 Offset = Op->getOperand(1);
20935 return true;
20937 return false;
20940 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
20941 SDValue &Offset,
20942 ISD::MemIndexedMode &AM,
20943 SelectionDAG &DAG) const {
20944 EVT VT;
20945 SDValue Ptr;
20946 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20947 VT = LD->getMemoryVT();
20948 Ptr = LD->getBasePtr();
20949 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20950 VT = ST->getMemoryVT();
20951 Ptr = ST->getBasePtr();
20952 } else
20953 return false;
20955 bool IsInc;
20956 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
20957 return false;
20958 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
20959 return true;
20962 bool AArch64TargetLowering::getPostIndexedAddressParts(
20963 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
20964 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
20965 EVT VT;
20966 SDValue Ptr;
20967 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20968 VT = LD->getMemoryVT();
20969 Ptr = LD->getBasePtr();
20970 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20971 VT = ST->getMemoryVT();
20972 Ptr = ST->getBasePtr();
20973 } else
20974 return false;
20976 bool IsInc;
20977 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
20978 return false;
20979 // Post-indexing updates the base, so it's not a valid transform
20980 // if that's not the same as the load's pointer.
20981 if (Ptr != Base)
20982 return false;
20983 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
20984 return true;
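// Replace the results of an illegal BITCAST: scalable FP-to-int casts of
// unpacked types go via the wider SVE container followed by a truncate, while
// f16/bf16-to-i16 casts go through an f32 subregister insert and an i32
// bitcast.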
20987 void AArch64TargetLowering::ReplaceBITCASTResults(
20988 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
20989 SDLoc DL(N);
20990 SDValue Op = N->getOperand(0);
20991 EVT VT = N->getValueType(0);
20992 EVT SrcVT = Op.getValueType();
20994 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
20995 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
20996 "Expected fp->int bitcast!");
20998 // Bitcasting between unpacked vector types of different element counts is
20999 // not a NOP because the live elements are laid out differently.
21000 // 01234567
21001 // e.g. nxv2i32 = XX??XX??
21002 // nxv4f16 = X?X?X?X?
21003 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
21004 return;
21006 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
21007 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
21008 return;
21011 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
21012 return;
21014 Op = SDValue(
21015 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
21016 DAG.getUNDEF(MVT::i32), Op,
21017 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
21018 0);
21019 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
21020 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
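// Replace a 256-bit (F)ADD of a value with a <1,0,3,2,...> shuffle of itself
// by an ADDP of the value's two 128-bit halves, then shuffle the pairwise
// sums back into the lane order the original node produced.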
21023 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
21024 SelectionDAG &DAG,
21025 const AArch64Subtarget *Subtarget) {
21026 EVT VT = N->getValueType(0);
21027 if (!VT.is256BitVector() ||
21028 (VT.getScalarType().isFloatingPoint() &&
21029 !N->getFlags().hasAllowReassociation()) ||
21030 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
21031 return;
21033 SDValue X = N->getOperand(0);
21034 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
21035 if (!Shuf) {
21036 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
21037 X = N->getOperand(1);
21038 if (!Shuf)
21039 return;
21042 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
21043 return;
21045 // Check the mask is 1,0,3,2,5,4,...
21046 ArrayRef<int> Mask = Shuf->getMask();
21047 for (int I = 0, E = Mask.size(); I < E; I++)
21048 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
21049 return;
21051 SDLoc DL(N);
21052 auto LoHi = DAG.SplitVector(X, DL);
21053 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
21054 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
21055 LoHi.first, LoHi.second);
21057 // Shuffle the elements back into order.
21058 SmallVector<int> NMask;
21059 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
21060 NMask.push_back(I);
21061 NMask.push_back(I);
21063 Results.push_back(
21064 DAG.getVectorShuffle(VT, DL,
21065 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
21066 DAG.getUNDEF(LoHi.first.getValueType())),
21067 DAG.getUNDEF(VT), NMask));
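// Legalize a reduction whose operand type is too wide by splitting the
// operand in half, combining the halves element-wise with InterOp, and then
// reducing across the narrower vector with AcrossOp.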
21070 static void ReplaceReductionResults(SDNode *N,
21071 SmallVectorImpl<SDValue> &Results,
21072 SelectionDAG &DAG, unsigned InterOp,
21073 unsigned AcrossOp) {
21074 EVT LoVT, HiVT;
21075 SDValue Lo, Hi;
21076 SDLoc dl(N);
21077 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
21078 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
21079 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
21080 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
21081 Results.push_back(SplitVal);
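// Split an i128 value into its low and high i64 halves, returned as {Lo, Hi}.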
21084 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
21085 SDLoc DL(N);
21086 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
21087 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
21088 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
21089 DAG.getConstant(64, DL, MVT::i64)));
21090 return std::make_pair(Lo, Hi);
21093 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
21094 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21095 SDValue In = N->getOperand(0);
21096 EVT InVT = In.getValueType();
21098 // Common code will handle these just fine.
21099 if (!InVT.isScalableVector() || !InVT.isInteger())
21100 return;
21102 SDLoc DL(N);
21103 EVT VT = N->getValueType(0);
21105 // The following checks bail if this is not a halving operation.
21107 ElementCount ResEC = VT.getVectorElementCount();
21109 if (InVT.getVectorElementCount() != (ResEC * 2))
21110 return;
21112 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
21113 if (!CIndex)
21114 return;
21116 unsigned Index = CIndex->getZExtValue();
21117 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
21118 return;
21120 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
21121 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
21123 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
21124 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
21127 // Create an even/odd pair of X registers holding integer value V.
21128 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
21129 SDLoc dl(V.getNode());
21130 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
21131 SDValue VHi = DAG.getAnyExtOrTrunc(
21132 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
21133 dl, MVT::i64);
21134 if (DAG.getDataLayout().isBigEndian())
21135 std::swap(VLo, VHi);
21136 SDValue RegClass =
21137 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
21138 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
21139 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
21140 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
21141 return SDValue(
21142 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
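// Lower a 128-bit ATOMIC_CMP_SWAP either to a CASP instruction (when LSE or
// outlined atomics are available) or to a CMP_SWAP_128* pseudo that is later
// expanded to an exclusive load/store loop, choosing the variant that matches
// the operation's memory ordering.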
21145 static void ReplaceCMP_SWAP_128Results(SDNode *N,
21146 SmallVectorImpl<SDValue> &Results,
21147 SelectionDAG &DAG,
21148 const AArch64Subtarget *Subtarget) {
21149 assert(N->getValueType(0) == MVT::i128 &&
21150 "AtomicCmpSwap on types less than 128 should be legal");
21152 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
21153 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
21154 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
21155 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
21156 SDValue Ops[] = {
21157 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
21158 createGPRPairNode(DAG, N->getOperand(3)), // Store value
21159 N->getOperand(1), // Ptr
21160 N->getOperand(0), // Chain in
21161 };
21163 unsigned Opcode;
21164 switch (MemOp->getMergedOrdering()) {
21165 case AtomicOrdering::Monotonic:
21166 Opcode = AArch64::CASPX;
21167 break;
21168 case AtomicOrdering::Acquire:
21169 Opcode = AArch64::CASPAX;
21170 break;
21171 case AtomicOrdering::Release:
21172 Opcode = AArch64::CASPLX;
21173 break;
21174 case AtomicOrdering::AcquireRelease:
21175 case AtomicOrdering::SequentiallyConsistent:
21176 Opcode = AArch64::CASPALX;
21177 break;
21178 default:
21179 llvm_unreachable("Unexpected ordering!");
21182 MachineSDNode *CmpSwap = DAG.getMachineNode(
21183 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
21184 DAG.setNodeMemRefs(CmpSwap, {MemOp});
21186 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
21187 if (DAG.getDataLayout().isBigEndian())
21188 std::swap(SubReg1, SubReg2);
21189 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
21190 SDValue(CmpSwap, 0));
21191 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
21192 SDValue(CmpSwap, 0));
21193 Results.push_back(
21194 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
21195 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
21196 return;
21199 unsigned Opcode;
21200 switch (MemOp->getMergedOrdering()) {
21201 case AtomicOrdering::Monotonic:
21202 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
21203 break;
21204 case AtomicOrdering::Acquire:
21205 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
21206 break;
21207 case AtomicOrdering::Release:
21208 Opcode = AArch64::CMP_SWAP_128_RELEASE;
21209 break;
21210 case AtomicOrdering::AcquireRelease:
21211 case AtomicOrdering::SequentiallyConsistent:
21212 Opcode = AArch64::CMP_SWAP_128;
21213 break;
21214 default:
21215 llvm_unreachable("Unexpected ordering!");
21218 auto Desired = splitInt128(N->getOperand(2), DAG);
21219 auto New = splitInt128(N->getOperand(3), DAG);
21220 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
21221 New.first, New.second, N->getOperand(0)};
21222 SDNode *CmpSwap = DAG.getMachineNode(
21223 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
21224 Ops);
21225 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
21227 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
21228 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
21229 Results.push_back(SDValue(CmpSwap, 3));
21232 void AArch64TargetLowering::ReplaceNodeResults(
21233 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
21234 switch (N->getOpcode()) {
21235 default:
21236 llvm_unreachable("Don't know how to custom expand this");
21237 case ISD::BITCAST:
21238 ReplaceBITCASTResults(N, Results, DAG);
21239 return;
21240 case ISD::VECREDUCE_ADD:
21241 case ISD::VECREDUCE_SMAX:
21242 case ISD::VECREDUCE_SMIN:
21243 case ISD::VECREDUCE_UMAX:
21244 case ISD::VECREDUCE_UMIN:
21245 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
21246 return;
21247 case ISD::ADD:
21248 case ISD::FADD:
21249 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
21250 return;
21252 case ISD::CTPOP:
21253 case ISD::PARITY:
21254 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
21255 Results.push_back(Result);
21256 return;
21257 case AArch64ISD::SADDV:
21258 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
21259 return;
21260 case AArch64ISD::UADDV:
21261 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
21262 return;
21263 case AArch64ISD::SMINV:
21264 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
21265 return;
21266 case AArch64ISD::UMINV:
21267 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
21268 return;
21269 case AArch64ISD::SMAXV:
21270 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
21271 return;
21272 case AArch64ISD::UMAXV:
21273 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
21274 return;
21275 case ISD::FP_TO_UINT:
21276 case ISD::FP_TO_SINT:
21277 case ISD::STRICT_FP_TO_SINT:
21278 case ISD::STRICT_FP_TO_UINT:
21279 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
21280 // Let normal code take care of it by not adding anything to Results.
21281 return;
21282 case ISD::ATOMIC_CMP_SWAP:
21283 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
21284 return;
21285 case ISD::ATOMIC_LOAD:
21286 case ISD::LOAD: {
21287 MemSDNode *LoadNode = cast<MemSDNode>(N);
21288 EVT MemVT = LoadNode->getMemoryVT();
21289 // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
21290 // targets.
21291 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
21292 MemVT.getSizeInBits() == 256u &&
21293 (MemVT.getScalarSizeInBits() == 8u ||
21294 MemVT.getScalarSizeInBits() == 16u ||
21295 MemVT.getScalarSizeInBits() == 32u ||
21296 MemVT.getScalarSizeInBits() == 64u)) {
21298 SDValue Result = DAG.getMemIntrinsicNode(
21299 AArch64ISD::LDNP, SDLoc(N),
21300 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
21301 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
21302 MVT::Other}),
21303 {LoadNode->getChain(), LoadNode->getBasePtr()},
21304 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
21306 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
21307 Result.getValue(0), Result.getValue(1));
21308 Results.append({Pair, Result.getValue(2) /* Chain */});
21309 return;
21312 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
21313 LoadNode->getMemoryVT() != MVT::i128) {
21314 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store
21315 // optimizer.
21316 return;
21319 if (SDValue(N, 0).getValueType() == MVT::i128) {
21320 SDValue Result = DAG.getMemIntrinsicNode(
21321 AArch64ISD::LDP, SDLoc(N),
21322 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
21323 {LoadNode->getChain(), LoadNode->getBasePtr()},
21324 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
21326 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
21327 Result.getValue(0), Result.getValue(1));
21328 Results.append({Pair, Result.getValue(2) /* Chain */});
21330 return;
21332 case ISD::EXTRACT_SUBVECTOR:
21333 ReplaceExtractSubVectorResults(N, Results, DAG);
21334 return;
21335 case ISD::INSERT_SUBVECTOR:
21336 case ISD::CONCAT_VECTORS:
21337 // Custom lowering has been requested for INSERT_SUBVECTOR and
21338 // CONCAT_VECTORS -- but delegate to common code for result type
21339 // legalisation
21340 return;
21341 case ISD::INTRINSIC_WO_CHAIN: {
21342 EVT VT = N->getValueType(0);
21343 assert((VT == MVT::i8 || VT == MVT::i16) &&
21344 "custom lowering for unexpected type");
21346 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
21347 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
21348 switch (IntID) {
21349 default:
21350 return;
21351 case Intrinsic::aarch64_sve_clasta_n: {
21352 SDLoc DL(N);
21353 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
21354 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
21355 N->getOperand(1), Op2, N->getOperand(3));
21356 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21357 return;
21359 case Intrinsic::aarch64_sve_clastb_n: {
21360 SDLoc DL(N);
21361 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
21362 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
21363 N->getOperand(1), Op2, N->getOperand(3));
21364 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21365 return;
21367 case Intrinsic::aarch64_sve_lasta: {
21368 SDLoc DL(N);
21369 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
21370 N->getOperand(1), N->getOperand(2));
21371 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21372 return;
21374 case Intrinsic::aarch64_sve_lastb: {
21375 SDLoc DL(N);
21376 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
21377 N->getOperand(1), N->getOperand(2));
21378 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
21379 return;
21386 bool AArch64TargetLowering::useLoadStackGuardNode() const {
21387 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
21388 return TargetLowering::useLoadStackGuardNode();
21389 return true;
21392 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
21393 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
21394 // reciprocal if there are three or more FDIVs.
21395 return 3;
21398 TargetLoweringBase::LegalizeTypeAction
21399 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
21400 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
21401 // v4i16 and v2i32, rather than promoting them.
21402 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
21403 VT == MVT::v1f32)
21404 return TypeWidenVector;
21406 return TargetLoweringBase::getPreferredVectorAction(VT);
21409 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
21410 // provided the address is 16-byte aligned.
21411 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
21412 if (!Subtarget->hasLSE2())
21413 return false;
21415 if (auto LI = dyn_cast<LoadInst>(I))
21416 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
21417 LI->getAlign() >= Align(16);
21419 if (auto SI = dyn_cast<StoreInst>(I))
21420 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
21421 SI->getAlign() >= Align(16);
21423 return false;
21426 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
21427 const Instruction *I) const {
21428 return isOpSuitableForLDPSTP(I);
21431 // Loads and stores less than 128 bits are already atomic; ones above that
21432 // are doomed anyway, so defer to the default libcall and blame the OS when
21433 // things go wrong.
21434 TargetLoweringBase::AtomicExpansionKind
21435 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21436 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21437 if (Size != 128 || isOpSuitableForLDPSTP(SI))
21438 return AtomicExpansionKind::None;
21439 return AtomicExpansionKind::Expand;
21442 // Loads and stores less than 128 bits are already atomic; ones above that
21443 // are doomed anyway, so defer to the default libcall and blame the OS when
21444 // things go wrong.
21445 TargetLowering::AtomicExpansionKind
21446 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21447 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21449 if (Size != 128 || isOpSuitableForLDPSTP(LI))
21450 return AtomicExpansionKind::None;
21452 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21453 // implement atomicrmw without spilling. If the target address is also on the
21454 // stack and close enough to the spill slot, this can lead to a situation
21455 // where the monitor always gets cleared and the atomic operation can never
21456 // succeed. So at -O0 lower this operation to a CAS loop.
21457 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21458 return AtomicExpansionKind::CmpXChg;
21460 return AtomicExpansionKind::LLSC;
21463 // For the real atomic operations, we have ldxr/stxr up to 128 bits.
21464 TargetLowering::AtomicExpansionKind
21465 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21466 if (AI->isFloatingPointOperation())
21467 return AtomicExpansionKind::CmpXChg;
21469 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21470 if (Size > 128) return AtomicExpansionKind::None;
21472 // Nand is not supported in LSE.
21473 // Leave 128 bits to LLSC or CmpXChg.
21474 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
21475 if (Subtarget->hasLSE())
21476 return AtomicExpansionKind::None;
21477 if (Subtarget->outlineAtomics()) {
21478 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
21479 // Don't outline them unless
21480 // (1) high level <atomic> support approved:
21481 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
21482 // (2) low level libgcc and compiler-rt support implemented by:
21483 // min/max outline atomics helpers
21484 if (AI->getOperation() != AtomicRMWInst::Min &&
21485 AI->getOperation() != AtomicRMWInst::Max &&
21486 AI->getOperation() != AtomicRMWInst::UMin &&
21487 AI->getOperation() != AtomicRMWInst::UMax) {
21488 return AtomicExpansionKind::None;
21493 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21494 // implement atomicrmw without spilling. If the target address is also on the
21495 // stack and close enough to the spill slot, this can lead to a situation
21496 // where the monitor always gets cleared and the atomic operation can never
21497 // succeed. So at -O0 lower this operation to a CAS loop.
21498 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21499 return AtomicExpansionKind::CmpXChg;
21501 return AtomicExpansionKind::LLSC;
21504 TargetLowering::AtomicExpansionKind
21505 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
21506 AtomicCmpXchgInst *AI) const {
21507 // If subtarget has LSE, leave cmpxchg intact for codegen.
21508 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
21509 return AtomicExpansionKind::None;
21510 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21511 // implement cmpxchg without spilling. If the address being exchanged is also
21512 // on the stack and close enough to the spill slot, this can lead to a
21513 // situation where the monitor always gets cleared and the atomic operation
21514 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21515 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
21516 return AtomicExpansionKind::None;
21518 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
21519 // it.
21520 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
21521 if (Size > 64)
21522 return AtomicExpansionKind::None;
21524 return AtomicExpansionKind::LLSC;
21527 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
21528 Type *ValueTy, Value *Addr,
21529 AtomicOrdering Ord) const {
21530 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21531 bool IsAcquire = isAcquireOrStronger(Ord);
21533 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
21534 // intrinsic must return {i64, i64} and we have to recombine them into a
21535 // single i128 here.
21536 if (ValueTy->getPrimitiveSizeInBits() == 128) {
21537 Intrinsic::ID Int =
21538 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
21539 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
21541 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
21542 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
21544 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21545 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21546 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21547 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21548 return Builder.CreateOr(
21549 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
21552 Type *Tys[] = { Addr->getType() };
21553 Intrinsic::ID Int =
21554 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
21555 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
21557 const DataLayout &DL = M->getDataLayout();
21558 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
21559 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
21560 CI->addParamAttr(
21561 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
21562 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
21564 return Builder.CreateBitCast(Trunc, ValueTy);
21567 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
21568 IRBuilderBase &Builder) const {
21569 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21570 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
21573 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
21574 Value *Val, Value *Addr,
21575 AtomicOrdering Ord) const {
21576 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21577 bool IsRelease = isReleaseOrStronger(Ord);
21579 // Since the intrinsics must have legal type, the i128 intrinsics take two
21580 // parameters: "i64, i64". We must marshal Val into the appropriate form
21581 // before the call.
21582 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
21583 Intrinsic::ID Int =
21584 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
21585 Function *Stxr = Intrinsic::getDeclaration(M, Int);
21586 Type *Int64Ty = Type::getInt64Ty(M->getContext());
21588 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
21589 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
21590 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
21591 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
21594 Intrinsic::ID Int =
21595 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
21596 Type *Tys[] = { Addr->getType() };
21597 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
21599 const DataLayout &DL = M->getDataLayout();
21600 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
21601 Val = Builder.CreateBitCast(Val, IntValTy);
21603 CallInst *CI = Builder.CreateCall(
21604 Stxr, {Builder.CreateZExtOrBitCast(
21605 Val, Stxr->getFunctionType()->getParamType(0)),
21606 Addr});
21607 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
21608 Attribute::ElementType, Val->getType()));
21609 return CI;
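// An argument needs consecutive registers if it is a scalable type wider than
// 128 bits, or an array type whose flattened members all share the same value
// type.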
21612 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
21613 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21614 const DataLayout &DL) const {
21615 if (!Ty->isArrayTy()) {
21616 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
21617 return TySize.isScalable() && TySize.getKnownMinSize() > 128;
21620 // All non-aggregate members of the type must have the same type.
21621 SmallVector<EVT> ValueVTs;
21622 ComputeValueVTs(*this, DL, Ty, ValueVTs);
21623 return all_equal(ValueVTs);
21626 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
21627 EVT) const {
21628 return false;
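// Return a pointer at the given byte offset from the thread pointer
// (llvm.thread_pointer); used to address platform-reserved TLS slots.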
21631 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
21632 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
21633 Function *ThreadPointerFunc =
21634 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
21635 return IRB.CreatePointerCast(
21636 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
21637 Offset),
21638 IRB.getInt8PtrTy()->getPointerTo(0));
21641 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
21642 // Android provides a fixed TLS slot for the stack cookie. See the definition
21643 // of TLS_SLOT_STACK_GUARD in
21644 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
21645 if (Subtarget->isTargetAndroid())
21646 return UseTlsOffset(IRB, 0x28);
21648 // Fuchsia is similar.
21649 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
21650 if (Subtarget->isTargetFuchsia())
21651 return UseTlsOffset(IRB, -0x10);
21653 return TargetLowering::getIRStackGuard(IRB);
21656 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
21657 // MSVC CRT provides functionalities for stack protection.
21658 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
21659 // MSVC CRT has a global variable holding security cookie.
21660 M.getOrInsertGlobal("__security_cookie",
21661 Type::getInt8PtrTy(M.getContext()));
21663 // MSVC CRT has a function to validate security cookie.
21664 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
21665 Subtarget->getSecurityCheckCookieName(),
21666 Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()));
21667 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
21668 F->setCallingConv(CallingConv::Win64);
21669 F->addParamAttr(0, Attribute::AttrKind::InReg);
21671 return;
21673 TargetLowering::insertSSPDeclarations(M);
21676 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
21677 // MSVC CRT has a global variable holding security cookie.
21678 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21679 return M.getGlobalVariable("__security_cookie");
21680 return TargetLowering::getSDagStackGuard(M);
21683 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
21684 // MSVC CRT has a function to validate security cookie.
21685 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
21686 return M.getFunction(Subtarget->getSecurityCheckCookieName());
21687 return TargetLowering::getSSPStackGuardCheck(M);
21690 Value *
21691 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
21692 // Android provides a fixed TLS slot for the SafeStack pointer. See the
21693 // definition of TLS_SLOT_SAFESTACK in
21694 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
21695 if (Subtarget->isTargetAndroid())
21696 return UseTlsOffset(IRB, 0x48);
21698 // Fuchsia is similar.
21699 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
21700 if (Subtarget->isTargetFuchsia())
21701 return UseTlsOffset(IRB, -0x8);
21703 return TargetLowering::getSafeStackPointerLocation(IRB);
21706 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
21707 const Instruction &AndI) const {
21708 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
21709 // this is likely to fold the and/cmp/br into a single tbz instruction. It
21710 // may be beneficial to sink in other cases, but we would have to check that
21711 // the cmp would not get folded into the br to form a cbz for these to be
21712 // beneficial.
21713 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
21714 if (!Mask)
21715 return false;
21716 return Mask->getValue().isPowerOf2();
21719 bool AArch64TargetLowering::
21720 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
21721 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
21722 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
21723 SelectionDAG &DAG) const {
21724 // Does baseline recommend not to perform the fold by default?
21725 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
21726 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
21727 return false;
21728 // Else, if this is a vector shift, prefer 'shl'.
21729 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
21732 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
21733 SDNode *N) const {
21734 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
21735 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
21736 return false;
21737 return true;
21740 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21741 // Update IsSplitCSR in AArch64FunctionInfo.
21742 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
21743 AFI->setIsSplitCSR(true);
21746 void AArch64TargetLowering::insertCopiesSplitCSR(
21747 MachineBasicBlock *Entry,
21748 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21749 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
21750 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21751 if (!IStart)
21752 return;
21754 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21755 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21756 MachineBasicBlock::iterator MBBI = Entry->begin();
21757 for (const MCPhysReg *I = IStart; *I; ++I) {
21758 const TargetRegisterClass *RC = nullptr;
21759 if (AArch64::GPR64RegClass.contains(*I))
21760 RC = &AArch64::GPR64RegClass;
21761 else if (AArch64::FPR64RegClass.contains(*I))
21762 RC = &AArch64::FPR64RegClass;
21763 else
21764 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21766 Register NewVR = MRI->createVirtualRegister(RC);
21767 // Create copy from CSR to a virtual register.
21768 // FIXME: this currently does not emit CFI pseudo-instructions; it works
21769 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21770 // nounwind. If we want to generalize this later, we may need to emit
21771 // CFI pseudo-instructions.
21772 assert(Entry->getParent()->getFunction().hasFnAttribute(
21773 Attribute::NoUnwind) &&
21774 "Function should be nounwind in insertCopiesSplitCSR!");
21775 Entry->addLiveIn(*I);
21776 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21777 .addReg(*I);
21779 // Insert the copy-back instructions right before the terminator.
21780 for (auto *Exit : Exits)
21781 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21782 TII->get(TargetOpcode::COPY), *I)
21783 .addReg(NewVR);
21787 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
21788 // Integer division on AArch64 is expensive. However, when aggressively
21789 // optimizing for code size, we prefer to use a div instruction, as it is
21790 // usually smaller than the alternative sequence.
21791 // The exception to this is vector division. Since AArch64 doesn't have vector
21792 // integer division, leaving the division as-is is a loss even in terms of
21793 // size, because it will have to be scalarized, while the alternative code
21794 // sequence can be performed in vector form.
21795 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
21796 return OptSize && !VT.isVector();
21799 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
21800 // We want inc-of-add for scalars and sub-of-not for vectors.
21801 return VT.isScalarInteger();
21804 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
21805 EVT VT) const {
21806 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
21807 // legalize.
21808 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
21809 return false;
21810 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
21813 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
21814 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
21817 unsigned
21818 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
21819 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
21820 return getPointerTy(DL).getSizeInBits();
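// Otherwise the AAPCS64 va_list is a struct of three pointers and two 32-bit
// offsets.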
21822 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
21825 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
21826 MachineFrameInfo &MFI = MF.getFrameInfo();
21827 // If we have any vulnerable SVE stack objects then the stack protector
21828 // needs to be placed at the top of the SVE stack area, as the SVE locals
21829 // are placed above the other locals, so we allocate it as if it were a
21830 // scalable vector.
21831 // FIXME: It may be worthwhile having a specific interface for this rather
21832 // than doing it here in finalizeLowering.
21833 if (MFI.hasStackProtectorIndex()) {
21834 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
21835 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
21836 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
21837 MFI.setStackID(MFI.getStackProtectorIndex(),
21838 TargetStackID::ScalableVector);
21839 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
21840 break;
21844 MFI.computeMaxCallFrameSize(MF);
21845 TargetLoweringBase::finalizeLowering(MF);
21848 // Unlike X86, we let frame lowering assign offsets to all catch objects.
21849 bool AArch64TargetLowering::needsFixedCatchObjects() const {
21850 return false;
21853 bool AArch64TargetLowering::shouldLocalize(
21854 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
21855 auto &MF = *MI.getMF();
21856 auto &MRI = MF.getRegInfo();
21857 auto maxUses = [](unsigned RematCost) {
21858 // A cost of 1 means remats are basically free.
21859 if (RematCost == 1)
21860 return std::numeric_limits<unsigned>::max();
21861 if (RematCost == 2)
21862 return 2U;
21864 // Remat is too expensive, only sink if there's one user.
21865 if (RematCost > 2)
21866 return 1U;
21867 llvm_unreachable("Unexpected remat cost");
21870 switch (MI.getOpcode()) {
21871 case TargetOpcode::G_GLOBAL_VALUE: {
21872 // On Darwin, TLS global vars get selected into function calls, which
21874 // we don't want localized, as they can get moved into the middle of
21874 // another call sequence.
21875 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
21876 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
21877 return false;
21878 break;
21880 case TargetOpcode::G_CONSTANT: {
21881 auto *CI = MI.getOperand(1).getCImm();
21882 APInt Imm = CI->getValue();
21883 InstructionCost Cost = TTI->getIntImmCost(
21884 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
21885 assert(Cost.isValid() && "Expected a valid imm cost");
21887 unsigned RematCost = *Cost.getValue();
21888 Register Reg = MI.getOperand(0).getReg();
21889 unsigned MaxUses = maxUses(RematCost);
21890 // Don't pass the UINT_MAX sentinel value to hasAtMostUserInstrs().
21891 if (MaxUses == std::numeric_limits<unsigned>::max())
21892 --MaxUses;
21893 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
21895 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
21896 // localizable.
21897 case AArch64::ADRP:
21898 case AArch64::G_ADD_LOW:
21899 return true;
21900 default:
21901 break;
21903 return TargetLoweringBase::shouldLocalize(MI, TTI);
21906 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
21907 if (isa<ScalableVectorType>(Inst.getType()))
21908 return true;
21910 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
21911 if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
21912 return true;
21914 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
21915 if (isa<ScalableVectorType>(AI->getAllocatedType()))
21916 return true;
21919 return false;
21922 // Return the largest legal scalable vector type that matches VT's element type.
21923 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
21924 assert(VT.isFixedLengthVector() &&
21925 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21926 "Expected legal fixed length vector!");
21927 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
21928 default:
21929 llvm_unreachable("unexpected element type for SVE container");
21930 case MVT::i8:
21931 return EVT(MVT::nxv16i8);
21932 case MVT::i16:
21933 return EVT(MVT::nxv8i16);
21934 case MVT::i32:
21935 return EVT(MVT::nxv4i32);
21936 case MVT::i64:
21937 return EVT(MVT::nxv2i64);
21938 case MVT::f16:
21939 return EVT(MVT::nxv8f16);
21940 case MVT::f32:
21941 return EVT(MVT::nxv4f32);
21942 case MVT::f64:
21943 return EVT(MVT::nxv2f64);
21947 // Return a PTRUE with active lanes corresponding to the extent of VT.
21948 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
21949 EVT VT) {
21950 assert(VT.isFixedLengthVector() &&
21951 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21952 "Expected legal fixed length vector!");
21954 Optional<unsigned> PgPattern =
21955 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
21956 assert(PgPattern && "Unexpected element count for SVE predicate");
21958 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
21959 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
21960 // variants of instructions when available.
21961 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
21962 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
21963 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
21964 if (MaxSVESize && MinSVESize == MaxSVESize &&
21965 MaxSVESize == VT.getSizeInBits())
21966 PgPattern = AArch64SVEPredPattern::all;
21968 MVT MaskVT;
21969 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
21970 default:
21971 llvm_unreachable("unexpected element type for SVE predicate");
21972 case MVT::i8:
21973 MaskVT = MVT::nxv16i1;
21974 break;
21975 case MVT::i16:
21976 case MVT::f16:
21977 MaskVT = MVT::nxv8i1;
21978 break;
21979 case MVT::i32:
21980 case MVT::f32:
21981 MaskVT = MVT::nxv4i1;
21982 break;
21983 case MVT::i64:
21984 case MVT::f64:
21985 MaskVT = MVT::nxv2i1;
21986 break;
21989 return getPTrue(DAG, DL, MaskVT, *PgPattern);
21992 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
21993 EVT VT) {
21994 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21995 "Expected legal scalable vector!");
21996 auto PredTy = VT.changeVectorElementType(MVT::i1);
21997 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
22000 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
22001 if (VT.isFixedLengthVector())
22002 return getPredicateForFixedLengthVector(DAG, DL, VT);
22004 return getPredicateForScalableVector(DAG, DL, VT);
22007 // Grow V to consume an entire SVE register.
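// For example, a v4i32 value becomes an nxv4i32 value whose first four lanes
// hold the original data and whose remaining lanes are undefined.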
22008 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22009 assert(VT.isScalableVector() &&
22010 "Expected to convert into a scalable vector!");
22011 assert(V.getValueType().isFixedLengthVector() &&
22012 "Expected a fixed length vector operand!");
22013 SDLoc DL(V);
22014 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22015 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
22018 // Shrink V so it's just big enough to maintain a VT's worth of data.
22019 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
22020 assert(VT.isFixedLengthVector() &&
22021 "Expected to convert into a fixed length vector!");
22022 assert(V.getValueType().isScalableVector() &&
22023 "Expected a scalable vector operand!");
22024 SDLoc DL(V);
22025 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
22029 // Convert all fixed length vector loads larger than NEON to masked_loads.
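// Roughly, for a v8f32 load this produces:
//   pg: nxv4i1  = ptrue vl8
//   ld: nxv4i32 = masked_load chain, base, offset, pg, undef
//   fp: nxv4f32 = bitcast ld
//   rs: v8f32   = extract_subvector fp, 0
// (floating point loads are performed through an integer container).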
22030 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
22031 SDValue Op, SelectionDAG &DAG) const {
22032 auto Load = cast<LoadSDNode>(Op);
22034 SDLoc DL(Op);
22035 EVT VT = Op.getValueType();
22036 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22037 EVT LoadVT = ContainerVT;
22038 EVT MemVT = Load->getMemoryVT();
22040 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
22042 if (VT.isFloatingPoint()) {
22043 LoadVT = ContainerVT.changeTypeToInteger();
22044 MemVT = MemVT.changeTypeToInteger();
22047 SDValue NewLoad = DAG.getMaskedLoad(
22048 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
22049 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
22050 Load->getAddressingMode(), Load->getExtensionType());
22052 SDValue Result = NewLoad;
22053 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
22054 EVT ExtendVT = ContainerVT.changeVectorElementType(
22055 Load->getMemoryVT().getVectorElementType());
22057 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
22058 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
22059 Pg, Result, DAG.getUNDEF(ContainerVT));
22060 } else if (VT.isFloatingPoint()) {
22061 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
22064 Result = convertFromScalableVector(DAG, VT, Result);
22065 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
22066 return DAG.getMergeValues(MergedValues, DL);
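// Turn a fixed length vector boolean mask into an SVE predicate: an all-ones
// mask becomes the governing PTRUE directly; anything else is widened to the
// container type and compared against zero with SETCC_MERGE_ZERO.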
22069 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
22070 SelectionDAG &DAG) {
22071 SDLoc DL(Mask);
22072 EVT InVT = Mask.getValueType();
22073 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22075 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
22077 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
22078 return Pg;
22080 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
22081 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
22083 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
22084 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
22087 // Convert all fixed length vector masked loads larger than NEON to SVE masked loads.
22088 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
22089 SDValue Op, SelectionDAG &DAG) const {
22090 auto Load = cast<MaskedLoadSDNode>(Op);
22092 SDLoc DL(Op);
22093 EVT VT = Op.getValueType();
22094 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22096 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
22098 SDValue PassThru;
22099 bool IsPassThruZeroOrUndef = false;
22101 if (Load->getPassThru()->isUndef()) {
22102 PassThru = DAG.getUNDEF(ContainerVT);
22103 IsPassThruZeroOrUndef = true;
22104 } else {
22105 if (ContainerVT.isInteger())
22106 PassThru = DAG.getConstant(0, DL, ContainerVT);
22107 else
22108 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
22109 if (isZerosVector(Load->getPassThru().getNode()))
22110 IsPassThruZeroOrUndef = true;
22113 SDValue NewLoad = DAG.getMaskedLoad(
22114 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
22115 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
22116 Load->getAddressingMode(), Load->getExtensionType());
22118 SDValue Result = NewLoad;
22119 if (!IsPassThruZeroOrUndef) {
22120 SDValue OldPassThru =
22121 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
22122 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
22125 Result = convertFromScalableVector(DAG, VT, Result);
22126 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
22127 return DAG.getMergeValues(MergedValues, DL);
22130 // Convert all fixed length vector stores larger than NEON to masked_stores.
22131 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
22132 SDValue Op, SelectionDAG &DAG) const {
22133 auto Store = cast<StoreSDNode>(Op);
22135 SDLoc DL(Op);
22136 EVT VT = Store->getValue().getValueType();
22137 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22138 EVT MemVT = Store->getMemoryVT();
22140 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
22141 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
22143 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
22144 EVT TruncVT = ContainerVT.changeVectorElementType(
22145 Store->getMemoryVT().getVectorElementType());
22146 MemVT = MemVT.changeTypeToInteger();
22147 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
22148 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
22149 DAG.getUNDEF(TruncVT));
22150 NewValue =
22151 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
22152 } else if (VT.isFloatingPoint()) {
22153 MemVT = MemVT.changeTypeToInteger();
22154 NewValue =
22155 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
22158 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
22159 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
22160 Store->getMemOperand(), Store->getAddressingMode(),
22161 Store->isTruncatingStore());
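// Convert fixed length vector masked stores into SVE masked stores by
// widening both the stored value and the mask to scalable container types.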
22164 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
22165 SDValue Op, SelectionDAG &DAG) const {
22166 auto *Store = cast<MaskedStoreSDNode>(Op);
22168 SDLoc DL(Op);
22169 EVT VT = Store->getValue().getValueType();
22170 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22172 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
22173 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
22175 return DAG.getMaskedStore(
22176 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
22177 Mask, Store->getMemoryVT(), Store->getMemOperand(),
22178 Store->getAddressingMode(), Store->isTruncatingStore());
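// Lower fixed length vector integer divides. Signed divides by a power-of-two
// splat use AArch64ISD::SRAD_MERGE_OP1 (the ASRD instruction); i32/i64
// element types map directly onto SDIV_PRED/UDIV_PRED; i8/i16 divides are
// widened first because SVE only provides 32-bit and 64-bit integer divides.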
22181 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
22182 SDValue Op, SelectionDAG &DAG) const {
22183 SDLoc dl(Op);
22184 EVT VT = Op.getValueType();
22185 EVT EltVT = VT.getVectorElementType();
22187 bool Signed = Op.getOpcode() == ISD::SDIV;
22188 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
22190 bool Negated;
22191 uint64_t SplatVal;
22192 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
22193 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22194 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22195 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
22197 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
22198 SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
22199 if (Negated)
22200 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
22202 return convertFromScalableVector(DAG, VT, Res);
22205 // Scalable vector i32/i64 DIV is supported.
22206 if (EltVT == MVT::i32 || EltVT == MVT::i64)
22207 return LowerToPredicatedOp(Op, DAG, PredOpcode);
22209 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
22210 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22211 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
22212 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
22213 EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
22215 // If this is not a full vector, extend, div, and truncate it.
22216 EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
22217 if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
22218 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22219 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
22220 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
22221 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
22222 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
22225 // Convert the operands to scalable vectors.
22226 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22227 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
22229 // Extend the scalable operands.
22230 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
22231 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
22232 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
22233 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
22234 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
22235 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
22237 // Convert back to fixed vectors so the DIV can be further lowered.
22238 Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
22239 Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
22240 Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
22241 Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
22242 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
22243 Op0Lo, Op1Lo);
22244 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
22245 Op0Hi, Op1Hi);
22247 // Convert again to scalable vectors to truncate.
22248 ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
22249 ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
22250 SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
22251 ResultLo, ResultHi);
22253 return convertFromScalableVector(DAG, VT, ScalableResult);
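// Lower fixed length vector sign/zero extends by repeatedly unpacking the low
// half of the container with SUNPKLO/UUNPKLO until the desired element width
// is reached.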
22256 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
22257 SDValue Op, SelectionDAG &DAG) const {
22258 EVT VT = Op.getValueType();
22259 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22261 SDLoc DL(Op);
22262 SDValue Val = Op.getOperand(0);
22263 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
22264 Val = convertToScalableVector(DAG, ContainerVT, Val);
22266 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
22267 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
22269 // Repeatedly unpack Val until the result is of the desired element type.
22270 switch (ContainerVT.getSimpleVT().SimpleTy) {
22271 default:
22272 llvm_unreachable("unimplemented container type");
22273 case MVT::nxv16i8:
22274 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
22275 if (VT.getVectorElementType() == MVT::i16)
22276 break;
22277 [[fallthrough]];
22278 case MVT::nxv8i16:
22279 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
22280 if (VT.getVectorElementType() == MVT::i32)
22281 break;
22282 [[fallthrough]];
22283 case MVT::nxv4i32:
22284 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
22285 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
22286 break;
22289 return convertFromScalableVector(DAG, VT, Val);
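// Lower fixed length vector truncates by repeatedly bitcasting to the next
// narrower element type and using UZP1 of the value with itself to keep only
// the low half of each wider element, until the desired element width is
// reached.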
22292 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
22293 SDValue Op, SelectionDAG &DAG) const {
22294 EVT VT = Op.getValueType();
22295 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22297 SDLoc DL(Op);
22298 SDValue Val = Op.getOperand(0);
22299 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
22300 Val = convertToScalableVector(DAG, ContainerVT, Val);
22302 // Repeatedly truncate Val until the result is of the desired element type.
22303 switch (ContainerVT.getSimpleVT().SimpleTy) {
22304 default:
22305 llvm_unreachable("unimplemented container type");
22306 case MVT::nxv2i64:
22307 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
22308 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
22309 if (VT.getVectorElementType() == MVT::i32)
22310 break;
22311 [[fallthrough]];
22312 case MVT::nxv4i32:
22313 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
22314 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
22315 if (VT.getVectorElementType() == MVT::i16)
22316 break;
22317 [[fallthrough]];
22318 case MVT::nxv8i16:
22319 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
22320 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
22321 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
22322 break;
22325 return convertFromScalableVector(DAG, VT, Val);
22328 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
22329 SDValue Op, SelectionDAG &DAG) const {
22330 EVT VT = Op.getValueType();
22331 EVT InVT = Op.getOperand(0).getValueType();
22332 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
22334 SDLoc DL(Op);
22335 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22336 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
22338 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
22341 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
22342 SDValue Op, SelectionDAG &DAG) const {
22343 EVT VT = Op.getValueType();
22344 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22346 SDLoc DL(Op);
22347 EVT InVT = Op.getOperand(0).getValueType();
22348 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22349 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
22351 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
22352 Op.getOperand(1), Op.getOperand(2));
22354 return convertFromScalableVector(DAG, VT, ScalableRes);
22357 // Convert vector operation 'Op' to an equivalent predicated operation whereby
22358 // the original operation's type is used to construct a suitable predicate.
22359 // NOTE: The results for inactive lanes are undefined.
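// For example, lowering a fixed length ISD::FADD on v4f32 with NewOp set to
// AArch64ISD::FADD_PRED roughly produces
//   FADD_PRED(ptrue_vl4, insert_subvector(undef, x, 0),
//             insert_subvector(undef, y, 0))
// on nxv4f32, from which the v4f32 result is extracted again.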
22360 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
22361 SelectionDAG &DAG,
22362 unsigned NewOp) const {
22363 EVT VT = Op.getValueType();
22364 SDLoc DL(Op);
22365 auto Pg = getPredicateForVector(DAG, DL, VT);
22367 if (VT.isFixedLengthVector()) {
22368 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
22369 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22371 // Create list of operands by converting existing ones to scalable types.
22372 SmallVector<SDValue, 4> Operands = {Pg};
22373 for (const SDValue &V : Op->op_values()) {
22374 if (isa<CondCodeSDNode>(V)) {
22375 Operands.push_back(V);
22376 continue;
22379 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
22380 EVT VTArg = VTNode->getVT().getVectorElementType();
22381 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
22382 Operands.push_back(DAG.getValueType(NewVTArg));
22383 continue;
22386 assert(isTypeLegal(V.getValueType()) &&
22387 "Expected only legal fixed-width types");
22388 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
22391 if (isMergePassthruOpcode(NewOp))
22392 Operands.push_back(DAG.getUNDEF(ContainerVT));
22394 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
22395 return convertFromScalableVector(DAG, VT, ScalableRes);
22398 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
22400 SmallVector<SDValue, 4> Operands = {Pg};
22401 for (const SDValue &V : Op->op_values()) {
22402 assert((!V.getValueType().isVector() ||
22403 V.getValueType().isScalableVector()) &&
22404 "Only scalable vectors are supported!");
22405 Operands.push_back(V);
22408 if (isMergePassthruOpcode(NewOp))
22409 Operands.push_back(DAG.getUNDEF(VT));
22411 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
22414 // If a fixed length vector operation has no side effects when applied to
22415 // undefined elements, we can safely use scalable vectors to perform the same
22416 // operation without needing to worry about predication.
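// For example, a fixed length ISD::AND is simply re-issued as ISD::AND on the
// scalable container type; whatever the inactive lanes produce is discarded
// when the result is shrunk back to the fixed length type.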
22417 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
22418 SelectionDAG &DAG) const {
22419 EVT VT = Op.getValueType();
22420 assert(useSVEForFixedLengthVectorVT(VT) &&
22421 "Only expected to lower fixed length vector operation!");
22422 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22424 // Create list of operands by converting existing ones to scalable types.
22425 SmallVector<SDValue, 4> Ops;
22426 for (const SDValue &V : Op->op_values()) {
22427 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
22429 // Pass through non-vector operands.
22430 if (!V.getValueType().isVector()) {
22431 Ops.push_back(V);
22432 continue;
22435 // "cast" fixed length vector to a scalable vector.
22436 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
22437 "Only fixed length vectors are supported!");
22438 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
22441 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
22442 return convertFromScalableVector(DAG, VT, ScalableRes);
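// Lower a strictly-ordered floating point reduction (VECREDUCE_SEQ_FADD) via
// the SVE FADDA instruction: the incoming accumulator is placed in lane 0 and
// FADDA_PRED folds the vector operand into it in lane order.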
22445 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
22446 SelectionDAG &DAG) const {
22447 SDLoc DL(ScalarOp);
22448 SDValue AccOp = ScalarOp.getOperand(0);
22449 SDValue VecOp = ScalarOp.getOperand(1);
22450 EVT SrcVT = VecOp.getValueType();
22451 EVT ResVT = SrcVT.getVectorElementType();
22453 EVT ContainerVT = SrcVT;
22454 if (SrcVT.isFixedLengthVector()) {
22455 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
22456 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
22459 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
22460 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
22462 // Place the scalar accumulator into lane 0 of a scalable vector.
22463 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
22464 DAG.getUNDEF(ContainerVT), AccOp, Zero);
22466 // Perform reduction.
22467 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
22468 Pg, AccOp, VecOp);
22470 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
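// Lower reductions of i1 vectors (predicates): OR and AND reductions become
// PTEST-based checks, while XOR reductions count the active lanes with CNTP
// and take the parity of the result.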
22473 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
22474 SelectionDAG &DAG) const {
22475 SDLoc DL(ReduceOp);
22476 SDValue Op = ReduceOp.getOperand(0);
22477 EVT OpVT = Op.getValueType();
22478 EVT VT = ReduceOp.getValueType();
22480 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
22481 return SDValue();
22483 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
22485 switch (ReduceOp.getOpcode()) {
22486 default:
22487 return SDValue();
22488 case ISD::VECREDUCE_OR:
22489 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
22490 // The predicate can be 'Op' because
22491 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
22492 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
22493 else
22494 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
22495 case ISD::VECREDUCE_AND: {
22496 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
22497 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
22499 case ISD::VECREDUCE_XOR: {
22500 SDValue ID =
22501 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
22502 if (OpVT == MVT::nxv1i1) {
22503 // Emulate a CNTP on .Q using .D and a different governing predicate.
22504 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
22505 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
22507 SDValue Cntp =
22508 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
22509 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
22513 return SDValue();
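// Lower integer and floating point vector reductions to predicated SVE
// reduction nodes, extracting lane 0 of the (possibly widened) result and
// truncating it back to the scalar type the VECREDUCE node expects.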
22516 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
22517 SDValue ScalarOp,
22518 SelectionDAG &DAG) const {
22519 SDLoc DL(ScalarOp);
22520 SDValue VecOp = ScalarOp.getOperand(0);
22521 EVT SrcVT = VecOp.getValueType();
22523 if (useSVEForFixedLengthVectorVT(
22524 SrcVT,
22525 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
22526 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
22527 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
22530 // UADDV always returns an i64 result.
22531 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
22532 SrcVT.getVectorElementType();
22533 EVT RdxVT = SrcVT;
22534 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
22535 RdxVT = getPackedSVEVectorVT(ResVT);
22537 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
22538 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
22539 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
22540 Rdx, DAG.getConstant(0, DL, MVT::i64));
22542 // The VEC_REDUCE nodes expect an element-sized result.
22543 if (ResVT != ScalarOp.getValueType())
22544 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
22546 return Res;
22549 SDValue
22550 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
22551 SelectionDAG &DAG) const {
22552 EVT VT = Op.getValueType();
22553 SDLoc DL(Op);
22555 EVT InVT = Op.getOperand(1).getValueType();
22556 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22557 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
22558 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
22560 // Convert the mask to a predicate (NOTE: We don't need to worry about
22561 // inactive lanes since VSELECT is safe when given undefined elements).
22562 EVT MaskVT = Op.getOperand(0).getValueType();
22563 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
22564 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
22565 Mask = DAG.getNode(ISD::TRUNCATE, DL,
22566 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
22568 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
22569 Mask, Op1, Op2);
22571 return convertFromScalableVector(DAG, VT, ScalableRes);
22574 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
22575 SDValue Op, SelectionDAG &DAG) const {
22576 SDLoc DL(Op);
22577 EVT InVT = Op.getOperand(0).getValueType();
22578 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
22580 assert(useSVEForFixedLengthVectorVT(InVT) &&
22581 "Only expected to lower fixed length vector operation!");
22582 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
22583 "Expected integer result of the same bit length as the inputs!");
22585 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
22586 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
22587 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
22589 EVT CmpVT = Pg.getValueType();
22590 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
22591 {Pg, Op1, Op2, Op.getOperand(2)});
22593 EVT PromoteVT = ContainerVT.changeTypeToInteger();
22594 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
22595 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
22598 SDValue
22599 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
22600 SelectionDAG &DAG) const {
22601 SDLoc DL(Op);
22602 auto SrcOp = Op.getOperand(0);
22603 EVT VT = Op.getValueType();
22604 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22605 EVT ContainerSrcVT =
22606 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
22608 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
22609 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
22610 return convertFromScalableVector(DAG, VT, Op);
22613 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
22614 SDValue Op, SelectionDAG &DAG) const {
22615 SDLoc DL(Op);
22616 unsigned NumOperands = Op->getNumOperands();
22618 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
22619 "Unexpected number of operands in CONCAT_VECTORS");
22621 auto SrcOp1 = Op.getOperand(0);
22622 auto SrcOp2 = Op.getOperand(1);
22623 EVT VT = Op.getValueType();
22624 EVT SrcVT = SrcOp1.getValueType();
22626 if (NumOperands > 2) {
22627 SmallVector<SDValue, 4> Ops;
22628 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22629 for (unsigned I = 0; I < NumOperands; I += 2)
22630 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
22631 Op->getOperand(I), Op->getOperand(I + 1)));
22633 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
22636 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22638 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
22639 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
22640 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
22642 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
22644 return convertFromScalableVector(DAG, VT, Op);
22647 SDValue
22648 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
22649 SelectionDAG &DAG) const {
22650 EVT VT = Op.getValueType();
22651 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22653 SDLoc DL(Op);
22654 SDValue Val = Op.getOperand(0);
22655 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22656 EVT SrcVT = Val.getValueType();
22657 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22658 EVT ExtendVT = ContainerVT.changeVectorElementType(
22659 SrcVT.getVectorElementType());
22661 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
22662 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
22664 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
22665 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
22666 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
22667 Pg, Val, DAG.getUNDEF(ContainerVT));
22669 return convertFromScalableVector(DAG, VT, Val);
22672 SDValue
22673 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
22674 SelectionDAG &DAG) const {
22675 EVT VT = Op.getValueType();
22676 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22678 SDLoc DL(Op);
22679 SDValue Val = Op.getOperand(0);
22680 EVT SrcVT = Val.getValueType();
22681 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22682 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
22683 VT.getVectorElementType());
22684 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
22686 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22687 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
22688 Op.getOperand(1), DAG.getUNDEF(RoundVT));
22689 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
22690 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
22692 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
22693 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
22696 SDValue
22697 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
22698 SelectionDAG &DAG) const {
22699 EVT VT = Op.getValueType();
22700 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22702 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
22703 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
22704 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
22706 SDLoc DL(Op);
22707 SDValue Val = Op.getOperand(0);
22708 EVT SrcVT = Val.getValueType();
22709 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22710 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22712 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
22713 ContainerDstVT.getVectorElementType().getSizeInBits()) {
22714 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22716 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
22717 VT.changeTypeToInteger(), Val);
22719 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22720 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
22721 // Safe to use a larger than specified operand since we just unpacked the
22722 // data, hence the upper bits are zero.
22723 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
22724 DAG.getUNDEF(ContainerDstVT));
22725 return convertFromScalableVector(DAG, VT, Val);
22726 } else {
22727 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
22728 ContainerDstVT.getVectorElementType());
22729 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
22731 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22732 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
22733 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
22734 Val = convertFromScalableVector(DAG, SrcVT, Val);
22736 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
22737 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
22741 SDValue
22742 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
22743 SelectionDAG &DAG) const {
22744 EVT VT = Op.getValueType();
22745 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22747 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
22748 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
22749 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
22751 SDLoc DL(Op);
22752 SDValue Val = Op.getOperand(0);
22753 EVT SrcVT = Val.getValueType();
22754 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
22755 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
22757 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
22758 ContainerDstVT.getVectorElementType().getSizeInBits()) {
22759 EVT CvtVT = ContainerDstVT.changeVectorElementType(
22760 ContainerSrcVT.getVectorElementType());
22761 SDValue Pg = getPredicateForVector(DAG, DL, VT);
22763 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
22764 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
22766 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22767 Val = getSVESafeBitCast(CvtVT, Val, DAG);
22768 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
22769 DAG.getUNDEF(ContainerDstVT));
22770 return convertFromScalableVector(DAG, VT, Val);
22771 } else {
22772 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
22773 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
22775 // Safe to use a larger than specified result since an fp_to_int where the
22776 // result doesn't fit into the destination is undefined.
22777 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
22778 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
22779 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
22781 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
22785 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
22786 SDValue Op, SelectionDAG &DAG) const {
22787 EVT VT = Op.getValueType();
22788 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
22790 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
22791 auto ShuffleMask = SVN->getMask();
22793 SDLoc DL(Op);
22794 SDValue Op1 = Op.getOperand(0);
22795 SDValue Op2 = Op.getOperand(1);
22797 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
22798 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
22799 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
22801 bool ReverseEXT = false;
22802 unsigned Imm;
22803 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
22804 Imm == VT.getVectorNumElements() - 1) {
22805 if (ReverseEXT)
22806 std::swap(Op1, Op2);
22808 EVT ScalarTy = VT.getVectorElementType();
22809 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
22810 ScalarTy = MVT::i32;
22811 SDValue Scalar = DAG.getNode(
22812 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
22813 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
22814 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
22815 return convertFromScalableVector(DAG, VT, Op);
22818 for (unsigned LaneSize : {64U, 32U, 16U}) {
22819 if (isREVMask(ShuffleMask, VT, LaneSize)) {
22820 EVT NewVT =
22821 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
22822 unsigned RevOp;
22823 unsigned EltSz = VT.getScalarSizeInBits();
22824 if (EltSz == 8)
22825 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
22826 else if (EltSz == 16)
22827 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
22828 else
22829 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
22831 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
22832 Op = LowerToPredicatedOp(Op, DAG, RevOp);
22833 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
22834 return convertFromScalableVector(DAG, VT, Op);
22838 unsigned WhichResult;
22839 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
22840 return convertFromScalableVector(
22841 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
22843 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
22844 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
22845 return convertFromScalableVector(
22846 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
22849 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
22850 return convertFromScalableVector(
22851 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
22853 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
22854 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
22855 return convertFromScalableVector(
22856 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
22859 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
22860 // represents the same logical operation as performed by a ZIP instruction. In
22861 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
22862 // equivalent to an AArch64 instruction. There's the extra component of
22863 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
22864 // only operated on 64/128bit vector types that have a direct mapping to a
22865 // target register and so an exact mapping is implied.
22866 // However, when using SVE for fixed length vectors, most legal vector types
22867 // are actually sub-vectors of a larger SVE register. When mapping
22868 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
22869 // how the mask's indices translate. Specifically, when the mapping requires
22870 // an exact meaning for a specific vector index (e.g. Index X is the last
22871 // vector element in the register) then such mappings are often only safe when
22872 // the exact SVE register size is known. The main exception to this is when
22873 // indices are logically relative to the first element of either
22874 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
22875 // when converting from fixed-length to scalable vector types (i.e. the start
22876 // of a fixed length vector is always the start of a scalable vector).
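// For example, the ZIP1/TRN1/TRN2 cases above only rely on positions relative
// to the start of each operand and so remain correct after widening, whereas
// ZIP2 and UZP need to know where the fixed length vector ends, which is why
// they are only handled under the exact-register-size check below.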
22877 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22878 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
22879 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
22880 if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
22881 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
22882 return convertFromScalableVector(DAG, VT, Op);
22885 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
22886 return convertFromScalableVector(
22887 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
22889 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
22890 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
22891 return convertFromScalableVector(
22892 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
22895 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
22896 return convertFromScalableVector(
22897 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
22899 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
22900 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
22901 return convertFromScalableVector(
22902 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
22906 return SDValue();
22909 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
22910 SelectionDAG &DAG) const {
22911 SDLoc DL(Op);
22912 EVT InVT = Op.getValueType();
22914 assert(VT.isScalableVector() && isTypeLegal(VT) &&
22915 InVT.isScalableVector() && isTypeLegal(InVT) &&
22916 "Only expect to cast between legal scalable vector types!");
22917 assert(VT.getVectorElementType() != MVT::i1 &&
22918 InVT.getVectorElementType() != MVT::i1 &&
22919 "For predicate bitcasts, use getSVEPredicateBitCast");
22921 if (InVT == VT)
22922 return Op;
22924 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
22925 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
22927 // Safe bitcasting between unpacked vector types of different element counts
22928 // is currently unsupported because the lowering below does not do the work
22929 // needed to ensure the result's elements end up where they are supposed to
22930 // within an SVE register.
22931 //                01234567
22932 // e.g. nxv2i32 = XX??XX??
22933 //      nxv4f16 = X?X?X?X?
22934 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
22935 VT == PackedVT || InVT == PackedInVT) &&
22936 "Unexpected bitcast!");
22938 // Pack input if required.
22939 if (InVT != PackedInVT)
22940 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
22942 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
22944 // Unpack result if required.
22945 if (VT != PackedVT)
22946 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
22948 return Op;
22951 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
22952 SDValue N) const {
22953 return ::isAllActivePredicate(DAG, N);
22956 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
22957 return ::getPromotedVTForPredicate(VT);
22960 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
22961 SDValue Op, const APInt &OriginalDemandedBits,
22962 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
22963 unsigned Depth) const {
22965 unsigned Opc = Op.getOpcode();
22966 switch (Opc) {
22967 case AArch64ISD::VSHL: {
22968 // Match (VSHL (VLSHR Val X) X)
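// e.g. if only the top 24 bits of each 32-bit element are demanded, then
// (VSHL (VLSHR Val 8) 8) only clears bits the caller never reads, so it can
// be simplified to just Val.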
22969 SDValue ShiftL = Op;
22970 SDValue ShiftR = Op->getOperand(0);
22971 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
22972 return false;
22974 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
22975 return false;
22977 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
22978 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
22980 // Other cases can be handled as well, but this is not
22981 // implemented.
22982 if (ShiftRBits != ShiftLBits)
22983 return false;
22985 unsigned ScalarSize = Op.getScalarValueSizeInBits();
22986 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
22988 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
22989 APInt UnusedBits = ~OriginalDemandedBits;
22991 if ((ZeroBits & UnusedBits) != ZeroBits)
22992 return false;
22994 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
22995 // used - simplify to just Val.
22996 return TLO.CombineTo(Op, ShiftR->getOperand(0));
23000 return TargetLowering::SimplifyDemandedBitsForTargetNode(
23001 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
23004 bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
23005 return Op.getOpcode() == AArch64ISD::DUP ||
23006 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23007 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
23008 TargetLowering::isTargetCanonicalConstantNode(Op);
23011 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
23012 unsigned Opc, LLT Ty1, LLT Ty2) const {
23013 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));